reassembly: fix buffer usage counter
[vpp.git] / src / vnet / ip / ip6_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv6 Reassembly.
19  *
20  * This file contains the source code for IPv6 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_48_8.h>
27 #include <vnet/ip/ip6_reassembly.h>
28
/* Number of milliseconds in one second. */
#define MSEC_PER_SEC 1000
/* Default reassembly timeout, in milliseconds. */
#define IP6_REASS_TIMEOUT_DEFAULT_MS 100
/* Default expire-walk interval, in milliseconds (10 seconds). */
#define IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000
/* Default maximum number of reassemblies. */
#define IP6_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* Load factor for the reassembly hash table. */
#define IP6_REASS_HT_LOAD_FACTOR (0.75)
34
/** Return code of the internal reassembly steps. */
typedef enum
{
  /* the step completed without detecting an inconsistency */
  IP6_REASS_RC_OK = 0,
  /* a sanity check on the reassembly state failed */
  IP6_REASS_RC_INTERNAL_ERROR = 1,
} ip6_reass_rc_t;
40
41 typedef struct
42 {
43   union
44   {
45     struct
46     {
47       ip6_address_t src;
48       ip6_address_t dst;
49       u32 xx_id;
50       u32 frag_id;
51       u8 unused[7];
52       u8 proto;
53     };
54     u64 as_u64[6];
55   };
56 } ip6_reass_key_t;
57
58 always_inline u32
59 ip6_reass_buffer_get_data_offset (vlib_buffer_t * b)
60 {
61   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
62   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
63 }
64
65 always_inline u16
66 ip6_reass_buffer_get_data_len (vlib_buffer_t * b)
67 {
68   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
69   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
70     (vnb->ip.reass.fragment_first + ip6_reass_buffer_get_data_offset (b)) + 1;
71 }
72
73 typedef struct
74 {
75   // hash table key
76   ip6_reass_key_t key;
77   // time when last packet was received
78   f64 last_heard;
79   // internal id of this reassembly
80   u64 id;
81   // buffer index of first buffer in this reassembly context
82   u32 first_bi;
83   // last octet of packet, ~0 until fragment without more_fragments arrives
84   u32 last_packet_octet;
85   // length of data collected so far
86   u32 data_len;
87   // trace operation counter
88   u32 trace_op_counter;
89   // next index - used by non-feature node
90   u8 next_index;
91   // minimum fragment length for this reassembly - used to estimate MTU
92   u16 min_fragment_length;
93 } ip6_reass_t;
94
95 typedef struct
96 {
97   ip6_reass_t *pool;
98   u32 reass_n;
99   u32 buffers_n;
100   u32 id_counter;
101   clib_spinlock_t lock;
102 } ip6_reass_per_thread_t;
103
104 typedef struct
105 {
106   // IPv6 config
107   u32 timeout_ms;
108   f64 timeout;
109   u32 expire_walk_interval_ms;
110   u32 max_reass_n;
111
112   // IPv6 runtime
113   clib_bihash_48_8_t hash;
114
115   // per-thread data
116   ip6_reass_per_thread_t *per_thread_data;
117
118   // convenience
119   vlib_main_t *vlib_main;
120   vnet_main_t *vnet_main;
121
122   // node index of ip6-drop node
123   u32 ip6_drop_idx;
124   u32 ip6_icmp_error_idx;
125   u32 ip6_reass_expire_node_idx;
126
127 } ip6_reass_main_t;
128
129 ip6_reass_main_t ip6_reass_main;
130
/** Next-node slots used by the reassembly node. */
typedef enum
{
  IP6_REASSEMBLY_NEXT_INPUT = 0,
  IP6_REASSEMBLY_NEXT_DROP = 1,
  IP6_REASSEMBLY_NEXT_ICMP_ERROR = 2,
  /* number of next slots - must stay last */
  IP6_REASSEMBLY_N_NEXT = 3,
} ip6_reass_next_t;
138
/** Operation recorded in a reassembly trace entry. */
typedef enum
{
  RANGE_NEW = 0,                /* a new range was inserted */
  RANGE_OVERLAP = 1,            /* fragment overlapped an existing range */
  ICMP_ERROR_RT_EXCEEDED = 2,   /* reassembly time exceeded */
  ICMP_ERROR_FL_TOO_BIG = 3,    /* frag_len > 65535 */
  ICMP_ERROR_FL_NOT_MULT_8 = 4, /* frag_len mod 8 != 0 */
  FINALIZE = 5,                 /* reassembly finalized */
} ip6_reass_trace_operation_e;
148
149 typedef struct
150 {
151   u16 range_first;
152   u16 range_last;
153   u32 range_bi;
154   i32 data_offset;
155   u32 data_len;
156   u32 first_bi;
157 } ip6_reass_range_trace_t;
158
159 typedef struct
160 {
161   ip6_reass_trace_operation_e action;
162   u32 reass_id;
163   ip6_reass_range_trace_t trace_range;
164   u32 size_diff;
165   u32 op_id;
166   u32 fragment_first;
167   u32 fragment_last;
168   u32 total_data_len;
169 } ip6_reass_trace_t;
170
171 static void
172 ip6_reass_trace_details (vlib_main_t * vm, u32 bi,
173                          ip6_reass_range_trace_t * trace)
174 {
175   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
176   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
177   trace->range_first = vnb->ip.reass.range_first;
178   trace->range_last = vnb->ip.reass.range_last;
179   trace->data_offset = ip6_reass_buffer_get_data_offset (b);
180   trace->data_len = ip6_reass_buffer_get_data_len (b);
181   trace->range_bi = bi;
182 }
183
184 static u8 *
185 format_ip6_reass_range_trace (u8 * s, va_list * args)
186 {
187   ip6_reass_range_trace_t *trace = va_arg (*args, ip6_reass_range_trace_t *);
188   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
189               trace->range_last, trace->data_offset, trace->data_len,
190               trace->range_bi);
191   return s;
192 }
193
194 static u8 *
195 format_ip6_reass_trace (u8 * s, va_list * args)
196 {
197   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
198   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
199   ip6_reass_trace_t *t = va_arg (*args, ip6_reass_trace_t *);
200   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
201   u32 indent = format_get_indent (s);
202   s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
203               t->trace_range.first_bi, t->total_data_len, t->fragment_first,
204               t->fragment_last);
205   switch (t->action)
206     {
207     case RANGE_NEW:
208       s = format (s, "\n%Unew %U", format_white_space, indent,
209                   format_ip6_reass_range_trace, &t->trace_range);
210       break;
211     case RANGE_OVERLAP:
212       s = format (s, "\n%Uoverlap %U", format_white_space, indent,
213                   format_ip6_reass_range_trace, &t->trace_range);
214       break;
215     case ICMP_ERROR_FL_TOO_BIG:
216       s = format (s, "\n%Uicmp-error - frag_len > 65535 %U",
217                   format_white_space, indent, format_ip6_reass_range_trace,
218                   &t->trace_range);
219       break;
220     case ICMP_ERROR_FL_NOT_MULT_8:
221       s = format (s, "\n%Uicmp-error - frag_len mod 8 != 0 %U",
222                   format_white_space, indent, format_ip6_reass_range_trace,
223                   &t->trace_range);
224       break;
225     case ICMP_ERROR_RT_EXCEEDED:
226       s = format (s, "\n%Uicmp-error - reassembly time exceeded",
227                   format_white_space, indent);
228       break;
229     case FINALIZE:
230       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
231       break;
232     }
233   return s;
234 }
235
236 static void
237 ip6_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
238                      ip6_reass_main_t * rm, ip6_reass_t * reass,
239                      u32 bi, ip6_reass_trace_operation_e action,
240                      u32 size_diff)
241 {
242   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
243   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
244   if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
245     {
246       // this buffer's trace is gone
247       b->flags &= ~VLIB_BUFFER_IS_TRACED;
248       return;
249     }
250   ip6_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
251   t->reass_id = reass->id;
252   t->action = action;
253   ip6_reass_trace_details (vm, bi, &t->trace_range);
254   t->size_diff = size_diff;
255   t->op_id = reass->trace_op_counter;
256   ++reass->trace_op_counter;
257   t->fragment_first = vnb->ip.reass.fragment_first;
258   t->fragment_last = vnb->ip.reass.fragment_last;
259   t->trace_range.first_bi = reass->first_bi;
260   t->total_data_len = reass->data_len;
261 #if 0
262   static u8 *s = NULL;
263   s = format (s, "%U", format_ip6_reass_trace, NULL, NULL, t);
264   printf ("%.*s\n", vec_len (s), s);
265   fflush (stdout);
266   vec_reset_length (s);
267 #endif
268 }
269
270 always_inline void
271 ip6_reass_free (ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
272                 ip6_reass_t * reass)
273 {
274   clib_bihash_kv_48_8_t kv;
275   kv.key[0] = reass->key.as_u64[0];
276   kv.key[1] = reass->key.as_u64[1];
277   kv.key[2] = reass->key.as_u64[2];
278   kv.key[3] = reass->key.as_u64[3];
279   kv.key[4] = reass->key.as_u64[4];
280   kv.key[5] = reass->key.as_u64[5];
281   clib_bihash_add_del_48_8 (&rm->hash, &kv, 0);
282   pool_put (rt->pool, reass);
283   --rt->reass_n;
284 }
285
286 always_inline void
287 ip6_reass_drop_all (vlib_main_t * vm, ip6_reass_main_t * rm,
288                     ip6_reass_t * reass, u32 ** vec_drop_bi)
289 {
290   u32 range_bi = reass->first_bi;
291   vlib_buffer_t *range_b;
292   vnet_buffer_opaque_t *range_vnb;
293   while (~0 != range_bi)
294     {
295       range_b = vlib_get_buffer (vm, range_bi);
296       range_vnb = vnet_buffer (range_b);
297       u32 bi = range_bi;
298       while (~0 != bi)
299         {
300           vec_add1 (*vec_drop_bi, bi);
301           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
302           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
303             {
304               bi = b->next_buffer;
305               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
306             }
307           else
308             {
309               bi = ~0;
310             }
311         }
312       range_bi = range_vnb->ip.reass.next_range_bi;
313     }
314 }
315
316 always_inline void
317 ip6_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node,
318                       ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
319                       ip6_reass_t * reass, u32 * icmp_bi, u32 ** vec_timeout)
320 {
321   if (~0 == reass->first_bi)
322     {
323       return;
324     }
325   vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi);
326   if (0 == vnet_buffer (b)->ip.reass.fragment_first)
327     {
328       *icmp_bi = reass->first_bi;
329       if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
330         {
331           ip6_reass_add_trace (vm, node, rm, reass, reass->first_bi,
332                                ICMP_ERROR_RT_EXCEEDED, 0);
333         }
334       // fragment with offset zero received - send icmp message back
335       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
336         {
337           // separate first buffer from chain and steer it towards icmp node
338           b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
339           reass->first_bi = b->next_buffer;
340         }
341       else
342         {
343           reass->first_bi = vnet_buffer (b)->ip.reass.next_range_bi;
344         }
345       --rt->buffers_n;
346       icmp6_error_set_vnet_buffer (b, ICMP6_time_exceeded,
347                                    ICMP6_time_exceeded_fragment_reassembly_time_exceeded,
348                                    0);
349     }
350   ip6_reass_drop_all (vm, rm, reass, vec_timeout);
351 }
352
353 always_inline ip6_reass_t *
354 ip6_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
355                           ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
356                           ip6_reass_key_t * k, u32 * icmp_bi,
357                           u32 ** vec_timeout)
358 {
359   ip6_reass_t *reass = NULL;
360   f64 now = vlib_time_now (rm->vlib_main);
361   clib_bihash_kv_48_8_t kv, value;
362   kv.key[0] = k->as_u64[0];
363   kv.key[1] = k->as_u64[1];
364   kv.key[2] = k->as_u64[2];
365   kv.key[3] = k->as_u64[3];
366   kv.key[4] = k->as_u64[4];
367   kv.key[5] = k->as_u64[5];
368
369   if (!clib_bihash_search_48_8 (&rm->hash, &kv, &value))
370     {
371       reass = pool_elt_at_index (rt->pool, value.value);
372       if (now > reass->last_heard + rm->timeout)
373         {
374           ip6_reass_on_timeout (vm, node, rm, rt, reass, icmp_bi,
375                                 vec_timeout);
376           ip6_reass_free (rm, rt, reass);
377           reass = NULL;
378         }
379     }
380
381   if (reass)
382     {
383       reass->last_heard = now;
384       return reass;
385     }
386
387   if (rt->reass_n >= rm->max_reass_n)
388     {
389       reass = NULL;
390       return reass;
391     }
392   else
393     {
394       pool_get (rt->pool, reass);
395       clib_memset (reass, 0, sizeof (*reass));
396       reass->id =
397         ((u64) os_get_thread_index () * 1000000000) + rt->id_counter;
398       ++rt->id_counter;
399       reass->first_bi = ~0;
400       reass->last_packet_octet = ~0;
401       reass->data_len = 0;
402       ++rt->reass_n;
403     }
404
405   reass->key.as_u64[0] = kv.key[0] = k->as_u64[0];
406   reass->key.as_u64[1] = kv.key[1] = k->as_u64[1];
407   reass->key.as_u64[2] = kv.key[2] = k->as_u64[2];
408   reass->key.as_u64[3] = kv.key[3] = k->as_u64[3];
409   reass->key.as_u64[4] = kv.key[4] = k->as_u64[4];
410   reass->key.as_u64[5] = kv.key[5] = k->as_u64[5];
411   kv.value = reass - rt->pool;
412   reass->last_heard = now;
413
414   if (clib_bihash_add_del_48_8 (&rm->hash, &kv, 1))
415     {
416       ip6_reass_free (rm, rt, reass);
417       reass = NULL;
418     }
419
420   return reass;
421 }
422
423 always_inline ip6_reass_rc_t
424 ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
425                     ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
426                     ip6_reass_t * reass, u32 * bi0, u32 * next0,
427                     u32 * error0, u32 ** vec_drop_compress, bool is_feature)
428 {
429   *bi0 = reass->first_bi;
430   *error0 = IP6_ERROR_NONE;
431   ip6_frag_hdr_t *frag_hdr;
432   vlib_buffer_t *last_b = NULL;
433   u32 sub_chain_bi = reass->first_bi;
434   u32 total_length = 0;
435   u32 buf_cnt = 0;
436   u32 dropped_cnt = 0;
437   do
438     {
439       u32 tmp_bi = sub_chain_bi;
440       vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
441       vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
442       if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
443           !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
444         {
445           return IP6_REASS_RC_INTERNAL_ERROR;
446         }
447
448       u32 data_len = ip6_reass_buffer_get_data_len (tmp);
449       u32 trim_front = vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
450         sizeof (*frag_hdr) + ip6_reass_buffer_get_data_offset (tmp);
451       u32 trim_end =
452         vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
453       if (tmp_bi == reass->first_bi)
454         {
455           /* first buffer - keep ip6 header */
456           if (0 != ip6_reass_buffer_get_data_offset (tmp))
457             {
458               return IP6_REASS_RC_INTERNAL_ERROR;
459             }
460           trim_front = 0;
461           trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
462             (vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
463              sizeof (*frag_hdr));
464           if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
465             {
466               return IP6_REASS_RC_INTERNAL_ERROR;
467             }
468         }
469       u32 keep_data =
470         vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
471       while (1)
472         {
473           ++buf_cnt;
474           if (trim_front)
475             {
476               if (trim_front > tmp->current_length)
477                 {
478                   /* drop whole buffer */
479                   vec_add1 (*vec_drop_compress, tmp_bi);
480                   ++dropped_cnt;
481                   trim_front -= tmp->current_length;
482                   if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
483                     {
484                       return IP6_REASS_RC_INTERNAL_ERROR;
485                     }
486                   tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
487                   tmp_bi = tmp->next_buffer;
488                   tmp = vlib_get_buffer (vm, tmp_bi);
489                   continue;
490                 }
491               else
492                 {
493                   vlib_buffer_advance (tmp, trim_front);
494                   trim_front = 0;
495                 }
496             }
497           if (keep_data)
498             {
499               if (last_b)
500                 {
501                   last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
502                   last_b->next_buffer = tmp_bi;
503                 }
504               last_b = tmp;
505               if (keep_data <= tmp->current_length)
506                 {
507                   tmp->current_length = keep_data;
508                   keep_data = 0;
509                 }
510               else
511                 {
512                   keep_data -= tmp->current_length;
513                   if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
514                     {
515                       return IP6_REASS_RC_INTERNAL_ERROR;
516                     }
517                 }
518               total_length += tmp->current_length;
519             }
520           else
521             {
522               vec_add1 (*vec_drop_compress, tmp_bi);
523               if (reass->first_bi == tmp_bi)
524                 {
525                   return IP6_REASS_RC_INTERNAL_ERROR;
526                 }
527               ++dropped_cnt;
528             }
529           if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
530             {
531               tmp_bi = tmp->next_buffer;
532               tmp = vlib_get_buffer (vm, tmp->next_buffer);
533             }
534           else
535             {
536               break;
537             }
538         }
539       sub_chain_bi =
540         vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
541         reass.next_range_bi;
542     }
543   while (~0 != sub_chain_bi);
544
545   if (!last_b)
546     {
547       return IP6_REASS_RC_INTERNAL_ERROR;
548     }
549   last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
550   vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
551   if (total_length < first_b->current_length)
552     {
553       return IP6_REASS_RC_INTERNAL_ERROR;
554     }
555   total_length -= first_b->current_length;
556   first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
557   first_b->total_length_not_including_first_buffer = total_length;
558   // drop fragment header
559   vnet_buffer_opaque_t *first_b_vnb = vnet_buffer (first_b);
560   ip6_header_t *ip = vlib_buffer_get_current (first_b);
561   u16 ip6_frag_hdr_offset = first_b_vnb->ip.reass.ip6_frag_hdr_offset;
562   ip6_ext_header_t *prev_hdr;
563   ip6_ext_header_find_t (ip, prev_hdr, frag_hdr,
564                          IP_PROTOCOL_IPV6_FRAGMENTATION);
565   if (prev_hdr)
566     {
567       prev_hdr->next_hdr = frag_hdr->next_hdr;
568     }
569   else
570     {
571       ip->protocol = frag_hdr->next_hdr;
572     }
573   if (!((u8 *) frag_hdr - (u8 *) ip == ip6_frag_hdr_offset))
574     {
575       return IP6_REASS_RC_INTERNAL_ERROR;
576     }
577   memmove (frag_hdr, (u8 *) frag_hdr + sizeof (*frag_hdr),
578            first_b->current_length - ip6_frag_hdr_offset -
579            sizeof (ip6_frag_hdr_t));
580   first_b->current_length -= sizeof (*frag_hdr);
581   ip->payload_length =
582     clib_host_to_net_u16 (total_length + first_b->current_length -
583                           sizeof (*ip));
584   vlib_buffer_chain_compress (vm, first_b, vec_drop_compress);
585   rt->buffers_n -= buf_cnt - vec_len (*vec_drop_compress);
586   if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
587     {
588       ip6_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
589 #if 0
590       // following code does a hexdump of packet fragments to stdout ...
591       do
592         {
593           u32 bi = reass->first_bi;
594           u8 *s = NULL;
595           while (~0 != bi)
596             {
597               vlib_buffer_t *b = vlib_get_buffer (vm, bi);
598               s = format (s, "%u: %U\n", bi, format_hexdump,
599                           vlib_buffer_get_current (b), b->current_length);
600               if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
601                 {
602                   bi = b->next_buffer;
603                 }
604               else
605                 {
606                   break;
607                 }
608             }
609           printf ("%.*s\n", vec_len (s), s);
610           fflush (stdout);
611           vec_free (s);
612         }
613       while (0);
614 #endif
615     }
616   if (is_feature)
617     {
618       *next0 = IP6_REASSEMBLY_NEXT_INPUT;
619     }
620   else
621     {
622       *next0 = reass->next_index;
623     }
624   vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
625   ip6_reass_free (rm, rt, reass);
626   reass = NULL;
627   return IP6_REASS_RC_OK;
628 }
629
630 always_inline u32
631 ip6_reass_get_buffer_chain_length (vlib_main_t * vm, vlib_buffer_t * b)
632 {
633   u32 len = 0;
634   while (b)
635     {
636       ++len;
637       if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
638         {
639           b = vlib_get_buffer (vm, b->next_buffer);
640         }
641       else
642         {
643           break;
644         }
645     }
646   return len;
647 }
648
649 always_inline void
650 ip6_reass_insert_range_in_chain (vlib_main_t * vm, ip6_reass_main_t * rm,
651                                  ip6_reass_per_thread_t * rt,
652                                  ip6_reass_t * reass, u32 prev_range_bi,
653                                  u32 new_next_bi)
654 {
655
656   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
657   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
658   if (~0 != prev_range_bi)
659     {
660       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
661       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
662       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
663       prev_vnb->ip.reass.next_range_bi = new_next_bi;
664     }
665   else
666     {
667       if (~0 != reass->first_bi)
668         {
669           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
670         }
671       reass->first_bi = new_next_bi;
672     }
673   reass->data_len += ip6_reass_buffer_get_data_len (new_next_b);
674   rt->buffers_n += ip6_reass_get_buffer_chain_length (vm, new_next_b);
675 }
676
677 always_inline ip6_reass_rc_t
678 ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
679                   ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
680                   ip6_reass_t * reass, u32 * bi0, u32 * next0,
681                   u32 * error0, ip6_frag_hdr_t * frag_hdr,
682                   u32 ** vec_drop_overlap, u32 ** vec_drop_compress,
683                   bool is_feature)
684 {
685   int consumed = 0;
686   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
687   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
688   reass->next_index = fvnb->ip.reass.next_index;        // store next_index before it's overwritten
689   fvnb->ip.reass.ip6_frag_hdr_offset =
690     (u8 *) frag_hdr - (u8 *) vlib_buffer_get_current (fb);
691   ip6_header_t *fip = vlib_buffer_get_current (fb);
692   if (fb->current_length < sizeof (*fip) ||
693       fvnb->ip.reass.ip6_frag_hdr_offset == 0 ||
694       fvnb->ip.reass.ip6_frag_hdr_offset >= fb->current_length)
695     {
696       return IP6_REASS_RC_INTERNAL_ERROR;
697     }
698
699   u32 fragment_first = fvnb->ip.reass.fragment_first =
700     ip6_frag_hdr_offset_bytes (frag_hdr);
701   u32 fragment_length =
702     vlib_buffer_length_in_chain (vm, fb) -
703     (fvnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
704   u32 fragment_last = fvnb->ip.reass.fragment_last =
705     fragment_first + fragment_length - 1;
706   int more_fragments = ip6_frag_hdr_more (frag_hdr);
707   u32 candidate_range_bi = reass->first_bi;
708   u32 prev_range_bi = ~0;
709   fvnb->ip.reass.range_first = fragment_first;
710   fvnb->ip.reass.range_last = fragment_last;
711   fvnb->ip.reass.next_range_bi = ~0;
712   if (!more_fragments)
713     {
714       reass->last_packet_octet = fragment_last;
715     }
716   if (~0 == reass->first_bi)
717     {
718       // starting a new reassembly
719       ip6_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
720                                        *bi0);
721       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
722         {
723           ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
724         }
725       reass->min_fragment_length = clib_net_to_host_u16 (fip->payload_length);
726       *bi0 = ~0;
727       return IP6_REASS_RC_OK;
728     }
729   reass->min_fragment_length =
730     clib_min (clib_net_to_host_u16 (fip->payload_length),
731               fvnb->ip.reass.estimated_mtu);
732   while (~0 != candidate_range_bi)
733     {
734       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
735       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
736       if (fragment_first > candidate_vnb->ip.reass.range_last)
737         {
738           // this fragments starts after candidate range
739           prev_range_bi = candidate_range_bi;
740           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
741           if (candidate_vnb->ip.reass.range_last < fragment_last &&
742               ~0 == candidate_range_bi)
743             {
744               // special case - this fragment falls beyond all known ranges
745               ip6_reass_insert_range_in_chain (vm, rm, rt, reass,
746                                                prev_range_bi, *bi0);
747               consumed = 1;
748               break;
749             }
750           continue;
751         }
752       if (fragment_last < candidate_vnb->ip.reass.range_first)
753         {
754           // this fragment ends before candidate range without any overlap
755           ip6_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
756                                            *bi0);
757           consumed = 1;
758         }
759       else if (fragment_first == candidate_vnb->ip.reass.range_first &&
760                fragment_last == candidate_vnb->ip.reass.range_last)
761         {
762           // duplicate fragment - ignore
763         }
764       else
765         {
766           // overlapping fragment - not allowed by RFC 8200
767           ip6_reass_drop_all (vm, rm, reass, vec_drop_overlap);
768           ip6_reass_free (rm, rt, reass);
769           if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
770             {
771               ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_OVERLAP,
772                                    0);
773             }
774           *next0 = IP6_REASSEMBLY_NEXT_DROP;
775           *error0 = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT;
776           return IP6_REASS_RC_OK;
777         }
778       break;
779     }
780   if (consumed)
781     {
782       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
783         {
784           ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
785         }
786     }
787   if (~0 != reass->last_packet_octet &&
788       reass->data_len == reass->last_packet_octet + 1)
789     {
790       return ip6_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
791                                  vec_drop_compress, is_feature);
792     }
793   else
794     {
795       if (consumed)
796         {
797           *bi0 = ~0;
798         }
799       else
800         {
801           *next0 = IP6_REASSEMBLY_NEXT_DROP;
802           *error0 = IP6_ERROR_REASS_DUPLICATE_FRAGMENT;
803         }
804     }
805   return IP6_REASS_RC_OK;
806 }
807
808 always_inline bool
809 ip6_reass_verify_upper_layer_present (vlib_node_runtime_t * node,
810                                       vlib_buffer_t * b,
811                                       ip6_frag_hdr_t * frag_hdr)
812 {
813   ip6_ext_header_t *tmp = (ip6_ext_header_t *) frag_hdr;
814   while (ip6_ext_hdr (tmp->next_hdr))
815     {
816       tmp = ip6_ext_next_header (tmp);
817     }
818   if (IP_PROTOCOL_IP6_NONXT == tmp->next_hdr)
819     {
820       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
821                                    ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain,
822                                    0);
823       b->error = node->errors[IP6_ERROR_REASS_MISSING_UPPER];
824
825       return false;
826     }
827   return true;
828 }
829
830 always_inline bool
831 ip6_reass_verify_fragment_multiple_8 (vlib_main_t * vm,
832                                       vlib_node_runtime_t * node,
833                                       vlib_buffer_t * b,
834                                       ip6_frag_hdr_t * frag_hdr)
835 {
836   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
837   ip6_header_t *ip = vlib_buffer_get_current (b);
838   int more_fragments = ip6_frag_hdr_more (frag_hdr);
839   u32 fragment_length =
840     vlib_buffer_length_in_chain (vm, b) -
841     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
842   if (more_fragments && 0 != fragment_length % 8)
843     {
844       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
845                                    ICMP6_parameter_problem_erroneous_header_field,
846                                    (u8 *) & ip->payload_length - (u8 *) ip);
847       return false;
848     }
849   return true;
850 }
851
852 always_inline bool
853 ip6_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
854                                      vlib_node_runtime_t * node,
855                                      vlib_buffer_t * b,
856                                      ip6_frag_hdr_t * frag_hdr)
857 {
858   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
859   u32 fragment_first = ip6_frag_hdr_offset_bytes (frag_hdr);
860   u32 fragment_length =
861     vlib_buffer_length_in_chain (vm, b) -
862     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
863   if (fragment_first + fragment_length > 65535)
864     {
865       ip6_header_t *ip0 = vlib_buffer_get_current (b);
866       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
867                                    ICMP6_parameter_problem_erroneous_header_field,
868                                    (u8 *) & frag_hdr->fragment_offset_and_more
869                                    - (u8 *) ip0);
870       return false;
871     }
872   return true;
873 }
874
875 always_inline uword
876 ip6_reassembly_inline (vlib_main_t * vm,
877                        vlib_node_runtime_t * node,
878                        vlib_frame_t * frame, bool is_feature)
879 {
880   u32 *from = vlib_frame_vector_args (frame);
881   u32 n_left_from, n_left_to_next, *to_next, next_index;
882   ip6_reass_main_t *rm = &ip6_reass_main;
883   ip6_reass_per_thread_t *rt = &rm->per_thread_data[os_get_thread_index ()];
884   clib_spinlock_lock (&rt->lock);
885
886   n_left_from = frame->n_vectors;
887   next_index = node->cached_next_index;
888   static u32 *vec_timeout = NULL;       // indexes of buffers which timed out
889   static u32 *vec_drop_overlap = NULL;  // indexes of buffers dropped due to overlap
890   static u32 *vec_drop_internal_error = NULL;   // indexes of buffers dropped due to internal errors
891   static u32 *vec_drop_compress = NULL; // indexes of buffers dropped due to buffer compression
892   while (n_left_from > 0 || vec_len (vec_timeout) > 0
893          || vec_len (vec_drop_overlap) > 0 || vec_len (vec_drop_compress) > 0
894          || vec_len (vec_drop_internal_error) > 0)
895     {
896       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
897
898       while (vec_len (vec_timeout) > 0 && n_left_to_next > 0)
899         {
900           u32 bi = vec_pop (vec_timeout);
901           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
902           b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
903           to_next[0] = bi;
904           to_next += 1;
905           n_left_to_next -= 1;
906           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
907                                            n_left_to_next, bi,
908                                            IP6_REASSEMBLY_NEXT_DROP);
909           --rt->buffers_n;
910         }
911
912       while (vec_len (vec_drop_overlap) > 0 && n_left_to_next > 0)
913         {
914           u32 bi = vec_pop (vec_drop_overlap);
915           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
916           b->error = node->errors[IP6_ERROR_REASS_OVERLAPPING_FRAGMENT];
917           to_next[0] = bi;
918           to_next += 1;
919           n_left_to_next -= 1;
920           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
921                                            n_left_to_next, bi,
922                                            IP6_REASSEMBLY_NEXT_DROP);
923           --rt->buffers_n;
924         }
925
926       while (vec_len (vec_drop_compress) > 0 && n_left_to_next > 0)
927         {
928           u32 bi = vec_pop (vec_drop_compress);
929           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
930           b->error = node->errors[IP6_ERROR_NONE];
931           to_next[0] = bi;
932           to_next += 1;
933           n_left_to_next -= 1;
934           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
935                                            n_left_to_next, bi,
936                                            IP6_REASSEMBLY_NEXT_DROP);
937           --rt->buffers_n;
938         }
939       while (vec_len (vec_drop_internal_error) > 0 && n_left_to_next > 0)
940         {
941           u32 bi = vec_pop (vec_drop_internal_error);
942           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
943           b->error = node->errors[IP6_ERROR_REASS_INTERNAL_ERROR];
944           to_next[0] = bi;
945           to_next += 1;
946           n_left_to_next -= 1;
947           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
948                                            n_left_to_next, bi,
949                                            IP6_REASSEMBLY_NEXT_DROP);
950           --rt->buffers_n;
951         }
952
953       while (n_left_from > 0 && n_left_to_next > 0)
954         {
955           u32 bi0;
956           vlib_buffer_t *b0;
957           u32 next0;
958           u32 error0 = IP6_ERROR_NONE;
959           u32 icmp_bi = ~0;
960
961           bi0 = from[0];
962           b0 = vlib_get_buffer (vm, bi0);
963
964           ip6_header_t *ip0 = vlib_buffer_get_current (b0);
965           ip6_frag_hdr_t *frag_hdr = NULL;
966           ip6_ext_header_t *prev_hdr;
967           if (ip6_ext_hdr (ip0->protocol))
968             {
969               ip6_ext_header_find_t (ip0, prev_hdr, frag_hdr,
970                                      IP_PROTOCOL_IPV6_FRAGMENTATION);
971             }
972           if (!frag_hdr)
973             {
974               // this is a regular packet - no fragmentation
975               next0 = IP6_REASSEMBLY_NEXT_INPUT;
976               goto skip_reass;
977             }
978           if (0 == ip6_frag_hdr_offset (frag_hdr))
979             {
980               // first fragment - verify upper-layer is present
981               if (!ip6_reass_verify_upper_layer_present (node, b0, frag_hdr))
982                 {
983                   next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
984                   goto skip_reass;
985                 }
986             }
987           if (!ip6_reass_verify_fragment_multiple_8 (vm, node, b0, frag_hdr)
988               || !ip6_reass_verify_packet_size_lt_64k (vm, node, b0,
989                                                        frag_hdr))
990             {
991               next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
992               goto skip_reass;
993             }
994           vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset =
995             (u8 *) frag_hdr - (u8 *) ip0;
996
997           ip6_reass_key_t k;
998           k.as_u64[0] = ip0->src_address.as_u64[0];
999           k.as_u64[1] = ip0->src_address.as_u64[1];
1000           k.as_u64[2] = ip0->dst_address.as_u64[0];
1001           k.as_u64[3] = ip0->dst_address.as_u64[1];
1002           k.as_u64[4] =
1003             (u64) vnet_buffer (b0)->
1004             sw_if_index[VLIB_RX] << 32 | frag_hdr->identification;
1005           k.as_u64[5] = ip0->protocol;
1006           ip6_reass_t *reass =
1007             ip6_reass_find_or_create (vm, node, rm, rt, &k, &icmp_bi,
1008                                       &vec_timeout);
1009
1010           if (reass)
1011             {
1012               switch (ip6_reass_update (vm, node, rm, rt, reass, &bi0, &next0,
1013                                         &error0, frag_hdr, &vec_drop_overlap,
1014                                         &vec_drop_compress, is_feature))
1015                 {
1016                 case IP6_REASS_RC_OK:
1017                   /* nothing to do here */
1018                   break;
1019                 case IP6_REASS_RC_INTERNAL_ERROR:
1020                   /* drop everything and start with a clean slate */
1021                   ip6_reass_drop_all (vm, rm, reass,
1022                                       &vec_drop_internal_error);
1023                   ip6_reass_free (rm, rt, reass);
1024                   goto next_packet;
1025                   break;
1026                 }
1027             }
1028           else
1029             {
1030               next0 = IP6_REASSEMBLY_NEXT_DROP;
1031               error0 = IP6_ERROR_REASS_LIMIT_REACHED;
1032             }
1033
1034           b0->error = node->errors[error0];
1035
1036           if (~0 != bi0)
1037             {
1038             skip_reass:
1039               to_next[0] = bi0;
1040               to_next += 1;
1041               n_left_to_next -= 1;
1042               if (is_feature && IP6_ERROR_NONE == error0)
1043                 {
1044                   b0 = vlib_get_buffer (vm, bi0);
1045                   vnet_feature_next (&next0, b0);
1046                 }
1047               vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1048                                                n_left_to_next, bi0, next0);
1049             }
1050
1051           if (~0 != icmp_bi)
1052             {
1053               next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
1054               to_next[0] = icmp_bi;
1055               to_next += 1;
1056               n_left_to_next -= 1;
1057               vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1058                                                n_left_to_next, icmp_bi,
1059                                                next0);
1060             }
1061         next_packet:
1062           from += 1;
1063           n_left_from -= 1;
1064         }
1065
1066       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1067     }
1068
1069   clib_spinlock_unlock (&rt->lock);
1070   return frame->n_vectors;
1071 }
1072
/*
 * Error strings shared by the reassembly nodes: the X-macro keeps only the
 * string of every (sym, string) pair listed by foreach_ip6_error.
 */
static char *ip6_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip6_error
#undef _
};
1078
1079 static uword
1080 ip6_reassembly (vlib_main_t * vm, vlib_node_runtime_t * node,
1081                 vlib_frame_t * frame)
1082 {
1083   return ip6_reassembly_inline (vm, node, frame, false /* is_feature */ );
1084 }
1085
/*
 * Registration of the "ip6-reassembly" graph node. It runs ip6_reassembly ()
 * and can hand buffers to ip6-input, ip6-drop or ip6-icmp-error.
 */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_reass_node, static) = {
    .function = ip6_reassembly,
    .name = "ip6-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip6_reass_trace,
    .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
    .error_strings = ip6_reassembly_error_strings,
    .n_next_nodes = IP6_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP6_REASSEMBLY_NEXT_INPUT] = "ip6-input",
                [IP6_REASSEMBLY_NEXT_DROP] = "ip6-drop",
                [IP6_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
        },
};
/* *INDENT-ON* */

VLIB_NODE_FUNCTION_MULTIARCH (ip6_reass_node, ip6_reassembly);
1105
1106 static uword
1107 ip6_reassembly_feature (vlib_main_t * vm,
1108                         vlib_node_runtime_t * node, vlib_frame_t * frame)
1109 {
1110   return ip6_reassembly_inline (vm, node, frame, true /* is_feature */ );
1111 }
1112
/*
 * Registration of the "ip6-reassembly-feature" graph node. It runs
 * ip6_reassembly_feature () and declares the same next nodes and error
 * strings as "ip6-reassembly".
 */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_reass_node_feature, static) = {
    .function = ip6_reassembly_feature,
    .name = "ip6-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip6_reass_trace,
    .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
    .error_strings = ip6_reassembly_error_strings,
    .n_next_nodes = IP6_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP6_REASSEMBLY_NEXT_INPUT] = "ip6-input",
                [IP6_REASSEMBLY_NEXT_DROP] = "ip6-drop",
                [IP6_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
        },
};
/* *INDENT-ON* */

VLIB_NODE_FUNCTION_MULTIARCH (ip6_reass_node_feature, ip6_reassembly_feature);
1132
/*
 * Hooks "ip6-reassembly-feature" into the "ip6-unicast" feature arc, ordered
 * before "ip6-lookup". The feature is switched per interface through
 * ip6_reass_enable_disable ().
 */
/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip6_reassembly_feature, static) = {
    .arc_name = "ip6-unicast",
    .node_name = "ip6-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip6-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1141
1142 static u32
1143 ip6_reass_get_nbuckets ()
1144 {
1145   ip6_reass_main_t *rm = &ip6_reass_main;
1146   u32 nbuckets;
1147   u8 i;
1148
1149   nbuckets = (u32) (rm->max_reass_n / IP6_REASS_HT_LOAD_FACTOR);
1150
1151   for (i = 0; i < 31; i++)
1152     if ((1 << i) >= nbuckets)
1153       break;
1154   nbuckets = 1 << i;
1155
1156   return nbuckets;
1157 }
1158
/* Events understood by the "ip6-reassembly-expire-walk" process node. */
typedef enum
{
  /* signalled by ip6_reass_set () after the parameters were changed */
  IP6_EVENT_CONFIG_CHANGED = 1,
} ip6_reass_event_t;
1163
/* Context handed to ip6_rehash_cb () while entries are copied to a new hash. */
typedef struct
{
  /* set to 1 by the callback when an insertion into new_hash fails */
  int failure;
  /* destination hash table receiving the copied key/value pairs */
  clib_bihash_48_8_t *new_hash;
} ip6_rehash_cb_ctx;
1169
1170 static void
1171 ip6_rehash_cb (clib_bihash_kv_48_8_t * kv, void *_ctx)
1172 {
1173   ip6_rehash_cb_ctx *ctx = _ctx;
1174   if (clib_bihash_add_del_48_8 (ctx->new_hash, kv, 1))
1175     {
1176       ctx->failure = 1;
1177     }
1178 }
1179
1180 static void
1181 ip6_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1182                       u32 expire_walk_interval_ms)
1183 {
1184   ip6_reass_main.timeout_ms = timeout_ms;
1185   ip6_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1186   ip6_reass_main.max_reass_n = max_reassemblies;
1187   ip6_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1188 }
1189
1190 vnet_api_error_t
1191 ip6_reass_set (u32 timeout_ms, u32 max_reassemblies,
1192                u32 expire_walk_interval_ms)
1193 {
1194   u32 old_nbuckets = ip6_reass_get_nbuckets ();
1195   ip6_reass_set_params (timeout_ms, max_reassemblies,
1196                         expire_walk_interval_ms);
1197   vlib_process_signal_event (ip6_reass_main.vlib_main,
1198                              ip6_reass_main.ip6_reass_expire_node_idx,
1199                              IP6_EVENT_CONFIG_CHANGED, 0);
1200   u32 new_nbuckets = ip6_reass_get_nbuckets ();
1201   if (ip6_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
1202     {
1203       clib_bihash_48_8_t new_hash;
1204       clib_memset (&new_hash, 0, sizeof (new_hash));
1205       ip6_rehash_cb_ctx ctx;
1206       ctx.failure = 0;
1207       ctx.new_hash = &new_hash;
1208       clib_bihash_init_48_8 (&new_hash, "ip6-reass", new_nbuckets,
1209                              new_nbuckets * 1024);
1210       clib_bihash_foreach_key_value_pair_48_8 (&ip6_reass_main.hash,
1211                                                ip6_rehash_cb, &ctx);
1212       if (ctx.failure)
1213         {
1214           clib_bihash_free_48_8 (&new_hash);
1215           return -1;
1216         }
1217       else
1218         {
1219           clib_bihash_free_48_8 (&ip6_reass_main.hash);
1220           clib_memcpy_fast (&ip6_reass_main.hash, &new_hash,
1221                             sizeof (ip6_reass_main.hash));
1222         }
1223     }
1224   return 0;
1225 }
1226
1227 vnet_api_error_t
1228 ip6_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1229                u32 * expire_walk_interval_ms)
1230 {
1231   *timeout_ms = ip6_reass_main.timeout_ms;
1232   *max_reassemblies = ip6_reass_main.max_reass_n;
1233   *expire_walk_interval_ms = ip6_reass_main.expire_walk_interval_ms;
1234   return 0;
1235 }
1236
1237 static clib_error_t *
1238 ip6_reass_init_function (vlib_main_t * vm)
1239 {
1240   ip6_reass_main_t *rm = &ip6_reass_main;
1241   clib_error_t *error = 0;
1242   u32 nbuckets;
1243   vlib_node_t *node;
1244
1245   rm->vlib_main = vm;
1246   rm->vnet_main = vnet_get_main ();
1247
1248   vec_validate (rm->per_thread_data, vlib_num_workers ());
1249   ip6_reass_per_thread_t *rt;
1250   vec_foreach (rt, rm->per_thread_data)
1251   {
1252     clib_spinlock_init (&rt->lock);
1253     pool_alloc (rt->pool, rm->max_reass_n);
1254   }
1255
1256   node = vlib_get_node_by_name (vm, (u8 *) "ip6-reassembly-expire-walk");
1257   ASSERT (node);
1258   rm->ip6_reass_expire_node_idx = node->index;
1259
1260   ip6_reass_set_params (IP6_REASS_TIMEOUT_DEFAULT_MS,
1261                         IP6_REASS_MAX_REASSEMBLIES_DEFAULT,
1262                         IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1263
1264   nbuckets = ip6_reass_get_nbuckets ();
1265   clib_bihash_init_48_8 (&rm->hash, "ip6-reass", nbuckets, nbuckets * 1024);
1266
1267   node = vlib_get_node_by_name (vm, (u8 *) "ip6-drop");
1268   ASSERT (node);
1269   rm->ip6_drop_idx = node->index;
1270   node = vlib_get_node_by_name (vm, (u8 *) "ip6-icmp-error");
1271   ASSERT (node);
1272   rm->ip6_icmp_error_idx = node->index;
1273
1274   if ((error = vlib_call_init_function (vm, ip_main_init)))
1275     return error;
1276   ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
1277                          ip6_reass_node.index);
1278
1279   return error;
1280 }
1281
1282 VLIB_INIT_FUNCTION (ip6_reass_init_function);
1283
/**
 * Process node function which periodically expires stale reassemblies.
 *
 * Each iteration waits for an event or for the configured walk interval,
 * then, for every thread's reassembly pool (under that thread's lock), finds
 * the reassemblies not heard from for longer than the timeout, runs
 * ip6_reass_on_timeout () on them and frees them. Buffers collected in
 * vec_timeout are sent to the ip6-drop node and buffers collected in
 * vec_icmp_bi to the ip6-icmp-error node.
 *
 * @param vm    vlib main.
 * @param node  runtime of this process node.
 * @param f     unused by the body (shadowed by local frames below).
 *
 * @return 0; the loop never terminates.
 */
static uword
ip6_reass_walk_expired (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip6_reass_main_t *rm = &ip6_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      /* sleep until signalled or until the walk interval (in seconds) elapses */
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64) rm->expire_walk_interval_ms
                                            / (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP6_EVENT_CONFIG_CHANGED:
          /* the new interval is picked up by the next wait above */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip6_reass_t *reass;
      u32 *vec_timeout = NULL;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      /* per_thread_data holds one entry per worker plus one */
      const uword nthreads = vlib_num_workers () + 1;
      u32 *vec_icmp_bi = NULL;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip6_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* first collect the expired indexes, then free them in a second
           * pass so the pool is not modified while it is being walked */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip6_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            u32 icmp_bi = ~0;
            u32 before = vec_len (vec_timeout);
            vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi);
            if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
              {
                if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
                                        b->trace_index))
                  {
                    /* the trace is gone, don't trace this buffer anymore */
                    b->flags &= ~VLIB_BUFFER_IS_TRACED;
                  }
              }
            ip6_reass_on_timeout (vm, node, rm, rt, reass, &icmp_bi, &vec_timeout);
            /* every buffer appended to vec_timeout leaves this thread's count */
            u32 after = vec_len (vec_timeout);
            rt->buffers_n -= (after - before);
            if (~0 != icmp_bi)
              {
                vec_add1 (vec_icmp_bi, icmp_bi);
              }
            ip6_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      /* hand the timed-out buffers to ip6-drop, one frame at a time */
      while (vec_len (vec_timeout) > 0)
        {
          /* note: this local f shadows the function parameter f */
          vlib_frame_t *f = vlib_get_frame_to_node (vm, rm->ip6_drop_idx);
          u32 *to_next = vlib_frame_vector_args (f);
          u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors;
          int trace_frame = 0;
          while (vec_len (vec_timeout) > 0 && n_left_to_next > 0)
            {
              u32 bi = vec_pop (vec_timeout);
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
                {
                  if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
                                          b->trace_index))
                    {
                      /* the trace is gone, don't trace this buffer anymore */
                      b->flags &= ~VLIB_BUFFER_IS_TRACED;
                    }
                  else
                    {
                      trace_frame = 1;
                    }
                }
              b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
              to_next[0] = bi;
              ++f->n_vectors;
              to_next += 1;
              n_left_to_next -= 1;
            }
          /* flag the frame as traced if any buffer in it still has a trace */
          f->frame_flags |= (trace_frame * VLIB_FRAME_TRACE);
          vlib_put_frame_to_node (vm, rm->ip6_drop_idx, f);
        }

      /* hand the buffers returned through icmp_bi to ip6-icmp-error */
      while (vec_len (vec_icmp_bi) > 0)
        {
          /* note: this local f shadows the function parameter f */
          vlib_frame_t *f =
            vlib_get_frame_to_node (vm, rm->ip6_icmp_error_idx);
          u32 *to_next = vlib_frame_vector_args (f);
          u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors;
          int trace_frame = 0;
          while (vec_len (vec_icmp_bi) > 0 && n_left_to_next > 0)
            {
              u32 bi = vec_pop (vec_icmp_bi);
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
                {
                  if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
                                          b->trace_index))
                    {
                      /* the trace is gone, don't trace this buffer anymore */
                      b->flags &= ~VLIB_BUFFER_IS_TRACED;
                    }
                  else
                    {
                      trace_frame = 1;
                    }
                }
              b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
              to_next[0] = bi;
              ++f->n_vectors;
              to_next += 1;
              n_left_to_next -= 1;
            }
          f->frame_flags |= (trace_frame * VLIB_FRAME_TRACE);
          vlib_put_frame_to_node (vm, rm->ip6_icmp_error_idx, f);
        }

      vec_free (pool_indexes_to_free);
      vec_free (vec_timeout);
      vec_free (vec_icmp_bi);
      /* keep the event vector allocated, only reset its length for reuse */
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1443
static vlib_node_registration_t ip6_reass_expire_node;

/*
 * Registration of the "ip6-reassembly-expire-walk" process node, which runs
 * ip6_reass_walk_expired (). It shares the error strings of the reassembly
 * nodes.
 */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_reass_expire_node, static) = {
    .function = ip6_reass_walk_expired,
    .format_trace = format_ip6_reass_trace,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip6-reassembly-expire-walk",

    .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
    .error_strings = ip6_reassembly_error_strings,

};
/* *INDENT-ON* */
1458
1459 static u8 *
1460 format_ip6_reass_key (u8 * s, va_list * args)
1461 {
1462   ip6_reass_key_t *key = va_arg (*args, ip6_reass_key_t *);
1463   s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1464               key->xx_id, format_ip6_address, &key->src, format_ip6_address,
1465               &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1466   return s;
1467 }
1468
1469 static u8 *
1470 format_ip6_reass (u8 * s, va_list * args)
1471 {
1472   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1473   ip6_reass_t *reass = va_arg (*args, ip6_reass_t *);
1474
1475   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1476               "last_packet_octet: %u, trace_op_counter: %u\n",
1477               reass->id, format_ip6_reass_key, &reass->key, reass->first_bi,
1478               reass->data_len, reass->last_packet_octet,
1479               reass->trace_op_counter);
1480   u32 bi = reass->first_bi;
1481   u32 counter = 0;
1482   while (~0 != bi)
1483     {
1484       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1485       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1486       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1487                   "fragment[%u, %u]\n",
1488                   counter, vnb->ip.reass.range_first,
1489                   vnb->ip.reass.range_last, bi,
1490                   ip6_reass_buffer_get_data_offset (b),
1491                   ip6_reass_buffer_get_data_len (b),
1492                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1493       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1494         {
1495           bi = b->next_buffer;
1496         }
1497       else
1498         {
1499           bi = ~0;
1500         }
1501     }
1502   return s;
1503 }
1504
1505 static clib_error_t *
1506 show_ip6_reass (vlib_main_t * vm, unformat_input_t * input,
1507                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1508 {
1509   ip6_reass_main_t *rm = &ip6_reass_main;
1510
1511   vlib_cli_output (vm, "---------------------");
1512   vlib_cli_output (vm, "IP6 reassembly status");
1513   vlib_cli_output (vm, "---------------------");
1514   bool details = false;
1515   if (unformat (input, "details"))
1516     {
1517       details = true;
1518     }
1519
1520   u32 sum_reass_n = 0;
1521   u64 sum_buffers_n = 0;
1522   ip6_reass_t *reass;
1523   uword thread_index;
1524   const uword nthreads = vlib_num_workers () + 1;
1525   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1526     {
1527       ip6_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1528       clib_spinlock_lock (&rt->lock);
1529       if (details)
1530         {
1531           /* *INDENT-OFF* */
1532           pool_foreach (reass, rt->pool, {
1533             vlib_cli_output (vm, "%U", format_ip6_reass, vm, reass);
1534           });
1535           /* *INDENT-ON* */
1536         }
1537       sum_reass_n += rt->reass_n;
1538       sum_buffers_n += rt->buffers_n;
1539       clib_spinlock_unlock (&rt->lock);
1540     }
1541   vlib_cli_output (vm, "---------------------");
1542   vlib_cli_output (vm, "Current IP6 reassemblies count: %lu\n",
1543                    (long unsigned) sum_reass_n);
1544   vlib_cli_output (vm, "Maximum configured concurrent IP6 reassemblies per "
1545                    "worker-thread: %lu\n", (long unsigned) rm->max_reass_n);
1546   vlib_cli_output (vm, "Buffers in use: %lu\n",
1547                    (long unsigned) sum_buffers_n);
1548   return 0;
1549 }
1550
/* CLI registration: "show ip6-reassembly [details]" is handled by show_ip6_reass (). */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip6_reassembly_cmd, static) = {
    .path = "show ip6-reassembly",
    .short_help = "show ip6-reassembly [details]",
    .function = show_ip6_reass,
};
/* *INDENT-ON* */
1558
1559 vnet_api_error_t
1560 ip6_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1561 {
1562   return vnet_feature_enable_disable ("ip6-unicast", "ip6-reassembly-feature",
1563                                       sw_if_index, enable_disable, 0, 0);
1564 }
1565
1566 /*
1567  * fd.io coding-style-patch-verification: ON
1568  *
1569  * Local Variables:
1570  * eval: (c-set-style "gnu")
1571  * End:
1572  */