reassembly: replace asserts with error counters
[vpp.git] / src / vnet / ip / ip6_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv6 Reassembly.
19  *
20  * This file contains the source code for IPv6 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_48_8.h>
27 #include <vnet/ip/ip6_reassembly.h>
28
/* Milliseconds per second, for converting between ms and second values. */
#define MSEC_PER_SEC 1000
/* Default reassembly timeout in milliseconds (going by the name; the
 * consumer of this macro is not visible in this part of the file). */
#define IP6_REASS_TIMEOUT_DEFAULT_MS 100
#define IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* Default cap on the number of simultaneous reassemblies - presumably the
 * initial value of ip6_reass_main_t.max_reass_n; TODO confirm at init. */
#define IP6_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* Hash table load factor - presumably used when sizing the bihash; the
 * sizing code is not visible here. */
#define IP6_REASS_HT_LOAD_FACTOR (0.75)
34
/**
 * Result code of the internal reassembly helpers (ip6_reass_update,
 * ip6_reass_finalize).
 */
typedef enum
{
  // operation completed
  IP6_REASS_RC_OK,
  // a consistency check on the reassembly state or buffer layout failed
  IP6_REASS_RC_INTERNAL_ERROR,
} ip6_reass_rc_t;
40
/**
 * Key identifying one reassembly in the 48-byte bihash.
 *
 * The named fields and as_u64[] are two views of the same 48 bytes; the
 * code in this file copies the key into clib_bihash_kv_48_8_t.key[0..5]
 * through the as_u64[] view.
 */
typedef struct
{
  union
  {
    struct
    {
      ip6_address_t src;
      ip6_address_t dst;
      // NOTE(review): meaning of xx_id is not visible in this chunk -
      // presumably a per-interface/FIB discriminator; confirm at the caller
      u32 xx_id;
      u32 frag_id;
      // padding so that the named view fills all six u64 words
      u8 unused[7];
      u8 proto;
    };
    u64 as_u64[6];
  };
} ip6_reass_key_t;
57
58 always_inline u32
59 ip6_reass_buffer_get_data_offset (vlib_buffer_t * b)
60 {
61   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
62   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
63 }
64
65 always_inline u16
66 ip6_reass_buffer_get_data_len (vlib_buffer_t * b)
67 {
68   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
69   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
70     (vnb->ip.reass.fragment_first + ip6_reass_buffer_get_data_offset (b)) + 1;
71 }
72
/**
 * State of one in-progress reassembly. Instances live in the per-thread
 * pool (ip6_reass_per_thread_t.pool) and are looked up through the hash
 * table by their key.
 */
typedef struct
{
  // hash table key
  ip6_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  // (~0 while no fragment has been stored yet)
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u8 next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
} ip6_reass_t;
94
/**
 * Per-thread reassembly state.
 */
typedef struct
{
  // pool of reassembly contexts; hash table values are indices into it
  ip6_reass_t *pool;
  // number of reassemblies currently allocated from the pool
  u32 reass_n;
  // number of buffers currently accounted to reassemblies on this thread
  u32 buffers_n;
  // running counter used to build ip6_reass_t.id
  u32 id_counter;
  // taken by the reassembly node for the duration of frame processing
  clib_spinlock_t lock;
} ip6_reass_per_thread_t;
103
/**
 * Global IPv6 reassembly state: configuration, the lookup hash and the
 * per-thread data.
 */
typedef struct
{
  // IPv6 config
  u32 timeout_ms;
  // timeout in the unit of vlib_time_now() - a reassembly whose last_heard
  // is older than this is expired on lookup
  f64 timeout;
  u32 expire_walk_interval_ms;
  // limit checked against the per-thread reass_n before creating a context
  u32 max_reass_n;

  // IPv6 runtime
  // maps ip6_reass_key_t to the index of the context in the per-thread pool
  clib_bihash_48_8_t hash;

  // per-thread data
  ip6_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip6-drop node
  u32 ip6_drop_idx;
  u32 ip6_icmp_error_idx;
  u32 ip6_reass_expire_node_idx;

} ip6_reass_main_t;

// the single instance of the reassembly state used by the nodes in this file
ip6_reass_main_t ip6_reass_main;
130
/**
 * Next-node indices used by the reassembly node when enqueueing buffers.
 */
typedef enum
{
  // used for a completed packet when running as a feature
  IP6_REASSEMBLY_NEXT_INPUT,
  // buffers being discarded (timeout, overlap, duplicate, compression, ...)
  IP6_REASSEMBLY_NEXT_DROP,
  IP6_REASSEMBLY_NEXT_ICMP_ERROR,
  // number of next nodes - keep last
  IP6_REASSEMBLY_N_NEXT,
} ip6_reass_next_t;
138
/**
 * Kind of event recorded in a reassembly trace entry; selects the text
 * printed by format_ip6_reass_trace.
 */
typedef enum
{
  // fragment stored as a new range
  RANGE_NEW,
  // fragment overlapped an already stored range
  RANGE_OVERLAP,
  // reassembly timed out, ICMP "time exceeded" is being generated
  ICMP_ERROR_RT_EXCEEDED,
  // fragment would make the packet longer than 65535 octets
  ICMP_ERROR_FL_TOO_BIG,
  // non-final fragment whose length is not a multiple of 8
  ICMP_ERROR_FL_NOT_MULT_8,
  // all fragments collected, packet is being finalized
  FINALIZE,
} ip6_reass_trace_operation_e;
148
/**
 * Snapshot of one buffer's range metadata, captured for tracing.
 */
typedef struct
{
  // range boundaries copied from the buffer's reassembly metadata
  u16 range_first;
  u16 range_last;
  // index of the buffer the snapshot was taken from
  u32 range_bi;
  // result of ip6_reass_buffer_get_data_offset() for that buffer
  i32 data_offset;
  // result of ip6_reass_buffer_get_data_len() for that buffer
  u32 data_len;
  // first buffer of the reassembly at the time of the trace
  u32 first_bi;
} ip6_reass_range_trace_t;
158
/**
 * One reassembly trace record, filled by ip6_reass_add_trace and printed by
 * format_ip6_reass_trace.
 */
typedef struct
{
  // what happened
  ip6_reass_trace_operation_e action;
  // ip6_reass_t.id of the reassembly (stored truncated to 32 bits)
  u32 reass_id;
  // range metadata of the traced buffer
  ip6_reass_range_trace_t trace_range;
  // value passed by the caller; all callers visible here pass 0
  u32 size_diff;
  // sequence number of this record within the reassembly
  u32 op_id;
  // fragment boundaries of the traced buffer
  u32 fragment_first;
  u32 fragment_last;
  // ip6_reass_t.data_len at the time of the trace
  u32 total_data_len;
} ip6_reass_trace_t;
170
171 static void
172 ip6_reass_trace_details (vlib_main_t * vm, u32 bi,
173                          ip6_reass_range_trace_t * trace)
174 {
175   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
176   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
177   trace->range_first = vnb->ip.reass.range_first;
178   trace->range_last = vnb->ip.reass.range_last;
179   trace->data_offset = ip6_reass_buffer_get_data_offset (b);
180   trace->data_len = ip6_reass_buffer_get_data_len (b);
181   trace->range_bi = bi;
182 }
183
184 static u8 *
185 format_ip6_reass_range_trace (u8 * s, va_list * args)
186 {
187   ip6_reass_range_trace_t *trace = va_arg (*args, ip6_reass_range_trace_t *);
188   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
189               trace->range_last, trace->data_offset, trace->data_len,
190               trace->range_bi);
191   return s;
192 }
193
194 static u8 *
195 format_ip6_reass_trace (u8 * s, va_list * args)
196 {
197   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
198   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
199   ip6_reass_trace_t *t = va_arg (*args, ip6_reass_trace_t *);
200   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
201   u32 indent = format_get_indent (s);
202   s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
203               t->trace_range.first_bi, t->total_data_len, t->fragment_first,
204               t->fragment_last);
205   switch (t->action)
206     {
207     case RANGE_NEW:
208       s = format (s, "\n%Unew %U", format_white_space, indent,
209                   format_ip6_reass_range_trace, &t->trace_range);
210       break;
211     case RANGE_OVERLAP:
212       s = format (s, "\n%Uoverlap %U", format_white_space, indent,
213                   format_ip6_reass_range_trace, &t->trace_range);
214       break;
215     case ICMP_ERROR_FL_TOO_BIG:
216       s = format (s, "\n%Uicmp-error - frag_len > 65535 %U",
217                   format_white_space, indent, format_ip6_reass_range_trace,
218                   &t->trace_range);
219       break;
220     case ICMP_ERROR_FL_NOT_MULT_8:
221       s = format (s, "\n%Uicmp-error - frag_len mod 8 != 0 %U",
222                   format_white_space, indent, format_ip6_reass_range_trace,
223                   &t->trace_range);
224       break;
225     case ICMP_ERROR_RT_EXCEEDED:
226       s = format (s, "\n%Uicmp-error - reassembly time exceeded",
227                   format_white_space, indent);
228       break;
229     case FINALIZE:
230       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
231       break;
232     }
233   return s;
234 }
235
236 static void
237 ip6_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
238                      ip6_reass_main_t * rm, ip6_reass_t * reass,
239                      u32 bi, ip6_reass_trace_operation_e action,
240                      u32 size_diff)
241 {
242   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
243   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
244   if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
245     {
246       // this buffer's trace is gone
247       b->flags &= ~VLIB_BUFFER_IS_TRACED;
248       return;
249     }
250   ip6_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
251   t->reass_id = reass->id;
252   t->action = action;
253   ip6_reass_trace_details (vm, bi, &t->trace_range);
254   t->size_diff = size_diff;
255   t->op_id = reass->trace_op_counter;
256   ++reass->trace_op_counter;
257   t->fragment_first = vnb->ip.reass.fragment_first;
258   t->fragment_last = vnb->ip.reass.fragment_last;
259   t->trace_range.first_bi = reass->first_bi;
260   t->total_data_len = reass->data_len;
261 #if 0
262   static u8 *s = NULL;
263   s = format (s, "%U", format_ip6_reass_trace, NULL, NULL, t);
264   printf ("%.*s\n", vec_len (s), s);
265   fflush (stdout);
266   vec_reset_length (s);
267 #endif
268 }
269
270 always_inline void
271 ip6_reass_free (ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
272                 ip6_reass_t * reass)
273 {
274   clib_bihash_kv_48_8_t kv;
275   kv.key[0] = reass->key.as_u64[0];
276   kv.key[1] = reass->key.as_u64[1];
277   kv.key[2] = reass->key.as_u64[2];
278   kv.key[3] = reass->key.as_u64[3];
279   kv.key[4] = reass->key.as_u64[4];
280   kv.key[5] = reass->key.as_u64[5];
281   clib_bihash_add_del_48_8 (&rm->hash, &kv, 0);
282   pool_put (rt->pool, reass);
283   --rt->reass_n;
284 }
285
286 always_inline void
287 ip6_reass_drop_all (vlib_main_t * vm, ip6_reass_main_t * rm,
288                     ip6_reass_t * reass, u32 ** vec_drop_bi)
289 {
290   u32 range_bi = reass->first_bi;
291   vlib_buffer_t *range_b;
292   vnet_buffer_opaque_t *range_vnb;
293   while (~0 != range_bi)
294     {
295       range_b = vlib_get_buffer (vm, range_bi);
296       range_vnb = vnet_buffer (range_b);
297       u32 bi = range_bi;
298       while (~0 != bi)
299         {
300           vec_add1 (*vec_drop_bi, bi);
301           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
302           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
303             {
304               bi = b->next_buffer;
305               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
306             }
307           else
308             {
309               bi = ~0;
310             }
311         }
312       range_bi = range_vnb->ip.reass.next_range_bi;
313     }
314 }
315
316 always_inline void
317 ip6_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node,
318                       ip6_reass_main_t * rm, ip6_reass_t * reass,
319                       u32 * icmp_bi, u32 ** vec_timeout)
320 {
321   if (~0 == reass->first_bi)
322     {
323       return;
324     }
325   vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi);
326   if (0 == vnet_buffer (b)->ip.reass.fragment_first)
327     {
328       *icmp_bi = reass->first_bi;
329       if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
330         {
331           ip6_reass_add_trace (vm, node, rm, reass, reass->first_bi,
332                                ICMP_ERROR_RT_EXCEEDED, 0);
333         }
334       // fragment with offset zero received - send icmp message back
335       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
336         {
337           // separate first buffer from chain and steer it towards icmp node
338           b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
339           reass->first_bi = b->next_buffer;
340         }
341       else
342         {
343           reass->first_bi = vnet_buffer (b)->ip.reass.next_range_bi;
344         }
345       icmp6_error_set_vnet_buffer (b, ICMP6_time_exceeded,
346                                    ICMP6_time_exceeded_fragment_reassembly_time_exceeded,
347                                    0);
348     }
349   ip6_reass_drop_all (vm, rm, reass, vec_timeout);
350 }
351
352 always_inline ip6_reass_t *
353 ip6_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
354                           ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
355                           ip6_reass_key_t * k, u32 * icmp_bi,
356                           u32 ** vec_timeout)
357 {
358   ip6_reass_t *reass = NULL;
359   f64 now = vlib_time_now (rm->vlib_main);
360   clib_bihash_kv_48_8_t kv, value;
361   kv.key[0] = k->as_u64[0];
362   kv.key[1] = k->as_u64[1];
363   kv.key[2] = k->as_u64[2];
364   kv.key[3] = k->as_u64[3];
365   kv.key[4] = k->as_u64[4];
366   kv.key[5] = k->as_u64[5];
367
368   if (!clib_bihash_search_48_8 (&rm->hash, &kv, &value))
369     {
370       reass = pool_elt_at_index (rt->pool, value.value);
371       if (now > reass->last_heard + rm->timeout)
372         {
373           ip6_reass_on_timeout (vm, node, rm, reass, icmp_bi, vec_timeout);
374           ip6_reass_free (rm, rt, reass);
375           reass = NULL;
376         }
377     }
378
379   if (reass)
380     {
381       reass->last_heard = now;
382       return reass;
383     }
384
385   if (rt->reass_n >= rm->max_reass_n)
386     {
387       reass = NULL;
388       return reass;
389     }
390   else
391     {
392       pool_get (rt->pool, reass);
393       clib_memset (reass, 0, sizeof (*reass));
394       reass->id =
395         ((u64) os_get_thread_index () * 1000000000) + rt->id_counter;
396       ++rt->id_counter;
397       reass->first_bi = ~0;
398       reass->last_packet_octet = ~0;
399       reass->data_len = 0;
400       ++rt->reass_n;
401     }
402
403   reass->key.as_u64[0] = kv.key[0] = k->as_u64[0];
404   reass->key.as_u64[1] = kv.key[1] = k->as_u64[1];
405   reass->key.as_u64[2] = kv.key[2] = k->as_u64[2];
406   reass->key.as_u64[3] = kv.key[3] = k->as_u64[3];
407   reass->key.as_u64[4] = kv.key[4] = k->as_u64[4];
408   reass->key.as_u64[5] = kv.key[5] = k->as_u64[5];
409   kv.value = reass - rt->pool;
410   reass->last_heard = now;
411
412   if (clib_bihash_add_del_48_8 (&rm->hash, &kv, 1))
413     {
414       ip6_reass_free (rm, rt, reass);
415       reass = NULL;
416     }
417
418   return reass;
419 }
420
421 always_inline ip6_reass_rc_t
422 ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
423                     ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
424                     ip6_reass_t * reass, u32 * bi0, u32 * next0,
425                     u32 * error0, u32 ** vec_drop_compress, bool is_feature)
426 {
427   *bi0 = reass->first_bi;
428   *error0 = IP6_ERROR_NONE;
429   ip6_frag_hdr_t *frag_hdr;
430   vlib_buffer_t *last_b = NULL;
431   u32 sub_chain_bi = reass->first_bi;
432   u32 total_length = 0;
433   u32 buf_cnt = 0;
434   u32 dropped_cnt = 0;
435   do
436     {
437       u32 tmp_bi = sub_chain_bi;
438       vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
439       vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
440       if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
441           !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
442         {
443           return IP6_REASS_RC_INTERNAL_ERROR;
444         }
445
446       u32 data_len = ip6_reass_buffer_get_data_len (tmp);
447       u32 trim_front = vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
448         sizeof (*frag_hdr) + ip6_reass_buffer_get_data_offset (tmp);
449       u32 trim_end =
450         vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
451       if (tmp_bi == reass->first_bi)
452         {
453           /* first buffer - keep ip6 header */
454           if (0 != ip6_reass_buffer_get_data_offset (tmp))
455             {
456               return IP6_REASS_RC_INTERNAL_ERROR;
457             }
458           trim_front = 0;
459           trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
460             (vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
461              sizeof (*frag_hdr));
462           if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
463             {
464               return IP6_REASS_RC_INTERNAL_ERROR;
465             }
466         }
467       u32 keep_data =
468         vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
469       while (1)
470         {
471           ++buf_cnt;
472           if (trim_front)
473             {
474               if (trim_front > tmp->current_length)
475                 {
476                   /* drop whole buffer */
477                   vec_add1 (*vec_drop_compress, tmp_bi);
478                   ++dropped_cnt;
479                   trim_front -= tmp->current_length;
480                   if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
481                     {
482                       return IP6_REASS_RC_INTERNAL_ERROR;
483                     }
484                   tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
485                   tmp_bi = tmp->next_buffer;
486                   tmp = vlib_get_buffer (vm, tmp_bi);
487                   continue;
488                 }
489               else
490                 {
491                   vlib_buffer_advance (tmp, trim_front);
492                   trim_front = 0;
493                 }
494             }
495           if (keep_data)
496             {
497               if (last_b)
498                 {
499                   last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
500                   last_b->next_buffer = tmp_bi;
501                 }
502               last_b = tmp;
503               if (keep_data <= tmp->current_length)
504                 {
505                   tmp->current_length = keep_data;
506                   keep_data = 0;
507                 }
508               else
509                 {
510                   keep_data -= tmp->current_length;
511                   if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
512                     {
513                       return IP6_REASS_RC_INTERNAL_ERROR;
514                     }
515                 }
516               total_length += tmp->current_length;
517             }
518           else
519             {
520               vec_add1 (*vec_drop_compress, tmp_bi);
521               if (reass->first_bi == tmp_bi)
522                 {
523                   return IP6_REASS_RC_INTERNAL_ERROR;
524                 }
525               ++dropped_cnt;
526             }
527           if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
528             {
529               tmp_bi = tmp->next_buffer;
530               tmp = vlib_get_buffer (vm, tmp->next_buffer);
531             }
532           else
533             {
534               break;
535             }
536         }
537       sub_chain_bi =
538         vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
539         reass.next_range_bi;
540     }
541   while (~0 != sub_chain_bi);
542
543   if (!last_b)
544     {
545       return IP6_REASS_RC_INTERNAL_ERROR;
546     }
547   last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
548   vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
549   if (total_length < first_b->current_length)
550     {
551       return IP6_REASS_RC_INTERNAL_ERROR;
552     }
553   total_length -= first_b->current_length;
554   first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
555   first_b->total_length_not_including_first_buffer = total_length;
556   // drop fragment header
557   vnet_buffer_opaque_t *first_b_vnb = vnet_buffer (first_b);
558   ip6_header_t *ip = vlib_buffer_get_current (first_b);
559   u16 ip6_frag_hdr_offset = first_b_vnb->ip.reass.ip6_frag_hdr_offset;
560   ip6_ext_header_t *prev_hdr;
561   ip6_ext_header_find_t (ip, prev_hdr, frag_hdr,
562                          IP_PROTOCOL_IPV6_FRAGMENTATION);
563   if (prev_hdr)
564     {
565       prev_hdr->next_hdr = frag_hdr->next_hdr;
566     }
567   else
568     {
569       ip->protocol = frag_hdr->next_hdr;
570     }
571   if (!((u8 *) frag_hdr - (u8 *) ip == ip6_frag_hdr_offset))
572     {
573       return IP6_REASS_RC_INTERNAL_ERROR;
574     }
575   memmove (frag_hdr, (u8 *) frag_hdr + sizeof (*frag_hdr),
576            first_b->current_length - ip6_frag_hdr_offset -
577            sizeof (ip6_frag_hdr_t));
578   first_b->current_length -= sizeof (*frag_hdr);
579   ip->payload_length =
580     clib_host_to_net_u16 (total_length + first_b->current_length -
581                           sizeof (*ip));
582   vlib_buffer_chain_compress (vm, first_b, vec_drop_compress);
583   if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
584     {
585       ip6_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
586 #if 0
587       // following code does a hexdump of packet fragments to stdout ...
588       do
589         {
590           u32 bi = reass->first_bi;
591           u8 *s = NULL;
592           while (~0 != bi)
593             {
594               vlib_buffer_t *b = vlib_get_buffer (vm, bi);
595               s = format (s, "%u: %U\n", bi, format_hexdump,
596                           vlib_buffer_get_current (b), b->current_length);
597               if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
598                 {
599                   bi = b->next_buffer;
600                 }
601               else
602                 {
603                   break;
604                 }
605             }
606           printf ("%.*s\n", vec_len (s), s);
607           fflush (stdout);
608           vec_free (s);
609         }
610       while (0);
611 #endif
612     }
613   if (is_feature)
614     {
615       *next0 = IP6_REASSEMBLY_NEXT_INPUT;
616     }
617   else
618     {
619       *next0 = reass->next_index;
620     }
621   vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
622   ip6_reass_free (rm, rt, reass);
623   reass = NULL;
624   return IP6_REASS_RC_OK;
625 }
626
627 always_inline u32
628 ip6_reass_get_buffer_chain_length (vlib_main_t * vm, vlib_buffer_t * b)
629 {
630   u32 len = 0;
631   while (b)
632     {
633       ++len;
634       if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
635         {
636           b = vlib_get_buffer (vm, b->next_buffer);
637         }
638       else
639         {
640           break;
641         }
642     }
643   return len;
644 }
645
646 always_inline void
647 ip6_reass_insert_range_in_chain (vlib_main_t * vm, ip6_reass_main_t * rm,
648                                  ip6_reass_per_thread_t * rt,
649                                  ip6_reass_t * reass, u32 prev_range_bi,
650                                  u32 new_next_bi)
651 {
652
653   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
654   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
655   if (~0 != prev_range_bi)
656     {
657       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
658       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
659       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
660       prev_vnb->ip.reass.next_range_bi = new_next_bi;
661     }
662   else
663     {
664       if (~0 != reass->first_bi)
665         {
666           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
667         }
668       reass->first_bi = new_next_bi;
669     }
670   reass->data_len += ip6_reass_buffer_get_data_len (new_next_b);
671   rt->buffers_n += ip6_reass_get_buffer_chain_length (vm, new_next_b);
672 }
673
674 always_inline ip6_reass_rc_t
675 ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
676                   ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
677                   ip6_reass_t * reass, u32 * bi0, u32 * next0,
678                   u32 * error0, ip6_frag_hdr_t * frag_hdr,
679                   u32 ** vec_drop_overlap, u32 ** vec_drop_compress,
680                   bool is_feature)
681 {
682   int consumed = 0;
683   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
684   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
685   reass->next_index = fvnb->ip.reass.next_index;        // store next_index before it's overwritten
686   fvnb->ip.reass.ip6_frag_hdr_offset =
687     (u8 *) frag_hdr - (u8 *) vlib_buffer_get_current (fb);
688   ip6_header_t *fip = vlib_buffer_get_current (fb);
689   if (fb->current_length < sizeof (*fip) ||
690       fvnb->ip.reass.ip6_frag_hdr_offset == 0 ||
691       fvnb->ip.reass.ip6_frag_hdr_offset >= fb->current_length)
692     {
693       return IP6_REASS_RC_INTERNAL_ERROR;
694     }
695
696   u32 fragment_first = fvnb->ip.reass.fragment_first =
697     ip6_frag_hdr_offset_bytes (frag_hdr);
698   u32 fragment_length =
699     vlib_buffer_length_in_chain (vm, fb) -
700     (fvnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
701   u32 fragment_last = fvnb->ip.reass.fragment_last =
702     fragment_first + fragment_length - 1;
703   int more_fragments = ip6_frag_hdr_more (frag_hdr);
704   u32 candidate_range_bi = reass->first_bi;
705   u32 prev_range_bi = ~0;
706   fvnb->ip.reass.range_first = fragment_first;
707   fvnb->ip.reass.range_last = fragment_last;
708   fvnb->ip.reass.next_range_bi = ~0;
709   if (!more_fragments)
710     {
711       reass->last_packet_octet = fragment_last;
712     }
713   if (~0 == reass->first_bi)
714     {
715       // starting a new reassembly
716       ip6_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
717                                        *bi0);
718       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
719         {
720           ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
721         }
722       reass->min_fragment_length = clib_net_to_host_u16 (fip->payload_length);
723       *bi0 = ~0;
724       return IP6_REASS_RC_OK;
725     }
726   reass->min_fragment_length =
727     clib_min (clib_net_to_host_u16 (fip->payload_length),
728               fvnb->ip.reass.estimated_mtu);
729   while (~0 != candidate_range_bi)
730     {
731       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
732       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
733       if (fragment_first > candidate_vnb->ip.reass.range_last)
734         {
735           // this fragments starts after candidate range
736           prev_range_bi = candidate_range_bi;
737           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
738           if (candidate_vnb->ip.reass.range_last < fragment_last &&
739               ~0 == candidate_range_bi)
740             {
741               // special case - this fragment falls beyond all known ranges
742               ip6_reass_insert_range_in_chain (vm, rm, rt, reass,
743                                                prev_range_bi, *bi0);
744               consumed = 1;
745               break;
746             }
747           continue;
748         }
749       if (fragment_last < candidate_vnb->ip.reass.range_first)
750         {
751           // this fragment ends before candidate range without any overlap
752           ip6_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
753                                            *bi0);
754           consumed = 1;
755         }
756       else if (fragment_first == candidate_vnb->ip.reass.range_first &&
757                fragment_last == candidate_vnb->ip.reass.range_last)
758         {
759           // duplicate fragment - ignore
760         }
761       else
762         {
763           // overlapping fragment - not allowed by RFC 8200
764           ip6_reass_drop_all (vm, rm, reass, vec_drop_overlap);
765           ip6_reass_free (rm, rt, reass);
766           if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
767             {
768               ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_OVERLAP,
769                                    0);
770             }
771           *next0 = IP6_REASSEMBLY_NEXT_DROP;
772           *error0 = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT;
773         }
774       break;
775     }
776   if (consumed)
777     {
778       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
779         {
780           ip6_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
781         }
782     }
783   if (~0 != reass->last_packet_octet &&
784       reass->data_len == reass->last_packet_octet + 1)
785     {
786       return ip6_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
787                                  vec_drop_compress, is_feature);
788     }
789   else
790     {
791       if (consumed)
792         {
793           *bi0 = ~0;
794         }
795       else
796         {
797           *next0 = IP6_REASSEMBLY_NEXT_DROP;
798           ;
799           *error0 = IP6_ERROR_REASS_DUPLICATE_FRAGMENT;
800         }
801     }
802   return IP6_REASS_RC_OK;
803 }
804
805 always_inline bool
806 ip6_reass_verify_upper_layer_present (vlib_node_runtime_t * node,
807                                       vlib_buffer_t * b,
808                                       ip6_frag_hdr_t * frag_hdr)
809 {
810   ip6_ext_header_t *tmp = (ip6_ext_header_t *) frag_hdr;
811   while (ip6_ext_hdr (tmp->next_hdr))
812     {
813       tmp = ip6_ext_next_header (tmp);
814     }
815   if (IP_PROTOCOL_IP6_NONXT == tmp->next_hdr)
816     {
817       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
818                                    ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain,
819                                    0);
820       b->error = node->errors[IP6_ERROR_REASS_MISSING_UPPER];
821
822       return false;
823     }
824   return true;
825 }
826
827 always_inline bool
828 ip6_reass_verify_fragment_multiple_8 (vlib_main_t * vm,
829                                       vlib_node_runtime_t * node,
830                                       vlib_buffer_t * b,
831                                       ip6_frag_hdr_t * frag_hdr)
832 {
833   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
834   ip6_header_t *ip = vlib_buffer_get_current (b);
835   int more_fragments = ip6_frag_hdr_more (frag_hdr);
836   u32 fragment_length =
837     vlib_buffer_length_in_chain (vm, b) -
838     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
839   if (more_fragments && 0 != fragment_length % 8)
840     {
841       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
842                                    ICMP6_parameter_problem_erroneous_header_field,
843                                    (u8 *) & ip->payload_length - (u8 *) ip);
844       return false;
845     }
846   return true;
847 }
848
849 always_inline bool
850 ip6_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
851                                      vlib_node_runtime_t * node,
852                                      vlib_buffer_t * b,
853                                      ip6_frag_hdr_t * frag_hdr)
854 {
855   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
856   u32 fragment_first = ip6_frag_hdr_offset_bytes (frag_hdr);
857   u32 fragment_length =
858     vlib_buffer_length_in_chain (vm, b) -
859     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
860   if (fragment_first + fragment_length > 65535)
861     {
862       ip6_header_t *ip0 = vlib_buffer_get_current (b);
863       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
864                                    ICMP6_parameter_problem_erroneous_header_field,
865                                    (u8 *) & frag_hdr->fragment_offset_and_more
866                                    - (u8 *) ip0);
867       return false;
868     }
869   return true;
870 }
871
872 always_inline uword
873 ip6_reassembly_inline (vlib_main_t * vm,
874                        vlib_node_runtime_t * node,
875                        vlib_frame_t * frame, bool is_feature)
876 {
877   u32 *from = vlib_frame_vector_args (frame);
878   u32 n_left_from, n_left_to_next, *to_next, next_index;
879   ip6_reass_main_t *rm = &ip6_reass_main;
880   ip6_reass_per_thread_t *rt = &rm->per_thread_data[os_get_thread_index ()];
881   clib_spinlock_lock (&rt->lock);
882
883   n_left_from = frame->n_vectors;
884   next_index = node->cached_next_index;
885   static u32 *vec_timeout = NULL;       // indexes of buffers which timed out
886   static u32 *vec_drop_overlap = NULL;  // indexes of buffers dropped due to overlap
887   static u32 *vec_drop_internal_error = NULL;   // indexes of buffers dropped due to internal errors
888   static u32 *vec_drop_compress = NULL; // indexes of buffers dropped due to buffer compression
889   while (n_left_from > 0 || vec_len (vec_timeout) > 0
890          || vec_len (vec_drop_overlap) > 0 || vec_len (vec_drop_compress) > 0
891          || vec_len (vec_drop_internal_error) > 0)
892     {
893       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
894
895       while (vec_len (vec_timeout) > 0 && n_left_to_next > 0)
896         {
897           u32 bi = vec_pop (vec_timeout);
898           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
899           b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
900           to_next[0] = bi;
901           to_next += 1;
902           n_left_to_next -= 1;
903           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
904                                            n_left_to_next, bi,
905                                            IP6_REASSEMBLY_NEXT_DROP);
906           --rt->buffers_n;
907         }
908
909       while (vec_len (vec_drop_overlap) > 0 && n_left_to_next > 0)
910         {
911           u32 bi = vec_pop (vec_drop_overlap);
912           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
913           b->error = node->errors[IP6_ERROR_REASS_OVERLAPPING_FRAGMENT];
914           to_next[0] = bi;
915           to_next += 1;
916           n_left_to_next -= 1;
917           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
918                                            n_left_to_next, bi,
919                                            IP6_REASSEMBLY_NEXT_DROP);
920           --rt->buffers_n;
921         }
922
923       while (vec_len (vec_drop_compress) > 0 && n_left_to_next > 0)
924         {
925           u32 bi = vec_pop (vec_drop_compress);
926           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
927           b->error = node->errors[IP6_ERROR_NONE];
928           to_next[0] = bi;
929           to_next += 1;
930           n_left_to_next -= 1;
931           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
932                                            n_left_to_next, bi,
933                                            IP6_REASSEMBLY_NEXT_DROP);
934           --rt->buffers_n;
935         }
936       while (vec_len (vec_drop_internal_error) > 0 && n_left_to_next > 0)
937         {
938           u32 bi = vec_pop (vec_drop_internal_error);
939           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
940           b->error = node->errors[IP6_ERROR_REASS_INTERNAL_ERROR];
941           to_next[0] = bi;
942           to_next += 1;
943           n_left_to_next -= 1;
944           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
945                                            n_left_to_next, bi,
946                                            IP6_REASSEMBLY_NEXT_DROP);
947           --rt->buffers_n;
948         }
949
950       while (n_left_from > 0 && n_left_to_next > 0)
951         {
952           u32 bi0;
953           vlib_buffer_t *b0;
954           u32 next0;
955           u32 error0 = IP6_ERROR_NONE;
956           u32 icmp_bi = ~0;
957
958           bi0 = from[0];
959           b0 = vlib_get_buffer (vm, bi0);
960
961           ip6_header_t *ip0 = vlib_buffer_get_current (b0);
962           ip6_frag_hdr_t *frag_hdr = NULL;
963           ip6_ext_header_t *prev_hdr;
964           if (ip6_ext_hdr (ip0->protocol))
965             {
966               ip6_ext_header_find_t (ip0, prev_hdr, frag_hdr,
967                                      IP_PROTOCOL_IPV6_FRAGMENTATION);
968             }
969           if (!frag_hdr)
970             {
971               // this is a regular packet - no fragmentation
972               next0 = IP6_REASSEMBLY_NEXT_INPUT;
973               goto skip_reass;
974             }
975           if (0 == ip6_frag_hdr_offset (frag_hdr))
976             {
977               // first fragment - verify upper-layer is present
978               if (!ip6_reass_verify_upper_layer_present (node, b0, frag_hdr))
979                 {
980                   next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
981                   goto skip_reass;
982                 }
983             }
984           if (!ip6_reass_verify_fragment_multiple_8 (vm, node, b0, frag_hdr)
985               || !ip6_reass_verify_packet_size_lt_64k (vm, node, b0,
986                                                        frag_hdr))
987             {
988               next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
989               goto skip_reass;
990             }
991           vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset =
992             (u8 *) frag_hdr - (u8 *) ip0;
993
994           ip6_reass_key_t k;
995           k.as_u64[0] = ip0->src_address.as_u64[0];
996           k.as_u64[1] = ip0->src_address.as_u64[1];
997           k.as_u64[2] = ip0->dst_address.as_u64[0];
998           k.as_u64[3] = ip0->dst_address.as_u64[1];
999           k.as_u64[4] =
1000             (u64) vnet_buffer (b0)->
1001             sw_if_index[VLIB_RX] << 32 | frag_hdr->identification;
1002           k.as_u64[5] = ip0->protocol;
1003           ip6_reass_t *reass =
1004             ip6_reass_find_or_create (vm, node, rm, rt, &k, &icmp_bi,
1005                                       &vec_timeout);
1006
1007           if (reass)
1008             {
1009               switch (ip6_reass_update (vm, node, rm, rt, reass, &bi0, &next0,
1010                                         &error0, frag_hdr, &vec_drop_overlap,
1011                                         &vec_drop_compress, is_feature))
1012                 {
1013                 case IP6_REASS_RC_OK:
1014                   /* nothing to do here */
1015                   break;
1016                 case IP6_REASS_RC_INTERNAL_ERROR:
1017                   /* drop everything and start with a clean slate */
1018                   ip6_reass_drop_all (vm, rm, reass,
1019                                       &vec_drop_internal_error);
1020                   ip6_reass_free (rm, rt, reass);
1021                   goto next_packet;
1022                   break;
1023                 }
1024             }
1025           else
1026             {
1027               next0 = IP6_REASSEMBLY_NEXT_DROP;
1028               error0 = IP6_ERROR_REASS_LIMIT_REACHED;
1029             }
1030
1031           b0->error = node->errors[error0];
1032
1033           if (~0 != bi0)
1034             {
1035             skip_reass:
1036               to_next[0] = bi0;
1037               to_next += 1;
1038               n_left_to_next -= 1;
1039               if (is_feature && IP6_ERROR_NONE == error0)
1040                 {
1041                   b0 = vlib_get_buffer (vm, bi0);
1042                   vnet_feature_next (&next0, b0);
1043                 }
1044               vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1045                                                n_left_to_next, bi0, next0);
1046             }
1047
1048           if (~0 != icmp_bi)
1049             {
1050               next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
1051               to_next[0] = icmp_bi;
1052               to_next += 1;
1053               n_left_to_next -= 1;
1054               vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1055                                                n_left_to_next, icmp_bi,
1056                                                next0);
1057             }
1058         next_packet:
1059           from += 1;
1060           n_left_from -= 1;
1061         }
1062
1063       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1064     }
1065
1066   clib_spinlock_unlock (&rt->lock);
1067   return frame->n_vectors;
1068 }
1069
1070 static char *ip6_reassembly_error_strings[] = {
1071 #define _(sym, string) string,
1072   foreach_ip6_error
1073 #undef _
1074 };
1075
1076 static uword
1077 ip6_reassembly (vlib_main_t * vm, vlib_node_runtime_t * node,
1078                 vlib_frame_t * frame)
1079 {
1080   return ip6_reassembly_inline (vm, node, frame, false /* is_feature */ );
1081 }
1082
1083 /* *INDENT-OFF* */
1084 VLIB_REGISTER_NODE (ip6_reass_node, static) = {
1085     .function = ip6_reassembly,
1086     .name = "ip6-reassembly",
1087     .vector_size = sizeof (u32),
1088     .format_trace = format_ip6_reass_trace,
1089     .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
1090     .error_strings = ip6_reassembly_error_strings,
1091     .n_next_nodes = IP6_REASSEMBLY_N_NEXT,
1092     .next_nodes =
1093         {
1094                 [IP6_REASSEMBLY_NEXT_INPUT] = "ip6-input",
1095                 [IP6_REASSEMBLY_NEXT_DROP] = "ip6-drop",
1096                 [IP6_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
1097         },
1098 };
1099 /* *INDENT-ON* */
1100
1101 VLIB_NODE_FUNCTION_MULTIARCH (ip6_reass_node, ip6_reassembly);
1102
1103 static uword
1104 ip6_reassembly_feature (vlib_main_t * vm,
1105                         vlib_node_runtime_t * node, vlib_frame_t * frame)
1106 {
1107   return ip6_reassembly_inline (vm, node, frame, true /* is_feature */ );
1108 }
1109
1110 /* *INDENT-OFF* */
1111 VLIB_REGISTER_NODE (ip6_reass_node_feature, static) = {
1112     .function = ip6_reassembly_feature,
1113     .name = "ip6-reassembly-feature",
1114     .vector_size = sizeof (u32),
1115     .format_trace = format_ip6_reass_trace,
1116     .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
1117     .error_strings = ip6_reassembly_error_strings,
1118     .n_next_nodes = IP6_REASSEMBLY_N_NEXT,
1119     .next_nodes =
1120         {
1121                 [IP6_REASSEMBLY_NEXT_INPUT] = "ip6-input",
1122                 [IP6_REASSEMBLY_NEXT_DROP] = "ip6-drop",
1123                 [IP6_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
1124         },
1125 };
1126 /* *INDENT-ON* */
1127
1128 VLIB_NODE_FUNCTION_MULTIARCH (ip6_reass_node_feature, ip6_reassembly_feature);
1129
1130 /* *INDENT-OFF* */
1131 VNET_FEATURE_INIT (ip6_reassembly_feature, static) = {
1132     .arc_name = "ip6-unicast",
1133     .node_name = "ip6-reassembly-feature",
1134     .runs_before = VNET_FEATURES ("ip6-lookup"),
1135     .runs_after = 0,
1136 };
1137 /* *INDENT-ON* */
1138
1139 static u32
1140 ip6_reass_get_nbuckets ()
1141 {
1142   ip6_reass_main_t *rm = &ip6_reass_main;
1143   u32 nbuckets;
1144   u8 i;
1145
1146   nbuckets = (u32) (rm->max_reass_n / IP6_REASS_HT_LOAD_FACTOR);
1147
1148   for (i = 0; i < 31; i++)
1149     if ((1 << i) >= nbuckets)
1150       break;
1151   nbuckets = 1 << i;
1152
1153   return nbuckets;
1154 }
1155
/* Events understood by the "ip6-reassembly-expire-walk" process. */
typedef enum
{
  /* signalled by ip6_reass_set () once the new parameters are stored */
  IP6_EVENT_CONFIG_CHANGED = 1,
} ip6_reass_event_t;
1160
/* Context handed to ip6_rehash_cb () while entries are copied into a new
   hash table. */
typedef struct
{
  /* set to 1 by the callback when adding an entry to new_hash failed */
  int failure;
  /* table receiving the copied entries */
  clib_bihash_48_8_t *new_hash;
} ip6_rehash_cb_ctx;
1166
1167 static void
1168 ip6_rehash_cb (clib_bihash_kv_48_8_t * kv, void *_ctx)
1169 {
1170   ip6_rehash_cb_ctx *ctx = _ctx;
1171   if (clib_bihash_add_del_48_8 (ctx->new_hash, kv, 1))
1172     {
1173       ctx->failure = 1;
1174     }
1175 }
1176
1177 static void
1178 ip6_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1179                       u32 expire_walk_interval_ms)
1180 {
1181   ip6_reass_main.timeout_ms = timeout_ms;
1182   ip6_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1183   ip6_reass_main.max_reass_n = max_reassemblies;
1184   ip6_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1185 }
1186
1187 vnet_api_error_t
1188 ip6_reass_set (u32 timeout_ms, u32 max_reassemblies,
1189                u32 expire_walk_interval_ms)
1190 {
1191   u32 old_nbuckets = ip6_reass_get_nbuckets ();
1192   ip6_reass_set_params (timeout_ms, max_reassemblies,
1193                         expire_walk_interval_ms);
1194   vlib_process_signal_event (ip6_reass_main.vlib_main,
1195                              ip6_reass_main.ip6_reass_expire_node_idx,
1196                              IP6_EVENT_CONFIG_CHANGED, 0);
1197   u32 new_nbuckets = ip6_reass_get_nbuckets ();
1198   if (ip6_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
1199     {
1200       clib_bihash_48_8_t new_hash;
1201       clib_memset (&new_hash, 0, sizeof (new_hash));
1202       ip6_rehash_cb_ctx ctx;
1203       ctx.failure = 0;
1204       ctx.new_hash = &new_hash;
1205       clib_bihash_init_48_8 (&new_hash, "ip6-reass", new_nbuckets,
1206                              new_nbuckets * 1024);
1207       clib_bihash_foreach_key_value_pair_48_8 (&ip6_reass_main.hash,
1208                                                ip6_rehash_cb, &ctx);
1209       if (ctx.failure)
1210         {
1211           clib_bihash_free_48_8 (&new_hash);
1212           return -1;
1213         }
1214       else
1215         {
1216           clib_bihash_free_48_8 (&ip6_reass_main.hash);
1217           clib_memcpy_fast (&ip6_reass_main.hash, &new_hash,
1218                             sizeof (ip6_reass_main.hash));
1219         }
1220     }
1221   return 0;
1222 }
1223
1224 vnet_api_error_t
1225 ip6_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1226                u32 * expire_walk_interval_ms)
1227 {
1228   *timeout_ms = ip6_reass_main.timeout_ms;
1229   *max_reassemblies = ip6_reass_main.max_reass_n;
1230   *expire_walk_interval_ms = ip6_reass_main.expire_walk_interval_ms;
1231   return 0;
1232 }
1233
1234 static clib_error_t *
1235 ip6_reass_init_function (vlib_main_t * vm)
1236 {
1237   ip6_reass_main_t *rm = &ip6_reass_main;
1238   clib_error_t *error = 0;
1239   u32 nbuckets;
1240   vlib_node_t *node;
1241
1242   rm->vlib_main = vm;
1243   rm->vnet_main = vnet_get_main ();
1244
1245   vec_validate (rm->per_thread_data, vlib_num_workers ());
1246   ip6_reass_per_thread_t *rt;
1247   vec_foreach (rt, rm->per_thread_data)
1248   {
1249     clib_spinlock_init (&rt->lock);
1250     pool_alloc (rt->pool, rm->max_reass_n);
1251   }
1252
1253   node = vlib_get_node_by_name (vm, (u8 *) "ip6-reassembly-expire-walk");
1254   ASSERT (node);
1255   rm->ip6_reass_expire_node_idx = node->index;
1256
1257   ip6_reass_set_params (IP6_REASS_TIMEOUT_DEFAULT_MS,
1258                         IP6_REASS_MAX_REASSEMBLIES_DEFAULT,
1259                         IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1260
1261   nbuckets = ip6_reass_get_nbuckets ();
1262   clib_bihash_init_48_8 (&rm->hash, "ip6-reass", nbuckets, nbuckets * 1024);
1263
1264   node = vlib_get_node_by_name (vm, (u8 *) "ip6-drop");
1265   ASSERT (node);
1266   rm->ip6_drop_idx = node->index;
1267   node = vlib_get_node_by_name (vm, (u8 *) "ip6-icmp-error");
1268   ASSERT (node);
1269   rm->ip6_icmp_error_idx = node->index;
1270
1271   if ((error = vlib_call_init_function (vm, ip_main_init)))
1272     return error;
1273   ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
1274                          ip6_reass_node.index);
1275
1276   return error;
1277 }
1278
1279 VLIB_INIT_FUNCTION (ip6_reass_init_function);
1280
1281 static uword
1282 ip6_reass_walk_expired (vlib_main_t * vm,
1283                         vlib_node_runtime_t * node, vlib_frame_t * f)
1284 {
1285   ip6_reass_main_t *rm = &ip6_reass_main;
1286   uword event_type, *event_data = 0;
1287
1288   while (true)
1289     {
1290       vlib_process_wait_for_event_or_clock (vm,
1291                                             (f64) rm->expire_walk_interval_ms
1292                                             / (f64) MSEC_PER_SEC);
1293       event_type = vlib_process_get_events (vm, &event_data);
1294
1295       switch (event_type)
1296         {
1297         case ~0:                /* no events => timeout */
1298           /* nothing to do here */
1299           break;
1300         case IP6_EVENT_CONFIG_CHANGED:
1301           break;
1302         default:
1303           clib_warning ("BUG: event type 0x%wx", event_type);
1304           break;
1305         }
1306       f64 now = vlib_time_now (vm);
1307
1308       ip6_reass_t *reass;
1309       u32 *vec_timeout = NULL;
1310       int *pool_indexes_to_free = NULL;
1311
1312       uword thread_index = 0;
1313       int index;
1314       const uword nthreads = vlib_num_workers () + 1;
1315       u32 *vec_icmp_bi = NULL;
1316       for (thread_index = 0; thread_index < nthreads; ++thread_index)
1317         {
1318           ip6_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1319           clib_spinlock_lock (&rt->lock);
1320
1321           vec_reset_length (pool_indexes_to_free);
1322           /* *INDENT-OFF* */
1323           pool_foreach_index (index, rt->pool, ({
1324                                 reass = pool_elt_at_index (rt->pool, index);
1325                                 if (now > reass->last_heard + rm->timeout)
1326                                   {
1327                                     vec_add1 (pool_indexes_to_free, index);
1328                                   }
1329                               }));
1330           /* *INDENT-ON* */
1331           int *i;
1332           /* *INDENT-OFF* */
1333           vec_foreach (i, pool_indexes_to_free)
1334           {
1335             ip6_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
1336             u32 icmp_bi = ~0;
1337             u32 before = vec_len (vec_timeout);
1338             vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi);
1339             if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
1340               {
1341                 if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
1342                                         b->trace_index))
1343                   {
1344                     /* the trace is gone, don't trace this buffer anymore */
1345                     b->flags &= ~VLIB_BUFFER_IS_TRACED;
1346                   }
1347               }
1348             ip6_reass_on_timeout (vm, node, rm, reass, &icmp_bi, &vec_timeout);
1349             u32 after = vec_len (vec_timeout);
1350             rt->buffers_n -= (after - before);
1351             if (~0 != icmp_bi)
1352               {
1353                 vec_add1 (vec_icmp_bi, icmp_bi);
1354                 --rt->buffers_n;
1355               }
1356             ip6_reass_free (rm, rt, reass);
1357           }
1358           /* *INDENT-ON* */
1359
1360           clib_spinlock_unlock (&rt->lock);
1361         }
1362
1363       while (vec_len (vec_timeout) > 0)
1364         {
1365           vlib_frame_t *f = vlib_get_frame_to_node (vm, rm->ip6_drop_idx);
1366           u32 *to_next = vlib_frame_vector_args (f);
1367           u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors;
1368           int trace_frame = 0;
1369           while (vec_len (vec_timeout) > 0 && n_left_to_next > 0)
1370             {
1371               u32 bi = vec_pop (vec_timeout);
1372               vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1373               if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
1374                 {
1375                   if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
1376                                           b->trace_index))
1377                     {
1378                       /* the trace is gone, don't trace this buffer anymore */
1379                       b->flags &= ~VLIB_BUFFER_IS_TRACED;
1380                     }
1381                   else
1382                     {
1383                       trace_frame = 1;
1384                     }
1385                 }
1386               b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
1387               to_next[0] = bi;
1388               ++f->n_vectors;
1389               to_next += 1;
1390               n_left_to_next -= 1;
1391             }
1392           f->frame_flags |= (trace_frame * VLIB_FRAME_TRACE);
1393           vlib_put_frame_to_node (vm, rm->ip6_drop_idx, f);
1394         }
1395
1396       while (vec_len (vec_icmp_bi) > 0)
1397         {
1398           vlib_frame_t *f =
1399             vlib_get_frame_to_node (vm, rm->ip6_icmp_error_idx);
1400           u32 *to_next = vlib_frame_vector_args (f);
1401           u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors;
1402           int trace_frame = 0;
1403           while (vec_len (vec_icmp_bi) > 0 && n_left_to_next > 0)
1404             {
1405               u32 bi = vec_pop (vec_icmp_bi);
1406               vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1407               if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
1408                 {
1409                   if (pool_is_free_index (vm->trace_main.trace_buffer_pool,
1410                                           b->trace_index))
1411                     {
1412                       /* the trace is gone, don't trace this buffer anymore */
1413                       b->flags &= ~VLIB_BUFFER_IS_TRACED;
1414                     }
1415                   else
1416                     {
1417                       trace_frame = 1;
1418                     }
1419                 }
1420               b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
1421               to_next[0] = bi;
1422               ++f->n_vectors;
1423               to_next += 1;
1424               n_left_to_next -= 1;
1425             }
1426           f->frame_flags |= (trace_frame * VLIB_FRAME_TRACE);
1427           vlib_put_frame_to_node (vm, rm->ip6_icmp_error_idx, f);
1428         }
1429
1430       vec_free (pool_indexes_to_free);
1431       vec_free (vec_timeout);
1432       vec_free (vec_icmp_bi);
1433       if (event_data)
1434         {
1435           _vec_len (event_data) = 0;
1436         }
1437     }
1438
1439   return 0;
1440 }
1441
1442 static vlib_node_registration_t ip6_reass_expire_node;
1443
1444 /* *INDENT-OFF* */
1445 VLIB_REGISTER_NODE (ip6_reass_expire_node, static) = {
1446     .function = ip6_reass_walk_expired,
1447     .format_trace = format_ip6_reass_trace,
1448     .type = VLIB_NODE_TYPE_PROCESS,
1449     .name = "ip6-reassembly-expire-walk",
1450
1451     .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
1452     .error_strings = ip6_reassembly_error_strings,
1453
1454 };
1455 /* *INDENT-ON* */
1456
1457 static u8 *
1458 format_ip6_reass_key (u8 * s, va_list * args)
1459 {
1460   ip6_reass_key_t *key = va_arg (*args, ip6_reass_key_t *);
1461   s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1462               key->xx_id, format_ip6_address, &key->src, format_ip6_address,
1463               &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1464   return s;
1465 }
1466
1467 static u8 *
1468 format_ip6_reass (u8 * s, va_list * args)
1469 {
1470   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1471   ip6_reass_t *reass = va_arg (*args, ip6_reass_t *);
1472
1473   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1474               "last_packet_octet: %u, trace_op_counter: %u\n",
1475               reass->id, format_ip6_reass_key, &reass->key, reass->first_bi,
1476               reass->data_len, reass->last_packet_octet,
1477               reass->trace_op_counter);
1478   u32 bi = reass->first_bi;
1479   u32 counter = 0;
1480   while (~0 != bi)
1481     {
1482       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1483       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1484       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1485                   "fragment[%u, %u]\n",
1486                   counter, vnb->ip.reass.range_first,
1487                   vnb->ip.reass.range_last, bi,
1488                   ip6_reass_buffer_get_data_offset (b),
1489                   ip6_reass_buffer_get_data_len (b),
1490                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1491       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1492         {
1493           bi = b->next_buffer;
1494         }
1495       else
1496         {
1497           bi = ~0;
1498         }
1499     }
1500   return s;
1501 }
1502
1503 static clib_error_t *
1504 show_ip6_reass (vlib_main_t * vm, unformat_input_t * input,
1505                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1506 {
1507   ip6_reass_main_t *rm = &ip6_reass_main;
1508
1509   vlib_cli_output (vm, "---------------------");
1510   vlib_cli_output (vm, "IP6 reassembly status");
1511   vlib_cli_output (vm, "---------------------");
1512   bool details = false;
1513   if (unformat (input, "details"))
1514     {
1515       details = true;
1516     }
1517
1518   u32 sum_reass_n = 0;
1519   u64 sum_buffers_n = 0;
1520   ip6_reass_t *reass;
1521   uword thread_index;
1522   const uword nthreads = vlib_num_workers () + 1;
1523   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1524     {
1525       ip6_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1526       clib_spinlock_lock (&rt->lock);
1527       if (details)
1528         {
1529           /* *INDENT-OFF* */
1530           pool_foreach (reass, rt->pool, {
1531             vlib_cli_output (vm, "%U", format_ip6_reass, vm, reass);
1532           });
1533           /* *INDENT-ON* */
1534         }
1535       sum_reass_n += rt->reass_n;
1536       sum_buffers_n += rt->buffers_n;
1537       clib_spinlock_unlock (&rt->lock);
1538     }
1539   vlib_cli_output (vm, "---------------------");
1540   vlib_cli_output (vm, "Current IP6 reassemblies count: %lu\n",
1541                    (long unsigned) sum_reass_n);
1542   vlib_cli_output (vm, "Maximum configured concurrent IP6 reassemblies per "
1543                    "worker-thread: %lu\n", (long unsigned) rm->max_reass_n);
1544   vlib_cli_output (vm, "Buffers in use: %lu\n",
1545                    (long unsigned) sum_buffers_n);
1546   return 0;
1547 }
1548
1549 /* *INDENT-OFF* */
1550 VLIB_CLI_COMMAND (show_ip6_reassembly_cmd, static) = {
1551     .path = "show ip6-reassembly",
1552     .short_help = "show ip6-reassembly [details]",
1553     .function = show_ip6_reass,
1554 };
1555 /* *INDENT-ON* */
1556
1557 vnet_api_error_t
1558 ip6_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1559 {
1560   return vnet_feature_enable_disable ("ip6-unicast", "ip6-reassembly-feature",
1561                                       sw_if_index, enable_disable, 0, 0);
1562 }
1563
1564 /*
1565  * fd.io coding-style-patch-verification: ON
1566  *
1567  * Local Variables:
1568  * eval: (c-set-style "gnu")
1569  * End:
1570  */