ip: Use .api declared error counters
[vpp.git] / src / vnet / ip / reass / ip6_full_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv6 Full Reassembly.
19  *
20  * This file contains the source code for IPv6 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_48_8.h>
27 #include <vnet/ip/reass/ip6_full_reass.h>
28 #include <vnet/ip/ip6_inlines.h>
29
30 #define MSEC_PER_SEC 1000
31 #define IP6_FULL_REASS_TIMEOUT_DEFAULT_MS 200
/* As there are only 1024 reassembly contexts per thread, DDoS traffic or
 * bursts of genuine timeouts could quickly exhaust the context space,
 * leaving none available to perform reassembly */
35 #define IP6_FULL_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 50 // 50 ms default
36 #define IP6_FULL_REASS_MAX_REASSEMBLIES_DEFAULT 1024
37 #define IP6_FULL_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
38 #define IP6_FULL_REASS_HT_LOAD_FACTOR (0.75)
39
/* Internal return codes signalling the outcome of a reassembly operation. */
typedef enum
{
  IP6_FULL_REASS_RC_OK,
  IP6_FULL_REASS_RC_INTERNAL_ERROR,	// an internal invariant was violated
  IP6_FULL_REASS_RC_TOO_MANY_FRAGMENTS,
  IP6_FULL_REASS_RC_NO_BUF,		// buffer operation failed (e.g. linearize)
  IP6_FULL_REASS_RC_HANDOFF,		// fragment belongs to another thread
  IP6_FULL_REASS_RC_INVALID_FRAG_LEN,
  IP6_FULL_REASS_RC_OVERLAP,
} ip6_full_reass_rc_t;
50
/* 48-byte lookup key identifying one reassembly; the as_u64 view is used
 * verbatim as the bihash_48_8 key. */
typedef struct
{
  union
  {
    struct
    {
      ip6_address_t src;
      ip6_address_t dst;
      u32 xx_id;	// presumably fib/interface discriminator - TODO confirm
      u32 frag_id;	// presumably fragment header identification - TODO confirm
      u8 unused[7];
      u8 proto;
    };
    u64 as_u64[6];	// flat view used as the hash key
  };
} ip6_full_reass_key_t;
67
/* Value stored in the hash table for a reassembly context. */
typedef union
{
  struct
  {
    u32 reass_index;		   // index into the owner thread's context pool
    u32 memory_owner_thread_index; // thread whose pool holds the context
  };
  u64 as_u64;
} ip6_full_reass_val_t;
77
/* Convenience overlay combining key and value with the raw bihash kv type. */
typedef union
{
  struct
  {
    ip6_full_reass_key_t k;
    ip6_full_reass_val_t v;
  };
  clib_bihash_kv_48_8_t kv;
} ip6_full_reass_kv_t;
87
88
89 always_inline u32
90 ip6_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
91 {
92   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
93   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
94 }
95
96 always_inline u16
97 ip6_full_reass_buffer_get_data_len (vlib_buffer_t * b)
98 {
99   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
100   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
101     (vnb->ip.reass.fragment_first +
102      ip6_full_reass_buffer_get_data_offset (b)) + 1;
103 }
104
/* State of one in-progress reassembly. */
typedef struct
{
  // hash table key
  ip6_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context (~0 when empty)
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by custom apps (~0 if not set)
  u32 next_index;
  // error next index - used by custom apps (~0 if not set)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments for this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip6_full_reass_t;
135
/* Per-thread reassembly state: the context pool plus bookkeeping. */
typedef struct
{
  ip6_full_reass_t *pool;	// pool of reassembly contexts
  u32 reass_n;			// number of contexts currently in use
  u32 id_counter;		// source of per-thread unique reassembly ids
  // for pacing the main thread timeouts
  u32 last_id;
  clib_spinlock_t lock;
} ip6_full_reass_per_thread_t;
145
/* Global state of the IPv6 full reassembly feature. */
typedef struct
{
  // IPv6 config
  u32 timeout_ms;		// reassembly timeout, milliseconds
  f64 timeout;			// reassembly timeout, seconds (compared vs vlib_time_now)
  u32 expire_walk_interval_ms;	// period of the expiry walk
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv6 runtime
  clib_bihash_48_8_t hash;	// key -> (reass_index, owner thread)

  // per-thread data
  ip6_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  u32 ip6_icmp_error_idx;	// node index for icmp errors - TODO confirm usage
  u32 ip6_full_reass_expire_node_idx;	// node index of the expiry process

  /** Worker handoff frame-queue indices, one per node flavor */
  u32 fq_index;
  u32 fq_local_index;
  u32 fq_feature_index;
  u32 fq_custom_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // whether local fragmented packets are reassembled or not
  int is_local_reass_enabled;
} ip6_full_reass_main_t;
181
182 extern ip6_full_reass_main_t ip6_full_reass_main;
183
184 #ifndef CLIB_MARCH_VARIANT
185 ip6_full_reass_main_t ip6_full_reass_main;
186 #endif /* CLIB_MARCH_VARIANT */
187
/* Next-node dispositions for buffers leaving the reassembly nodes. */
typedef enum
{
  IP6_FULL_REASSEMBLY_NEXT_INPUT,	// successfully reassembled packet
  IP6_FULL_REASSEMBLY_NEXT_DROP,
  IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR,
  IP6_FULL_REASSEMBLY_NEXT_HANDOFF,	// hand off to the owning thread
  IP6_FULL_REASSEMBLY_N_NEXT,
} ip6_full_reass_next_t;
196
/* Flavor of reassembly node a packet is passing through. */
typedef enum
{
  NORMAL,
  FEATURE,
  CUSTOM
} ip6_full_reass_node_type_t;
203
/* Operations recorded in packet traces. */
typedef enum
{
  RANGE_NEW,			// new fragment range inserted
  RANGE_DISCARD,		// range dropped (e.g. on reassembly teardown)
  RANGE_OVERLAP,		// overlapping fragment encountered
  ICMP_ERROR_RT_EXCEEDED,	// reassembly time exceeded
  ICMP_ERROR_FL_TOO_BIG,	// fragment length > 65535
  ICMP_ERROR_FL_NOT_MULT_8,	// fragment length not a multiple of 8
  FINALIZE,			// reassembly completed
  HANDOFF,			// handed off to another thread
  PASSTHROUGH,			// not a fragment
} ip6_full_reass_trace_operation_e;
216
/* Snapshot of one fragment range, captured for tracing. */
typedef struct
{
  u16 range_first;	// first octet covered by this range
  u16 range_last;	// last octet covered by this range
  u32 range_bi;		// buffer index of the range head
  i32 data_offset;	// offset of usable data within the fragment
  u32 data_len;		// usable data length
  u32 first_bi;		// first buffer of the whole reassembly
} ip6_full_reass_range_trace_t;
226
/* One trace record emitted by the reassembly nodes. */
typedef struct
{
  ip6_full_reass_trace_operation_e action;
  u32 reass_id;		// ~0 when no reassembly context applies
  ip6_full_reass_range_trace_t trace_range;
  u32 op_id;		// per-reassembly operation sequence number
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
  u32 thread_id;
  u32 thread_id_to;	// destination thread, for handoff traces
  bool is_after_handoff;	// trace originated on a different thread
  ip6_header_t ip6_header;	// headers copied when is_after_handoff
  ip6_frag_hdr_t ip6_frag_header;
} ip6_full_reass_trace_t;
242
243 static void
244 ip6_full_reass_trace_details (vlib_main_t * vm, u32 bi,
245                               ip6_full_reass_range_trace_t * trace)
246 {
247   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
248   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
249   trace->range_first = vnb->ip.reass.range_first;
250   trace->range_last = vnb->ip.reass.range_last;
251   trace->data_offset = ip6_full_reass_buffer_get_data_offset (b);
252   trace->data_len = ip6_full_reass_buffer_get_data_len (b);
253   trace->range_bi = bi;
254 }
255
256 static u8 *
257 format_ip6_full_reass_range_trace (u8 * s, va_list * args)
258 {
259   ip6_full_reass_range_trace_t *trace =
260     va_arg (*args, ip6_full_reass_range_trace_t *);
261   s =
262     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
263             trace->range_last, trace->data_offset, trace->data_len,
264             trace->range_bi);
265   return s;
266 }
267
/* Format a reassembly trace record for "show trace" output.  When the
 * record belongs to a reassembly (reass_id != ~0), a summary line is
 * printed first; then the per-action detail line. */
static u8 *
format_ip6_full_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip6_full_reass_trace_t *t = va_arg (*args, ip6_full_reass_trace_t *);
  u32 indent = 0;
  if (~0 != t->reass_id)
    {
      if (t->is_after_handoff)
        {
          // after a handoff the copied headers are shown first
          s =
            format (s, "%U\n", format_ip6_header, &t->ip6_header,
                    sizeof (t->ip6_header));
          s =
            format (s, "  %U\n", format_ip6_frag_hdr, &t->ip6_frag_header,
                    sizeof (t->ip6_frag_header));
          indent = 2;
        }
      s =
        format (s, "%Ureass id: %u, op id: %u, ", format_white_space, indent,
                t->reass_id, t->op_id);
      indent = format_get_indent (s);
      s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
                  t->trace_range.first_bi, t->total_data_len,
                  t->fragment_first, t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip6_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip6_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlap %U", format_white_space, indent,
                  format_ip6_full_reass_range_trace, &t->trace_range);
      break;
    case ICMP_ERROR_FL_TOO_BIG:
      s = format (s, "\n%Uicmp-error - frag_len > 65535 %U",
                  format_white_space, indent,
                  format_ip6_full_reass_range_trace, &t->trace_range);
      break;
    case ICMP_ERROR_FL_NOT_MULT_8:
      s = format (s, "\n%Uicmp-error - frag_len mod 8 != 0 %U",
                  format_white_space, indent,
                  format_ip6_full_reass_range_trace, &t->trace_range);
      break;
    case ICMP_ERROR_RT_EXCEEDED:
      s = format (s, "\n%Uicmp-error - reassembly time exceeded",
                  format_white_space, indent);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
        format (s, "handoff from thread #%u to thread #%u", t->thread_id,
                t->thread_id_to);
      break;
    case PASSTHROUGH:
      s = format (s, "passthrough - not a fragment");
      break;
    }
  return s;
}
337
/* Record a trace entry for the given buffer and (optional) reassembly.
 * Detects buffers that were traced on a different thread (handoff) and
 * copies the IPv6 + fragment headers into the record in that case, since
 * the original trace context is on the other thread. */
static void
ip6_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip6_full_reass_t * reass, u32 bi,
                          ip6_frag_hdr_t * ip6_frag_header,
                          ip6_full_reass_trace_operation_e action,
                          u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  bool is_after_handoff = false;
  if (pool_is_free_index
      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  // a trace taken on another thread means the buffer was handed off
  if (vlib_buffer_get_trace_thread (b) != vm->thread_index)
    {
      is_after_handoff = true;
    }
  ip6_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->is_after_handoff = is_after_handoff;
  if (t->is_after_handoff)
    {
      // copy headers so the trace remains meaningful on this thread
      clib_memcpy (&t->ip6_header, vlib_buffer_get_current (b),
                   clib_min (sizeof (t->ip6_header), b->current_length));
      if (ip6_frag_header)
        {
          clib_memcpy (&t->ip6_frag_header, ip6_frag_header,
                       sizeof (t->ip6_frag_header));
        }
      else
        {
          clib_memset (&t->ip6_frag_header, 0, sizeof (t->ip6_frag_header));
        }
    }
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      ++reass->trace_op_counter;
    }
  else
    {
      // ~0 marks a trace record with no reassembly context
      t->reass_id = ~0;
    }
  t->action = action;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  ip6_full_reass_trace_details (vm, bi, &t->trace_range);
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip6_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
401
402 always_inline void
403 ip6_full_reass_free_ctx (ip6_full_reass_per_thread_t * rt,
404                          ip6_full_reass_t * reass)
405 {
406   pool_put (rt->pool, reass);
407   --rt->reass_n;
408 }
409
410 always_inline void
411 ip6_full_reass_free (ip6_full_reass_main_t * rm,
412                      ip6_full_reass_per_thread_t * rt,
413                      ip6_full_reass_t * reass)
414 {
415   clib_bihash_kv_48_8_t kv;
416   kv.key[0] = reass->key.as_u64[0];
417   kv.key[1] = reass->key.as_u64[1];
418   kv.key[2] = reass->key.as_u64[2];
419   kv.key[3] = reass->key.as_u64[3];
420   kv.key[4] = reass->key.as_u64[4];
421   kv.key[5] = reass->key.as_u64[5];
422   clib_bihash_add_del_48_8 (&rm->hash, &kv, 0);
423   ip6_full_reass_free_ctx (rt, reass);
424 }
425
426 /* n_left_to_next, and to_next are taken as input params, as this function
427  * could be called from a graphnode, where its managing local copy of these
428  * variables, and ignoring those and still trying to enqueue the buffers
429  * with local variables would cause either buffer leak or corruption */
430 always_inline void
431 ip6_full_reass_drop_all (vlib_main_t *vm, vlib_node_runtime_t *node,
432                          ip6_full_reass_t *reass, u32 *n_left_to_next,
433                          u32 **to_next)
434 {
435   u32 range_bi = reass->first_bi;
436   vlib_buffer_t *range_b;
437   vnet_buffer_opaque_t *range_vnb;
438   u32 *to_free = NULL;
439
440   while (~0 != range_bi)
441     {
442       range_b = vlib_get_buffer (vm, range_bi);
443       range_vnb = vnet_buffer (range_b);
444
445       if (~0 != range_bi)
446         {
447           vec_add1 (to_free, range_bi);
448         }
449       range_bi = range_vnb->ip.reass.next_range_bi;
450     }
451
452   /* send to next_error_index */
453   if (~0 != reass->error_next_index &&
454       reass->error_next_index < node->n_next_nodes)
455     {
456       u32 next_index;
457
458       next_index = reass->error_next_index;
459       u32 bi = ~0;
460
461       /* record number of packets sent to custom app */
462       vlib_node_increment_counter (vm, node->node_index,
463                                    IP6_ERROR_REASS_TO_CUSTOM_APP,
464                                    vec_len (to_free));
465
466       while (vec_len (to_free) > 0)
467         {
468           vlib_get_next_frame (vm, node, next_index, *to_next,
469                                (*n_left_to_next));
470
471           while (vec_len (to_free) > 0 && (*n_left_to_next) > 0)
472             {
473               bi = vec_pop (to_free);
474
475               if (~0 != bi)
476                 {
477                   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
478                   if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
479                     {
480                       ip6_full_reass_add_trace (vm, node, reass, bi, NULL,
481                                                 RANGE_DISCARD, ~0);
482                     }
483                   *to_next[0] = bi;
484                   (*to_next) += 1;
485                   (*n_left_to_next) -= 1;
486                 }
487             }
488           vlib_put_next_frame (vm, node, next_index, (*n_left_to_next));
489         }
490     }
491   else
492     {
493       vlib_buffer_free (vm, to_free, vec_len (to_free));
494     }
495   vec_free (to_free);
496 }
497
/* Walk the reassembly's range list, detaching any buffer-chain link whose
 * next_buffer would duplicate the range list's own next_range_bi, and make
 * sure the buffer *bi0 ends up owned by the reassembly: if it is found in
 * the range list, or after it is prepended as the new first range, *bi0 is
 * set to ~0 so the caller does not free/enqueue it separately. */
always_inline void
sanitize_reass_buffers_add_missing (vlib_main_t *vm, ip6_full_reass_t *reass,
                                    u32 *bi0)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;

  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      if (~0 != bi)
        {
          // *bi0 already belongs to this reassembly - caller must not touch it
          if (bi == *bi0)
            *bi0 = ~0;
          if (range_b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              // walk this range's buffer chain; cut the link that would
              // alias the next range to avoid double ownership
              u32 _bi = bi;
              vlib_buffer_t *_b = vlib_get_buffer (vm, _bi);
              while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  if (_b->next_buffer != range_vnb->ip.reass.next_range_bi)
                    {
                      _bi = _b->next_buffer;
                      _b = vlib_get_buffer (vm, _bi);
                    }
                  else
                    {
                      _b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                      break;
                    }
                }
            }
          range_bi = range_vnb->ip.reass.next_range_bi;
        }
    }
  if (*bi0 != ~0)
    {
      // *bi0 was not in the range list - prepend it as the first range
      vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
      vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
      if (~0 != reass->first_bi)
        {
          fvnb->ip.reass.next_range_bi = reass->first_bi;
          reass->first_bi = *bi0;
        }
      else
        {
          reass->first_bi = *bi0;
          fvnb->ip.reass.next_range_bi = ~0;
        }
      *bi0 = ~0;
    }
}
553
/* Handle a timed-out reassembly.  Unless a custom app owns it, and the
 * zero-offset fragment was received, the first buffer is detached and
 * steered to the ICMP error node (time exceeded, per RFC 8200/RFC 4443);
 * all remaining buffers are then dropped. */
always_inline void
ip6_full_reass_on_timeout (vlib_main_t *vm, vlib_node_runtime_t *node,
                           ip6_full_reass_t *reass, u32 *icmp_bi,
                           u32 *n_left_to_next, u32 **to_next)
{
  if (~0 == reass->first_bi)
    {
      return;
    }
  if (~0 == reass->next_index)  // custom apps don't want icmp
    {
      vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi);
      if (0 == vnet_buffer (b)->ip.reass.fragment_first)
        {
          *icmp_bi = reass->first_bi;
          if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
            {
              ip6_full_reass_add_trace (vm, node, reass, reass->first_bi, NULL,
                                        ICMP_ERROR_RT_EXCEEDED, ~0);
            }
          // fragment with offset zero received - send icmp message back
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              // separate first buffer from chain and steer it towards icmp node
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
              reass->first_bi = b->next_buffer;
            }
          else
            {
              reass->first_bi = vnet_buffer (b)->ip.reass.next_range_bi;
            }
          icmp6_error_set_vnet_buffer (b, ICMP6_time_exceeded,
                                       ICMP6_time_exceeded_fragment_reassembly_time_exceeded,
                                       0);
        }
    }
  ip6_full_reass_drop_all (vm, node, reass, n_left_to_next, to_next);
}
592
/* Look up the reassembly context for the given key, or create a new one.
 * Sets *do_handoff (and returns NULL) when the context is owned by a
 * different thread.  A context found but timed out is torn down and a
 * fresh one is created.  Returns NULL when the per-thread context limit
 * is reached or a hash insert fails without a racing insert. */
always_inline ip6_full_reass_t *
ip6_full_reass_find_or_create (vlib_main_t *vm, vlib_node_runtime_t *node,
                               ip6_full_reass_main_t *rm,
                               ip6_full_reass_per_thread_t *rt,
                               ip6_full_reass_kv_t *kv, u32 *icmp_bi,
                               u8 *do_handoff, int skip_bihash,
                               u32 *n_left_to_next, u32 **to_next)
{
  ip6_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);

  if (!skip_bihash && !clib_bihash_search_48_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      if (vm->thread_index != kv->v.memory_owner_thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }

      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);

      // stale context - tear it down and fall through to create a new one
      if (now > reass->last_heard + rm->timeout)
        {
          vlib_node_increment_counter (vm, node->node_index,
                                       IP6_ERROR_REASS_TIMEOUT, 1);
          ip6_full_reass_on_timeout (vm, node, reass, icmp_bi, n_left_to_next,
                                     to_next);
          ip6_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      // context limit reached - caller drops the fragment
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      reass->next_index = ~0;
      reass->error_next_index = ~0;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->reass_n;
    }

  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  if (!skip_bihash)
    {
      reass->key.as_u64[0] = kv->kv.key[0];
      reass->key.as_u64[1] = kv->kv.key[1];
      reass->key.as_u64[2] = kv->kv.key[2];
      reass->key.as_u64[3] = kv->kv.key[3];
      reass->key.as_u64[4] = kv->kv.key[4];
      reass->key.as_u64[5] = kv->kv.key[5];

      // is_add == 2: add only if not already present - TODO confirm semantics
      int rv = clib_bihash_add_del_48_8 (&rm->hash, &kv->kv, 2);
      if (rv)
        {
          ip6_full_reass_free (rm, rt, reass);
          reass = NULL;
          // if other worker created a context already work with the other copy
          if (-2 == rv)
            goto again;
        }
    }
  else
    {
      // context not hashed - mark the key as invalid
      reass->key.as_u64[0] = ~0;
      reass->key.as_u64[1] = ~0;
      reass->key.as_u64[2] = ~0;
      reass->key.as_u64[3] = ~0;
      reass->key.as_u64[4] = ~0;
      reass->key.as_u64[5] = ~0;
    }

  return reass;
}
694
/* Glue all collected fragments into one packet: walk the range list, trim
 * each sub-chain to its usable payload, link the survivors into a single
 * buffer chain, strip the fragment header from the first buffer, fix up
 * payload_length and counters.  On success *bi0 is the head buffer and
 * *next0/*error0 are set for enqueue; on failure the scratch buffers are
 * freed and an error code is returned. */
always_inline ip6_full_reass_rc_t
ip6_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip6_full_reass_main_t * rm,
                         ip6_full_reass_per_thread_t * rt,
                         ip6_full_reass_t * reass, u32 * bi0, u32 * next0,
                         u32 * error0, bool is_custom_app)
{
  *bi0 = reass->first_bi;
  *error0 = IP6_ERROR_NONE;
  ip6_frag_hdr_t *frag_hdr;
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  u32 dropped_cnt = 0;
  u32 *vec_drop_compress = NULL;
  ip6_full_reass_rc_t rv = IP6_FULL_REASS_RC_OK;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      // NOTE(review): uses && - only fires when BOTH invariants fail; verify intent
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
          goto free_buffers_and_return;
        }

      u32 data_len = ip6_full_reass_buffer_get_data_len (tmp);
      // skip ip6 headers + fragment header + already-covered payload
      u32 trim_front = vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
        sizeof (*frag_hdr) + ip6_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip6 header */
          if (0 != ip6_full_reass_buffer_get_data_offset (tmp))
            {
              rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
              goto free_buffers_and_return;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            (vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
             sizeof (*frag_hdr));
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
              goto free_buffers_and_return;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // walk this range's buffer chain, trimming and relinking
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
                      goto free_buffers_and_return;
                    }
                  trim_front -= tmp->current_length;
                  vec_add1 (vec_drop_compress, tmp_bi);
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // append this buffer to the reassembled chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
                      goto free_buffers_and_return;
                    }
                }
              total_length += tmp->current_length;
            }
          else
            {
              // trailing buffer past the kept data - schedule for freeing
              if (reass->first_bi == tmp_bi)
                {
                  rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
                  goto free_buffers_and_return;
                }
              vec_add1 (vec_drop_compress, tmp_bi);
              ++dropped_cnt;
            }
          if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              tmp_bi = tmp->next_buffer;
              tmp = vlib_get_buffer (vm, tmp->next_buffer);
            }
          else
            {
              break;
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
      goto free_buffers_and_return;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  if (total_length < first_b->current_length)
    {
      rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
      goto free_buffers_and_return;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // drop fragment header
  vnet_buffer_opaque_t *first_b_vnb = vnet_buffer (first_b);
  ip6_header_t *ip = vlib_buffer_get_current (first_b);
  u16 ip6_frag_hdr_offset = first_b_vnb->ip.reass.ip6_frag_hdr_offset;
  ip6_ext_hdr_chain_t hdr_chain;
  ip6_ext_header_t *prev_hdr = 0;
  int res = ip6_ext_header_walk (first_b, ip, IP_PROTOCOL_IPV6_FRAGMENTATION,
                                 &hdr_chain);
  if (res < 0 ||
      (hdr_chain.eh[res].protocol != IP_PROTOCOL_IPV6_FRAGMENTATION))
    {
      rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
      goto free_buffers_and_return;
    }
  frag_hdr = ip6_ext_next_header_offset (ip, hdr_chain.eh[res].offset);
  // re-link the header chain around the fragment header
  if (res > 0)
    {
      prev_hdr = ip6_ext_next_header_offset (ip, hdr_chain.eh[res - 1].offset);
      prev_hdr->next_hdr = frag_hdr->next_hdr;
    }
  else
    {
      ip->protocol = frag_hdr->next_hdr;
    }
  if (hdr_chain.eh[res].offset != ip6_frag_hdr_offset)
    {
      rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
      goto free_buffers_and_return;
    }
  // close the gap left by the removed fragment header
  memmove (frag_hdr, (u8 *) frag_hdr + sizeof (*frag_hdr),
           first_b->current_length - ip6_frag_hdr_offset -
           sizeof (ip6_frag_hdr_t));
  first_b->current_length -= sizeof (*frag_hdr);
  ip->payload_length =
    clib_host_to_net_u16 (total_length + first_b->current_length -
                          sizeof (*ip));
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      rv = IP6_FULL_REASS_RC_NO_BUF;
      goto free_buffers_and_return;
    }
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip6_full_reass_add_trace (vm, node, reass, reass->first_bi, NULL,
                                FINALIZE, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  if (!is_custom_app)
    {
      *next0 = IP6_FULL_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  /* Keep track of number of successfully reassembled packets and number of
   * fragments reassembled */
  vlib_node_increment_counter (vm, node->node_index, IP6_ERROR_REASS_SUCCESS,
                               1);

  vlib_node_increment_counter (vm, node->node_index,
                               IP6_ERROR_REASS_FRAGMENTS_REASSEMBLED,
                               reass->fragments_n);

  ip6_full_reass_free (rm, rt, reass);
  reass = NULL;
free_buffers_and_return:
  vlib_buffer_free (vm, vec_drop_compress, vec_len (vec_drop_compress));
  vec_free (vec_drop_compress);
  return rv;
}
938
939 always_inline void
940 ip6_full_reass_insert_range_in_chain (vlib_main_t * vm,
941                                       ip6_full_reass_t * reass,
942                                       u32 prev_range_bi, u32 new_next_bi)
943 {
944
945   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
946   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
947   if (~0 != prev_range_bi)
948     {
949       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
950       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
951       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
952       prev_vnb->ip.reass.next_range_bi = new_next_bi;
953     }
954   else
955     {
956       if (~0 != reass->first_bi)
957         {
958           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
959         }
960       reass->first_bi = new_next_bi;
961     }
962   reass->data_len += ip6_full_reass_buffer_get_data_len (new_next_b);
963 }
964
/**
 * @brief Fold one fragment into a reassembly context.
 *
 * Validates the fragment, inserts it at the right position in the
 * reassembly's range list (RFC 8200 forbids overlaps) and, when all
 * octets up to the last fragment have arrived, finalizes the packet.
 *
 * @param bi0   in/out - buffer index of the fragment; set to ~0 when the
 *              fragment is consumed into the reassembly
 * @param next0/error0  out - next node / error for the buffer when it is
 *              forwarded or dropped rather than consumed
 * @param handoff_thread_idx  out - target thread when RC_HANDOFF returned
 * @param skip_bihash  non-zero for atomic fragments which were never
 *              inserted into the lookup hash
 * @return ip6_full_reass_rc_t status code
 */
always_inline ip6_full_reass_rc_t
ip6_full_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
                       ip6_full_reass_main_t *rm,
                       ip6_full_reass_per_thread_t *rt,
                       ip6_full_reass_t *reass, u32 *bi0, u32 *next0,
                       u32 *error0, ip6_frag_hdr_t *frag_hdr,
                       bool is_custom_app, u32 *handoff_thread_idx,
                       int skip_bihash)
{
  int consumed = 0;
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  if (is_custom_app)
    {
      reass->next_index = fvnb->ip.reass.next_index;    // store next_index before it's overwritten
      reass->error_next_index = fvnb->ip.reass.error_next_index;        // store error_next_index before it is overwritten
    }

  /* remember where the fragment header sits relative to current data */
  fvnb->ip.reass.ip6_frag_hdr_offset =
    (u8 *) frag_hdr - (u8 *) vlib_buffer_get_current (fb);
  ip6_header_t *fip = vlib_buffer_get_current (fb);
  if (fb->current_length < sizeof (*fip) ||
      fvnb->ip.reass.ip6_frag_hdr_offset == 0 ||
      fvnb->ip.reass.ip6_frag_hdr_offset >= fb->current_length)
    {
      /* malformed buffer - frag header must lie inside the first buffer */
      return IP6_FULL_REASS_RC_INTERNAL_ERROR;
    }

  /* byte offset of this fragment within the original packet */
  u32 fragment_first = fvnb->ip.reass.fragment_first =
    ip6_frag_hdr_offset_bytes (frag_hdr);
  u32 fragment_length =
    vlib_buffer_length_in_chain (vm, fb) -
    (fvnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
  if (0 == fragment_length)
    {
      return IP6_FULL_REASS_RC_INVALID_FRAG_LEN;
    }
  u32 fragment_last = fvnb->ip.reass.fragment_last =
    fragment_first + fragment_length - 1;
  int more_fragments = ip6_frag_hdr_more (frag_hdr);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      /* last fragment fixes the total size of the original packet */
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      ip6_full_reass_insert_range_in_chain (vm, reass, prev_range_bi, *bi0);
      reass->min_fragment_length = clib_net_to_host_u16 (fip->payload_length);
      consumed = 1;
      reass->fragments_n = 1;
      goto check_if_done_maybe;
    }
  /* track the smallest fragment seen - used as the MTU estimate later */
  reass->min_fragment_length =
    clib_min (clib_net_to_host_u16 (fip->payload_length),
              fvnb->ip.reass.estimated_mtu);
  /* walk the sorted range list to find this fragment's place */
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
        {
          // this fragments starts after candidate range
          prev_range_bi = candidate_range_bi;
          candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
          if (candidate_vnb->ip.reass.range_last < fragment_last &&
              ~0 == candidate_range_bi)
            {
              // special case - this fragment falls beyond all known ranges
              ip6_full_reass_insert_range_in_chain (vm, reass, prev_range_bi,
                                                    *bi0);
              consumed = 1;
              break;
            }
          continue;
        }
      if (fragment_last < candidate_vnb->ip.reass.range_first)
        {
          // this fragment ends before candidate range without any overlap
          ip6_full_reass_insert_range_in_chain (vm, reass, prev_range_bi,
                                                *bi0);
          consumed = 1;
        }
      else if (fragment_first == candidate_vnb->ip.reass.range_first &&
               fragment_last == candidate_vnb->ip.reass.range_last)
        {
          // duplicate fragment - ignore
        }
      else
        {
          // overlapping fragment - not allowed by RFC 8200
          if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
            {
              ip6_full_reass_add_trace (vm, node, reass, *bi0, frag_hdr,
                                        RANGE_OVERLAP, ~0);
            }
          return IP6_FULL_REASS_RC_OVERLAP;
        }
      break;
    }
  ++reass->fragments_n;
check_if_done_maybe:
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip6_full_reass_add_trace (vm, node, reass, *bi0, frag_hdr, RANGE_NEW,
                                    ~0);
        }
    }
  else if (skip_bihash)
    {
      // if this reassembly is not in bihash, then the packet must have been
      // consumed
      return IP6_FULL_REASS_RC_INTERNAL_ERROR;
    }
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      /* all octets present - finalize; hand off if another thread is the
       * designated sender of the reassembled packet */
      *handoff_thread_idx = reass->sendout_thread_index;
      int handoff =
        reass->memory_owner_thread_index != reass->sendout_thread_index;
      ip6_full_reass_rc_t rc =
        ip6_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
                                 is_custom_app);
      if (IP6_FULL_REASS_RC_OK == rc && handoff)
        {
          return IP6_FULL_REASS_RC_HANDOFF;
        }
      return rc;
    }
  else
    {
      if (skip_bihash)
        {
          // if this reassembly is not in bihash, it should've been an atomic
          // fragment and thus finalized
          return IP6_FULL_REASS_RC_INTERNAL_ERROR;
        }
      if (consumed)
        {
          *bi0 = ~0;
          if (reass->fragments_n > rm->max_reass_len)
            {
              return IP6_FULL_REASS_RC_TOO_MANY_FRAGMENTS;
            }
        }
      else
        {
          /* duplicate fragment - drop it */
          *next0 = IP6_FULL_REASSEMBLY_NEXT_DROP;
          *error0 = IP6_ERROR_REASS_DUPLICATE_FRAGMENT;
        }
    }
  return IP6_FULL_REASS_RC_OK;
}
1125
1126 always_inline bool
1127 ip6_full_reass_verify_upper_layer_present (vlib_node_runtime_t *node,
1128                                            vlib_buffer_t *b,
1129                                            ip6_ext_hdr_chain_t *hc)
1130 {
1131   int nh = hc->eh[hc->length - 1].protocol;
1132   /* Checking to see if it's a terminating header */
1133   if (ip6_ext_hdr (nh))
1134     {
1135       icmp6_error_set_vnet_buffer (
1136         b, ICMP6_parameter_problem,
1137         ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain, 0);
1138       b->error = node->errors[IP6_ERROR_REASS_MISSING_UPPER];
1139       return false;
1140     }
1141   return true;
1142 }
1143
1144 always_inline bool
1145 ip6_full_reass_verify_fragment_multiple_8 (vlib_main_t *vm,
1146                                            vlib_node_runtime_t *node,
1147                                            vlib_buffer_t *b,
1148                                            ip6_frag_hdr_t *frag_hdr)
1149 {
1150   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1151   ip6_header_t *ip = vlib_buffer_get_current (b);
1152   int more_fragments = ip6_frag_hdr_more (frag_hdr);
1153   u32 fragment_length =
1154     vlib_buffer_length_in_chain (vm, b) -
1155     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
1156   if (more_fragments && 0 != fragment_length % 8)
1157     {
1158       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
1159                                    ICMP6_parameter_problem_erroneous_header_field,
1160                                    (u8 *) & ip->payload_length - (u8 *) ip);
1161       b->error = node->errors[IP6_ERROR_REASS_INVALID_FRAG_SIZE];
1162       return false;
1163     }
1164   return true;
1165 }
1166
1167 always_inline bool
1168 ip6_full_reass_verify_packet_size_lt_64k (vlib_main_t *vm,
1169                                           vlib_node_runtime_t *node,
1170                                           vlib_buffer_t *b,
1171                                           ip6_frag_hdr_t *frag_hdr)
1172 {
1173   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1174   u32 fragment_first = ip6_frag_hdr_offset_bytes (frag_hdr);
1175   u32 fragment_length =
1176     vlib_buffer_length_in_chain (vm, b) -
1177     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
1178   if (fragment_first + fragment_length > 65535)
1179     {
1180       ip6_header_t *ip0 = vlib_buffer_get_current (b);
1181       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
1182                                    ICMP6_parameter_problem_erroneous_header_field,
1183                                    (u8 *) & frag_hdr->fragment_offset_and_more
1184                                    - (u8 *) ip0);
1185       b->error = node->errors[IP6_ERROR_REASS_INVALID_FRAG_SIZE];
1186       return false;
1187     }
1188   return true;
1189 }
1190
/**
 * @brief Common worker for all IPv6 full-reassembly graph nodes.
 *
 * Walks the input frame one buffer at a time under the per-thread lock:
 * locates the fragment header, runs RFC 8200 sanity checks, looks up or
 * creates a reassembly context and feeds the fragment into it, handling
 * thread handoff and error dispositions along the way.
 *
 * @param is_feature     node runs as an ip6-unicast feature
 * @param is_custom_app  next/error node indices come from buffer metadata
 * @param is_local       node sits on the ip6-local path (can be disabled)
 * @return number of vectors processed
 */
always_inline uword
ip6_full_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                            vlib_frame_t *frame, bool is_feature,
                            bool is_custom_app, bool is_local)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip6_full_reass_main_t *rm = &ip6_full_reass_main;
  ip6_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serialize access to this thread's reassembly pool and hash */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0 = IP6_FULL_REASSEMBLY_NEXT_DROP;
          u32 error0 = IP6_ERROR_NONE;
          u32 icmp_bi = ~0;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip6_header_t *ip0 = vlib_buffer_get_current (b0);
          ip6_frag_hdr_t *frag_hdr = NULL;
          ip6_ext_hdr_chain_t hdr_chain;
          vnet_buffer_opaque_t *fvnb = vnet_buffer (b0);

          /* find the fragmentation header in the extension header chain */
          int res = ip6_ext_header_walk (
            b0, ip0, IP_PROTOCOL_IPV6_FRAGMENTATION, &hdr_chain);
          if (res < 0 ||
              hdr_chain.eh[res].protocol != IP_PROTOCOL_IPV6_FRAGMENTATION)
            {
              vlib_node_increment_counter (vm, node->node_index,
                                           IP6_ERROR_REASS_NO_FRAG_HDR, 1);
              // this is a mangled packet - no fragmentation
              next0 = is_custom_app ? fvnb->ip.reass.error_next_index :
                                            IP6_FULL_REASSEMBLY_NEXT_DROP;
              ip6_full_reass_add_trace (vm, node, NULL, bi0, NULL, PASSTHROUGH,
                                        ~0);
              goto skip_reass;
            }
          if (is_local && !rm->is_local_reass_enabled)
            {
              /* local reassembly administratively disabled - drop */
              next0 = IP6_FULL_REASSEMBLY_NEXT_DROP;
              goto skip_reass;
            }

          /* Keep track of received fragments */
          vlib_node_increment_counter (vm, node->node_index,
                                       IP6_ERROR_REASS_FRAGMENTS_RCVD, 1);
          frag_hdr =
            ip6_ext_next_header_offset (ip0, hdr_chain.eh[res].offset);
          vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset =
            hdr_chain.eh[res].offset;

          if (0 == ip6_frag_hdr_offset (frag_hdr))
            {
              // first fragment - verify upper-layer is present
              if (!ip6_full_reass_verify_upper_layer_present (node, b0,
                                                              &hdr_chain))
                {
                  next0 = is_custom_app ? fvnb->ip.reass.error_next_index :
                                                IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR;
                  goto skip_reass;
                }
            }

          /* RFC 8200 size/alignment sanity checks */
          if (!ip6_full_reass_verify_fragment_multiple_8 (vm, node, b0,
                                                          frag_hdr) ||
              !ip6_full_reass_verify_packet_size_lt_64k (vm, node, b0,
                                                         frag_hdr))
            {
              next0 = is_custom_app ? fvnb->ip.reass.error_next_index :
                                            IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR;
              goto skip_reass;
            }

          int skip_bihash = 0;
          ip6_full_reass_kv_t kv;
          u8 do_handoff = 0;

          if (0 == ip6_frag_hdr_offset (frag_hdr) &&
              !ip6_frag_hdr_more (frag_hdr))
            {
              // this is atomic fragment and needs to be processed separately
              skip_bihash = 1;
            }
          else
            {
              /* reassembly key: src + dst + (fib index, fragment id) */
              kv.k.as_u64[0] = ip0->src_address.as_u64[0];
              kv.k.as_u64[1] = ip0->src_address.as_u64[1];
              kv.k.as_u64[2] = ip0->dst_address.as_u64[0];
              kv.k.as_u64[3] = ip0->dst_address.as_u64[1];
              kv.k.as_u64[4] =
                ((u64) vec_elt (ip6_main.fib_index_by_sw_if_index,
                                vnet_buffer (b0)->sw_if_index[VLIB_RX]))
                  << 32 |
                (u64) frag_hdr->identification;
              /* RFC 8200: The Next Header values in the Fragment headers of
               * different fragments of the same original packet may differ.
               * Only the value from the Offset zero fragment packet is used
               * for reassembly.
               *
               * Also, IPv6 Header doesnt contain the protocol value unlike
               * IPv4.*/
              kv.k.as_u64[5] = 0;
            }

          ip6_full_reass_t *reass = ip6_full_reass_find_or_create (
            vm, node, rm, rt, &kv, &icmp_bi, &do_handoff, skip_bihash,
            &n_left_to_next, &to_next);

          if (reass)
            {
              /* the thread that saw offset-zero sends the final packet */
              const u32 fragment_first = ip6_frag_hdr_offset (frag_hdr);
              if (0 == fragment_first)
                {
                  reass->sendout_thread_index = vm->thread_index;
                }
            }
          if (PREDICT_FALSE (do_handoff))
            {
              /* reassembly owned by another thread - hand the buffer off */
              next0 = IP6_FULL_REASSEMBLY_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.memory_owner_thread_index;
            }
          else if (reass)
            {
              u32 handoff_thread_idx;
              u32 counter = ~0;
              switch (ip6_full_reass_update (
                vm, node, rm, rt, reass, &bi0, &next0, &error0, frag_hdr,
                is_custom_app, &handoff_thread_idx, skip_bihash))
                {
                case IP6_FULL_REASS_RC_OK:
                  /* nothing to do here */
                  break;
                case IP6_FULL_REASS_RC_HANDOFF:
                  next0 = IP6_FULL_REASSEMBLY_NEXT_HANDOFF;
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_buffer (b0)->ip.reass.owner_thread_index =
                    handoff_thread_idx;
                  break;
                case IP6_FULL_REASS_RC_TOO_MANY_FRAGMENTS:
                  counter = IP6_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
                  break;
                case IP6_FULL_REASS_RC_NO_BUF:
                  counter = IP6_ERROR_REASS_NO_BUF;
                  break;
                case IP6_FULL_REASS_RC_INVALID_FRAG_LEN:
                  counter = IP6_ERROR_REASS_INVALID_FRAG_LEN;
                  break;
                case IP6_FULL_REASS_RC_OVERLAP:
                  counter = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT;
                  break;
                case IP6_FULL_REASS_RC_INTERNAL_ERROR:
                  counter = IP6_ERROR_REASS_INTERNAL_ERROR;
                  /* Sanitization is needed in internal error cases only, as
                   * the incoming packet is already dropped in other cases,
                   * also adding bi0 back to the reassembly list, fixes the
                   * leaking of buffers during internal errors.
                   *
                   * Also it doesnt make sense to send these buffers custom
                   * app, these fragments are with internal errors */
                  sanitize_reass_buffers_add_missing (vm, reass, &bi0);
                  reass->error_next_index = ~0;
                  break;
                }
              if (~0 != counter)
                {
                  /* error path - count, drop all fragments, free context */
                  vlib_node_increment_counter (vm, node->node_index, counter,
                                               1);
                  ip6_full_reass_drop_all (vm, node, reass, &n_left_to_next,
                                           &to_next);
                  ip6_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                }
            }
          else
            {
              /* could not allocate a reassembly context */
              if (is_feature)
                {
                  next0 = IP6_FULL_REASSEMBLY_NEXT_DROP;
                }
              else
                {
                  next0 = fvnb->ip.reass.error_next_index;
                }
              error0 = IP6_ERROR_REASS_LIMIT_REACHED;
            }

          if (~0 != bi0)
            {
            skip_reass:
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;

              /* bi0 might have been updated by reass_finalize, reload */
              b0 = vlib_get_buffer (vm, bi0);
              if (IP6_ERROR_NONE != error0)
                {
                  b0->error = node->errors[error0];
                }

              if (next0 == IP6_FULL_REASSEMBLY_NEXT_HANDOFF)
                {
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip6_full_reass_add_trace (
                        vm, node, NULL, bi0, frag_hdr, HANDOFF,
                        vnet_buffer (b0)->ip.reass.owner_thread_index);
                    }
                }
              else if (is_feature && IP6_ERROR_NONE == error0)
                {
                  vnet_feature_next (&next0, b0);
                }

              /* Increment the counter to-custom-app also as this fragment is
               * also going to application */
              if (is_custom_app)
                {
                  vlib_node_increment_counter (
                    vm, node->node_index, IP6_ERROR_REASS_TO_CUSTOM_APP, 1);
                }

              vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                               n_left_to_next, bi0, next0);
            }

          /* an ICMP time-exceeded buffer may have been produced on reuse */
          if (~0 != icmp_bi)
            {
              next0 = IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR;
              to_next[0] = icmp_bi;
              to_next += 1;
              n_left_to_next -= 1;
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                               n_left_to_next, icmp_bi,
                                               next0);
            }
        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1450
1451 VLIB_NODE_FN (ip6_full_reass_node) (vlib_main_t * vm,
1452                                     vlib_node_runtime_t * node,
1453                                     vlib_frame_t * frame)
1454 {
1455   return ip6_full_reassembly_inline (vm, node, frame, false /* is_feature */,
1456                                      false /* is_custom_app */,
1457                                      false /* is_local */);
1458 }
1459
/* Graph node registration for the plain reassembly path; error strings
 * come from the .api-declared ip6 error counter table. */
VLIB_REGISTER_NODE (ip6_full_reass_node) = {
    .name = "ip6-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip6_full_reass_trace,
    .n_errors = IP6_N_ERROR,
    .error_counters = ip6_error_counters,
    .n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP6_FULL_REASSEMBLY_NEXT_INPUT] = "ip6-input",
                [IP6_FULL_REASSEMBLY_NEXT_DROP] = "ip6-drop",
                [IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
                [IP6_FULL_REASSEMBLY_NEXT_HANDOFF] = "ip6-full-reassembly-handoff",
        },
};
1475
1476 VLIB_NODE_FN (ip6_local_full_reass_node)
1477 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
1478 {
1479   return ip6_full_reassembly_inline (vm, node, frame, false /* is_feature */,
1480                                      false /* is_custom_app */,
1481                                      true /* is_local */);
1482 }
1483
/* Graph node registration for the ip6-local reassembly path. */
VLIB_REGISTER_NODE (ip6_local_full_reass_node) = {
    .name = "ip6-local-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip6_full_reass_trace,
    .n_errors = IP6_N_ERROR,
    .error_counters = ip6_error_counters,
    .n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP6_FULL_REASSEMBLY_NEXT_INPUT] = "ip6-input",
                [IP6_FULL_REASSEMBLY_NEXT_DROP] = "ip6-drop",
                [IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
                [IP6_FULL_REASSEMBLY_NEXT_HANDOFF] = "ip6-local-full-reassembly-handoff",
        },
};
1499
1500 VLIB_NODE_FN (ip6_full_reass_node_feature) (vlib_main_t * vm,
1501                                             vlib_node_runtime_t * node,
1502                                             vlib_frame_t * frame)
1503 {
1504   return ip6_full_reassembly_inline (vm, node, frame, true /* is_feature */,
1505                                      false /* is_custom_app */,
1506                                      false /* is_local */);
1507 }
1508
/* Graph node registration for the feature-arc reassembly variant. */
VLIB_REGISTER_NODE (ip6_full_reass_node_feature) = {
    .name = "ip6-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip6_full_reass_trace,
    .n_errors = IP6_N_ERROR,
    .error_counters = ip6_error_counters,
    .n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP6_FULL_REASSEMBLY_NEXT_INPUT] = "ip6-input",
                [IP6_FULL_REASSEMBLY_NEXT_DROP] = "ip6-drop",
                [IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
                [IP6_FULL_REASSEMBLY_NEXT_HANDOFF] = "ip6-full-reass-feature-hoff",
        },
};
1524
/* Register the feature node on the ip6-unicast arc, ahead of lookup and
 * IPsec input so those see only whole packets. */
VNET_FEATURE_INIT (ip6_full_reassembly_feature, static) = {
    .arc_name = "ip6-unicast",
    .node_name = "ip6-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip6-lookup",
                                  "ipsec6-input-feature"),
    .runs_after = 0,
};
1532
1533 VLIB_NODE_FN (ip6_full_reass_node_custom)
1534 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
1535 {
1536   return ip6_full_reassembly_inline (vm, node, frame, false /* is_feature */,
1537                                      true /* is_custom_app */,
1538                                      false /* is_local */);
1539 }
1540
/* Graph node registration for the custom-app reassembly variant. */
VLIB_REGISTER_NODE (ip6_full_reass_node_custom) = {
    .name = "ip6-full-reassembly-custom",
    .vector_size = sizeof (u32),
    .format_trace = format_ip6_full_reass_trace,
    .n_errors = IP6_N_ERROR,
    .error_counters = ip6_error_counters,
    .n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP6_FULL_REASSEMBLY_NEXT_INPUT] = "ip6-input",
                [IP6_FULL_REASSEMBLY_NEXT_DROP] = "ip6-drop",
                [IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
                [IP6_FULL_REASSEMBLY_NEXT_HANDOFF] = "ip6-full-reass-custom-hoff",
        },
};
1556
1557 #ifndef CLIB_MARCH_VARIANT
1558 static u32
1559 ip6_full_reass_get_nbuckets ()
1560 {
1561   ip6_full_reass_main_t *rm = &ip6_full_reass_main;
1562   u32 nbuckets;
1563   u8 i;
1564
1565   /* need more mem with more workers */
1566   nbuckets = (u32) (rm->max_reass_n * (vlib_num_workers () + 1) /
1567                     IP6_FULL_REASS_HT_LOAD_FACTOR);
1568
1569   for (i = 0; i < 31; i++)
1570     if ((1 << i) >= nbuckets)
1571       break;
1572   nbuckets = 1 << i;
1573
1574   return nbuckets;
1575 }
1576 #endif /* CLIB_MARCH_VARIANT */
1577
/* Events delivered to the expiry-walk process node. */
typedef enum
{
  IP6_EVENT_CONFIG_CHANGED = 1,
} ip6_full_reass_event_t;
1582
#ifndef CLIB_MARCH_VARIANT
/* Context passed to ip6_rehash_cb while copying entries to a new hash. */
typedef struct
{
  int failure;			/* set when any insert into new_hash fails */
  clib_bihash_48_8_t *new_hash;	/* destination hash table */
} ip6_rehash_cb_ctx;
1589
1590 static int
1591 ip6_rehash_cb (clib_bihash_kv_48_8_t * kv, void *_ctx)
1592 {
1593   ip6_rehash_cb_ctx *ctx = _ctx;
1594   if (clib_bihash_add_del_48_8 (ctx->new_hash, kv, 1))
1595     {
1596       ctx->failure = 1;
1597     }
1598   return (BIHASH_WALK_CONTINUE);
1599 }
1600
1601 static void
1602 ip6_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1603                            u32 max_reassembly_length,
1604                            u32 expire_walk_interval_ms)
1605 {
1606   ip6_full_reass_main.timeout_ms = timeout_ms;
1607   ip6_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1608   ip6_full_reass_main.max_reass_n = max_reassemblies;
1609   ip6_full_reass_main.max_reass_len = max_reassembly_length;
1610   ip6_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1611 }
1612
/**
 * @brief API handler - apply new reassembly parameters.
 *
 * Updates the parameters, wakes the expiry-walk process and, when the
 * new configuration needs a bigger hash table, rehashes all existing
 * entries into a freshly allocated one (grow-only).
 *
 * @return 0 on success, -1 if rehashing failed (old hash kept intact)
 */
vnet_api_error_t
ip6_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
                    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip6_full_reass_get_nbuckets ();
  ip6_full_reass_set_params (timeout_ms, max_reassemblies,
                             max_reassembly_length, expire_walk_interval_ms);
  /* let the expiry process pick up the new walk interval */
  vlib_process_signal_event (ip6_full_reass_main.vlib_main,
                             ip6_full_reass_main.ip6_full_reass_expire_node_idx,
                             IP6_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip6_full_reass_get_nbuckets ();
  if (ip6_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_48_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip6_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_48_8 (&new_hash, "ip6-full-reass", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy every existing entry into the bigger table */
      clib_bihash_foreach_key_value_pair_48_8 (&ip6_full_reass_main.hash,
                                               ip6_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* keep the old hash on failure */
          clib_bihash_free_48_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_48_8 (&ip6_full_reass_main.hash);
          clib_memcpy_fast (&ip6_full_reass_main.hash, &new_hash,
                            sizeof (ip6_full_reass_main.hash));
          clib_bihash_copied (&ip6_full_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1650
1651 vnet_api_error_t
1652 ip6_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1653                     u32 * max_reassembly_length,
1654                     u32 * expire_walk_interval_ms)
1655 {
1656   *timeout_ms = ip6_full_reass_main.timeout_ms;
1657   *max_reassemblies = ip6_full_reass_main.max_reass_n;
1658   *max_reassembly_length = ip6_full_reass_main.max_reass_len;
1659   *expire_walk_interval_ms = ip6_full_reass_main.expire_walk_interval_ms;
1660   return 0;
1661 }
1662
/**
 * @brief Plugin init - set defaults, allocate per-thread state, create the
 * lookup hash, register for the IPv6 fragmentation protocol and set up the
 * handoff frame queues.
 */
static clib_error_t *
ip6_full_reass_init_function (vlib_main_t * vm)
{
  ip6_full_reass_main_t *rm = &ip6_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one lock + context pool per worker (plus main thread) */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip6_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip6-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip6_full_reass_expire_node_idx = node->index;

  /* defaults must be in place before sizing the hash below */
  ip6_full_reass_set_params (IP6_FULL_REASS_TIMEOUT_DEFAULT_MS,
                             IP6_FULL_REASS_MAX_REASSEMBLIES_DEFAULT,
                             IP6_FULL_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                             IP6_FULL_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip6_full_reass_get_nbuckets ();
  clib_bihash_init_48_8 (&rm->hash, "ip6-full-reass", nbuckets,
                         nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip6-icmp-error");
  ASSERT (node);
  rm->ip6_icmp_error_idx = node->index;

  if ((error = vlib_call_init_function (vm, ip_main_init)))
    return error;
  /* route fragmented ip6-local packets through the local reass node */
  ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
                         ip6_local_full_reass_node.index);
  rm->is_local_reass_enabled = 1;

  /* frame queues used for cross-thread handoff, one per node variant */
  rm->fq_index = vlib_frame_queue_main_init (ip6_full_reass_node.index, 0);
  rm->fq_local_index =
    vlib_frame_queue_main_init (ip6_local_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip6_full_reass_node_feature.index, 0);
  rm->fq_custom_index =
    vlib_frame_queue_main_init (ip6_full_reass_node_custom.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  return error;
}

VLIB_INIT_FUNCTION (ip6_full_reass_init_function);
1717 #endif /* CLIB_MARCH_VARIANT */
1718
/**
 * @brief Process node walking reassembly pools and expiring stale contexts.
 *
 * Wakes up every expire_walk_interval_ms (or on a config-change event),
 * walks a bounded slice of each thread's pool, frees contexts whose
 * last_heard timestamp is older than the configured timeout and forwards
 * any ICMP time-exceeded buffers generated on timeout to ip6-icmp-error.
 */
static uword
ip6_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
                             CLIB_UNUSED (vlib_frame_t *f))
{
  ip6_full_reass_main_t *rm = &ip6_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64) rm->expire_walk_interval_ms
                                            / (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:
          /* no events => timeout */
          /* fallthrough */
        case IP6_EVENT_CONFIG_CHANGED:
          /* nothing to do here */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip6_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      u32 *vec_icmp_bi = NULL;
      /* NOTE(review): n_left_to_next/to_next are passed by address to
       * ip6_full_reass_on_timeout without being initialized here —
       * presumably they are pure outputs of the callee; verify. */
      u32 n_left_to_next, *to_next;

      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip6_full_reass_per_thread_t *rt =
            &rm->per_thread_data[thread_index];
          u32 reass_timeout_cnt = 0;
          clib_spinlock_lock (&rt->lock);

          vec_reset_length (pool_indexes_to_free);
          /* Pace the number of timeouts handled per thread, to avoid barrier
           * sync issues in real world scenarios */

          /* walk a bounded [beg, end) window of the pool per wakeup */
          u32 beg = rt->last_id;
          /* to ensure we walk at least once per sec per context */
          u32 end = beg + (IP6_FULL_REASS_MAX_REASSEMBLIES_DEFAULT *
                             IP6_FULL_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS /
                             MSEC_PER_SEC +
                           1);
          if (end > vec_len (rt->pool))
            {
              end = vec_len (rt->pool);
              rt->last_id = 0;
            }
          else
            {
              rt->last_id = end;
            }

          /* first pass: collect expired context indices */
          pool_foreach_stepping_index (index, beg, end, rt->pool)
          {
            reass = pool_elt_at_index (rt->pool, index);
            if (now > reass->last_heard + rm->timeout)
              {
                vec_add1 (pool_indexes_to_free, index);
              }
          }

          /* second pass: generate ICMP errors and free the contexts */
          int *i;
          vec_foreach (i, pool_indexes_to_free)
          {
            ip6_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            u32 icmp_bi = ~0;

            reass_timeout_cnt += reass->fragments_n;
            ip6_full_reass_on_timeout (vm, node, reass, &icmp_bi,
                                       &n_left_to_next, &to_next);
            if (~0 != icmp_bi)
              vec_add1 (vec_icmp_bi, icmp_bi);

            ip6_full_reass_free (rm, rt, reass);
          }

          clib_spinlock_unlock (&rt->lock);
          if (reass_timeout_cnt)
            vlib_node_increment_counter (vm, node->node_index,
                                         IP6_ERROR_REASS_TIMEOUT,
                                         reass_timeout_cnt);
        }

      /* flush collected ICMP buffers to ip6-icmp-error, one frame at a
       * time; propagate the trace flag if any buffer is traced */
      while (vec_len (vec_icmp_bi) > 0)
        {
          vlib_frame_t *f =
            vlib_get_frame_to_node (vm, rm->ip6_icmp_error_idx);
          u32 *to_next = vlib_frame_vector_args (f);
          u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors;
          int trace_frame = 0;
          while (vec_len (vec_icmp_bi) > 0 && n_left_to_next > 0)
            {
              u32 bi = vec_pop (vec_icmp_bi);
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
                trace_frame = 1;
              to_next[0] = bi;
              ++f->n_vectors;
              to_next += 1;
              n_left_to_next -= 1;
            }
          f->frame_flags |= (trace_frame * VLIB_FRAME_TRACE);
          vlib_put_frame_to_node (vm, rm->ip6_icmp_error_idx, f);
        }

      vec_free (pool_indexes_to_free);
      vec_free (vec_icmp_bi);
      if (event_data)
        {
          vec_set_len (event_data, 0);
        }
    }

  /* never reached — process loops forever */
  return 0;
}
1846
/* Registration of the expire-walk process node; uses the .api-declared
 * ip6_error_counters so timeouts show up as regular IP6 error counters. */
VLIB_REGISTER_NODE (ip6_full_reass_expire_node) = {
  .function = ip6_full_reass_walk_expired,
  .format_trace = format_ip6_full_reass_trace,
  .type = VLIB_NODE_TYPE_PROCESS,
  .name = "ip6-full-reassembly-expire-walk",

  .n_errors = IP6_N_ERROR,
  .error_counters = ip6_error_counters,
};
1856
1857 static u8 *
1858 format_ip6_full_reass_key (u8 * s, va_list * args)
1859 {
1860   ip6_full_reass_key_t *key = va_arg (*args, ip6_full_reass_key_t *);
1861   s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1862               key->xx_id, format_ip6_address, &key->src, format_ip6_address,
1863               &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1864   return s;
1865 }
1866
1867 static u8 *
1868 format_ip6_full_reass (u8 * s, va_list * args)
1869 {
1870   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1871   ip6_full_reass_t *reass = va_arg (*args, ip6_full_reass_t *);
1872
1873   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1874               "last_packet_octet: %u, trace_op_counter: %u\n",
1875               reass->id, format_ip6_full_reass_key, &reass->key,
1876               reass->first_bi, reass->data_len, reass->last_packet_octet,
1877               reass->trace_op_counter);
1878   u32 bi = reass->first_bi;
1879   u32 counter = 0;
1880   while (~0 != bi)
1881     {
1882       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1883       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1884       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1885                   "fragment[%u, %u]\n",
1886                   counter, vnb->ip.reass.range_first,
1887                   vnb->ip.reass.range_last, bi,
1888                   ip6_full_reass_buffer_get_data_offset (b),
1889                   ip6_full_reass_buffer_get_data_len (b),
1890                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1891       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1892         {
1893           bi = b->next_buffer;
1894         }
1895       else
1896         {
1897           bi = ~0;
1898         }
1899     }
1900   return s;
1901 }
1902
/**
 * @brief CLI handler for "show ip6-full-reassembly [details]".
 *
 * Prints the configured limits and the number of active reassembly
 * contexts summed over all threads; with "details", dumps every context.
 */
static clib_error_t *
show_ip6_full_reass (vlib_main_t * vm, unformat_input_t * input,
                     CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip6_full_reass_main_t *rm = &ip6_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP6 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  /* NOTE(review): sum_buffers_n is never accumulated anywhere in this
   * function, so the "Buffers in use" line below always reports 0. */
  u64 sum_buffers_n = 0;
  ip6_full_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip6_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      /* hold the per-thread lock while reading its pool */
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          pool_foreach (reass, rt->pool) {
            vlib_cli_output (vm, "%U", format_ip6_full_reass, vm, reass);
          }
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP6 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent full IP6 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured amount of fragments "
                   "per full IP6 reassembly: %lu\n",
                   (long unsigned) rm->max_reass_len);
  vlib_cli_output (vm,
                   "Maximum configured full IP6 reassembly timeout: %lums\n",
                   (long unsigned) rm->timeout_ms);
  vlib_cli_output (vm,
                   "Maximum configured full IP6 reassembly expire walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
  vlib_cli_output (vm, "Buffers in use: %lu\n",
                   (long unsigned) sum_buffers_n);
  return 0;
}
1956
/* CLI command registration for the status dump above. */
VLIB_CLI_COMMAND (show_ip6_full_reassembly_cmd, static) = {
    .path = "show ip6-full-reassembly",
    .short_help = "show ip6-full-reassembly [details]",
    .function = show_ip6_full_reass,
};
1962
1963 #ifndef CLIB_MARCH_VARIANT
1964 vnet_api_error_t
1965 ip6_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1966 {
1967   return vnet_feature_enable_disable ("ip6-unicast",
1968                                       "ip6-full-reassembly-feature",
1969                                       sw_if_index, enable_disable, 0, 0);
1970 }
1971 #endif /* CLIB_MARCH_VARIANT */
1972
/* Errors specific to the reassembly handoff nodes. Unlike the expire-walk
 * node above (which uses the .api-declared ip6_error_counters), these are
 * still plain string counters local to the handoff nodes. */
#define foreach_ip6_full_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP6_FULL_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip6_full_reassembly_handoff_error
#undef _
    IP6_FULL_REASSEMBLY_HANDOFF_N_ERROR,
} ip6_full_reassembly_handoff_error_t;

/* Human-readable strings for the handoff error counters above. */
static char *ip6_full_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip6_full_reassembly_handoff_error
#undef _
};
1990
/* Per-packet trace record captured by the handoff nodes. */
typedef struct
{
  u32 next_worker_index; /* thread the buffer is being handed off to */
} ip6_full_reassembly_handoff_trace_t;
1995
1996 static u8 *
1997 format_ip6_full_reassembly_handoff_trace (u8 * s, va_list * args)
1998 {
1999   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
2000   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
2001   ip6_full_reassembly_handoff_trace_t *t =
2002     va_arg (*args, ip6_full_reassembly_handoff_trace_t *);
2003
2004   s =
2005     format (s, "ip6-full-reassembly-handoff: next-worker %d",
2006             t->next_worker_index);
2007
2008   return s;
2009 }
2010
/**
 * @brief Shared implementation of the four reassembly handoff nodes.
 *
 * Hands each buffer to the thread recorded in its reassembly opaque
 * (owner_thread_index) via the frame queue selected by @p type and
 * @p is_local; counts buffers dropped due to congestion.
 */
always_inline uword
ip6_full_reassembly_handoff_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                                    vlib_frame_t *frame,
                                    ip6_full_reass_node_type_t type,
                                    bool is_local)
{
  ip6_full_reass_main_t *rm = &ip6_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* select the frame queue matching the node flavor */
  switch (type)
    {
    case NORMAL:
      if (is_local)
        {
          fq_index = rm->fq_local_index;
        }
      else
        {
          fq_index = rm->fq_index;
        }
      break;
    case FEATURE:
      fq_index = rm->fq_feature_index;
      break;
    case CUSTOM:
      fq_index = rm->fq_custom_index;
      break;
    default:
      /* NOTE(review): in release builds ASSERT compiles out and fq_index
       * is left uninitialized on this path — unreachable by construction
       * (callers pass a fixed type), but worth confirming. */
      clib_warning ("Unexpected `type' (%d)!", type);
      ASSERT (0);
    }
  /* build the per-buffer destination-thread vector and trace as needed */
  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip6_full_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
                                         thread_indices, frame->n_vectors, 1);

  /* anything not enqueued was dropped due to congestion */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP6_FULL_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
2079
/* Handoff for fragments arriving through the normal (non-feature) path. */
VLIB_NODE_FN (ip6_full_reassembly_handoff_node) (vlib_main_t * vm,
                                                 vlib_node_runtime_t * node,
                                                 vlib_frame_t * frame)
{
  return ip6_full_reassembly_handoff_inline (vm, node, frame, NORMAL,
                                             false /* is_local */);
}
2087
/* Node registration for the normal-path handoff node. */
VLIB_REGISTER_NODE (ip6_full_reassembly_handoff_node) = {
  .name = "ip6-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip6_full_reassembly_handoff_error_strings),
  .error_strings = ip6_full_reassembly_handoff_error_strings,
  .format_trace = format_ip6_full_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
2101
/* Handoff for fragments on the ip6-local (host-stack) path; the inline's
 * last parameter is is_local — the previous comment said is_feature. */
VLIB_NODE_FN (ip6_local_full_reassembly_handoff_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip6_full_reassembly_handoff_inline (vm, node, frame, NORMAL,
                                             true /* is_local */);
}
2108
/* Node registration for the local-path handoff node. */
VLIB_REGISTER_NODE (ip6_local_full_reassembly_handoff_node) = {
  .name = "ip6-local-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip6_full_reassembly_handoff_error_strings),
  .error_strings = ip6_full_reassembly_handoff_error_strings,
  .format_trace = format_ip6_full_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
2122
/* Handoff for fragments entering via the ip6-unicast feature arc. */
VLIB_NODE_FN (ip6_full_reassembly_feature_handoff_node) (vlib_main_t * vm,
                               vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return ip6_full_reassembly_handoff_inline (vm, node, frame, FEATURE,
                                             false /* is_local */);
}
2129
/* Node registration for the feature-arc handoff node. */
VLIB_REGISTER_NODE (ip6_full_reassembly_feature_handoff_node) = {
  .name = "ip6-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip6_full_reassembly_handoff_error_strings),
  .error_strings = ip6_full_reassembly_handoff_error_strings,
  .format_trace = format_ip6_full_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
2143
/* Handoff for fragments fed in by custom applications. */
VLIB_NODE_FN (ip6_full_reassembly_custom_handoff_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip6_full_reassembly_handoff_inline (vm, node, frame, CUSTOM,
                                             false /* is_local */);
}
2150
/* Node registration for the custom-path handoff node. */
VLIB_REGISTER_NODE (ip6_full_reassembly_custom_handoff_node) = {
  .name = "ip6-full-reass-custom-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip6_full_reassembly_handoff_error_strings),
  .error_strings = ip6_full_reassembly_handoff_error_strings,
  .format_trace = format_ip6_full_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
2164
2165 #ifndef CLIB_MARCH_VARIANT
2166 int
2167 ip6_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
2168 {
2169   ip6_full_reass_main_t *rm = &ip6_full_reass_main;
2170   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
2171   if (is_enable)
2172     {
2173       if (!rm->feature_use_refcount_per_intf[sw_if_index])
2174         {
2175           ++rm->feature_use_refcount_per_intf[sw_if_index];
2176           return vnet_feature_enable_disable ("ip6-unicast",
2177                                               "ip6-full-reassembly-feature",
2178                                               sw_if_index, 1, 0, 0);
2179         }
2180       ++rm->feature_use_refcount_per_intf[sw_if_index];
2181     }
2182   else
2183     {
2184       --rm->feature_use_refcount_per_intf[sw_if_index];
2185       if (!rm->feature_use_refcount_per_intf[sw_if_index])
2186         return vnet_feature_enable_disable ("ip6-unicast",
2187                                             "ip6-full-reassembly-feature",
2188                                             sw_if_index, 0, 0, 0);
2189     }
2190   return -1;
2191 }
2192
2193 void
2194 ip6_local_full_reass_enable_disable (int enable)
2195 {
2196   if (enable)
2197     {
2198       if (!ip6_full_reass_main.is_local_reass_enabled)
2199         {
2200           ip6_full_reass_main.is_local_reass_enabled = 1;
2201           ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
2202                                  ip6_local_full_reass_node.index);
2203         }
2204     }
2205   else
2206     {
2207       if (ip6_full_reass_main.is_local_reass_enabled)
2208         {
2209           ip6_full_reass_main.is_local_reass_enabled = 0;
2210           ip6_unregister_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION);
2211         }
2212     }
2213 }
2214
2215 int
2216 ip6_local_full_reass_enabled ()
2217 {
2218   return ip6_full_reass_main.is_local_reass_enabled;
2219 }
2220
2221 #endif
2222
2223 /*
2224  * fd.io coding-style-patch-verification: ON
2225  *
2226  * Local Variables:
2227  * eval: (c-set-style "gnu")
2228  * End:
2229  */