/* ip: reassembly - increasing the nbuckets for reass
 * [vpp.git] / src / vnet / ip / reass / ip4_full_reass.c */
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Full Reassembly.
19  *
20  * This file contains the source code for IPv4 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/fifo.h>
27 #include <vppinfra/bihash_16_8.h>
28 #include <vnet/ip/reass/ip4_full_reass.h>
29 #include <stddef.h>
30
#define MSEC_PER_SEC 1000
/* how long to wait for missing fragments before a reassembly times out */
#define IP4_REASS_TIMEOUT_DEFAULT_MS              100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* cap on concurrently active reassembly contexts (checked per thread) */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* cap on number of fragments accepted into a single reassembly */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT   3
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

/* compile-time switch for the buffer-chain debug printer below */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* print the buffer index of bi and every chained buffer after it */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
/* internal return codes of the reassembly update/finalize path */
typedef enum
{
  IP4_REASS_RC_OK,
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  IP4_REASS_RC_INTERNAL_ERROR,
  IP4_REASS_RC_NO_BUF,
  IP4_REASS_RC_HANDOFF,
} ip4_full_reass_rc_t;

/* 16-byte bihash key identifying one reassembly flow */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;

/* 8-byte bihash value locating the reassembly context */
typedef union
{
  struct
  {
    // index into the owning thread's context pool
    u32 reass_index;
    // thread whose pool holds the context
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_full_reass_val_t;

/* convenience overlay of key + value onto one clib_bihash_kv_16_8_t */
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106 always_inline u32
107 ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108 {
109   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111 }
112
113 always_inline u16
114 ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115 {
116   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118     (vnb->ip.reass.fragment_first +
119      ip4_full_reass_buffer_get_data_offset (b)) + 1;
120 }
121
/* state of one in-progress reassembly */
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;

/* per-worker-thread reassembly state */
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_full_reass_t *pool;
  // number of contexts currently allocated from the pool
  u32 reass_n;
  // source of unique per-thread reassembly ids
  u32 id_counter;
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
160
/* global state of the IPv4 full reassembly feature */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout converted to seconds (compared against vlib_time_now)
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_local_index;
  u32 fq_feature_index;
  u32 fq_custom_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // whether local fragmented packets are reassembled or not
  int is_local_reass_enabled;
} ip4_full_reass_main_t;

extern ip4_full_reass_main_t ip4_full_reass_main;

#ifndef CLIB_MARCH_VARIANT
/* single definition of the global; march variants only see the extern */
ip4_full_reass_main_t ip4_full_reass_main;
#endif /* CLIB_MARCH_VARIANT */
200
/* next-node indices used by the reassembly graph nodes */
typedef enum
{
  IP4_FULL_REASS_NEXT_INPUT,
  IP4_FULL_REASS_NEXT_DROP,
  IP4_FULL_REASS_NEXT_HANDOFF,
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;

/* flavor of the node this code is instantiated for */
typedef enum
{
  NORMAL,
  FEATURE,
  CUSTOM
} ip4_full_reass_node_type_t;

/* what a single trace record describes */
typedef enum
{
  RANGE_NEW,
  RANGE_SHRINK,
  RANGE_DISCARD,
  RANGE_OVERLAP,
  FINALIZE,
  HANDOFF,
  PASSTHROUGH,
} ip4_full_reass_trace_operation_e;

/* snapshot of one range's per-buffer reassembly state for tracing */
typedef struct
{
  u16 range_first;
  u16 range_last;
  u32 range_bi;
  i32 data_offset;
  u32 data_len;
  u32 first_bi;
} ip4_full_reass_range_trace_t;

/* one packet-trace record emitted by ip4_full_reass_add_trace */
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  u32 reass_id;
  ip4_full_reass_range_trace_t trace_range;
  u32 size_diff;
  u32 op_id;
  u32 thread_id;
  u32 thread_id_to;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
  // true when the trace was captured on a thread other than the one
  // which originally traced the buffer (i.e. after a handoff)
  bool is_after_handoff;
  // copy of the packet's IPv4 header, printed for post-handoff traces
  ip4_header_t ip4_header;
} ip4_full_reass_trace_t;

extern vlib_node_registration_t ip4_full_reass_node;
extern vlib_node_registration_t ip4_full_reass_node_feature;
extern vlib_node_registration_t ip4_full_reass_node_custom;
256
257 static void
258 ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
259                               ip4_full_reass_range_trace_t * trace)
260 {
261   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
262   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
263   trace->range_first = vnb->ip.reass.range_first;
264   trace->range_last = vnb->ip.reass.range_last;
265   trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
266   trace->data_len = ip4_full_reass_buffer_get_data_len (b);
267   trace->range_bi = bi;
268 }
269
270 static u8 *
271 format_ip4_full_reass_range_trace (u8 * s, va_list * args)
272 {
273   ip4_full_reass_range_trace_t *trace =
274     va_arg (*args, ip4_full_reass_range_trace_t *);
275   s =
276     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
277             trace->range_last, trace->data_offset, trace->data_len,
278             trace->range_bi);
279   return s;
280 }
281
/**
 * Format one reassembly trace record for the packet tracer.
 *
 * For records tied to a reassembly (reass_id != ~0) the reassembly/op ids
 * and aggregate range data are printed first; traces captured after a
 * worker handoff additionally print the stored IPv4 header.  The suffix
 * depends on the recorded action.
 */
static u8 *
format_ip4_full_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
  u32 indent = 0;
  if (~0 != t->reass_id)
    {
      if (t->is_after_handoff)
        {
          s =
            format (s, "%U\n", format_ip4_header, &t->ip4_header,
                    sizeof (t->ip4_header));
          indent = 2;
        }
      s =
        format (s, "%Ureass id: %u, op id: %u, ", format_white_space, indent,
                t->reass_id, t->op_id);
      // align the action line with the text after the id prefix
      indent = format_get_indent (s);
      s =
        format (s,
                "first bi: %u, data len: %u, ip/fragment[%u, %u]",
                t->trace_range.first_bi, t->total_data_len, t->fragment_first,
                t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range,
                  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
        format (s, "handoff from thread #%u to thread #%u", t->thread_id,
                t->thread_id_to);
      break;
    case PASSTHROUGH:
      s = format (s, "passthrough - not a fragment");
      break;
    }
  return s;
}
341
/**
 * Record a trace entry for buffer bi describing the given action.
 *
 * Bails out (and clears the traced flag) when the buffer's trace record
 * has already been recycled from the trace pool.  For buffers traced on a
 * different thread (post-handoff) a copy of the IPv4 header is stored so
 * the formatter can identify the packet.  Increments the reassembly's
 * trace op counter when a context is supplied.
 */
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_full_reass_t * reass, u32 bi,
                          ip4_full_reass_trace_operation_e action,
                          u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  if (pool_is_free_index
      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  bool is_after_handoff = false;
  // a trace thread differing from the current thread means the buffer
  // was handed off since it was first traced
  if (vlib_buffer_get_trace_thread (b) != vm->thread_index)
    {
      is_after_handoff = true;
    }
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->is_after_handoff = is_after_handoff;
  if (t->is_after_handoff)
    {
      // copy at most the bytes actually present in the buffer
      clib_memcpy (&t->ip4_header, vlib_buffer_get_current (b),
                   clib_min (sizeof (t->ip4_header), b->current_length));
    }
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      ++reass->trace_op_counter;
    }
  else
    {
      // no context (e.g. passthrough) - mark the record accordingly
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
399
400 always_inline void
401 ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
402                          ip4_full_reass_t * reass)
403 {
404   pool_put (rt->pool, reass);
405   --rt->reass_n;
406 }
407
408 always_inline void
409 ip4_full_reass_free (ip4_full_reass_main_t * rm,
410                      ip4_full_reass_per_thread_t * rt,
411                      ip4_full_reass_t * reass)
412 {
413   clib_bihash_kv_16_8_t kv;
414   kv.key[0] = reass->key.as_u64[0];
415   kv.key[1] = reass->key.as_u64[1];
416   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
417   return ip4_full_reass_free_ctx (rt, reass);
418 }
419
420 /* n_left_to_next, and to_next are taken as input params, as this function
421  * could be called from a graphnode, where its managing local copy of these
422  * variables, and ignoring those and still trying to enqueue the buffers
423  * with local variables would cause either buffer leak or corruption */
424 always_inline void
425 ip4_full_reass_drop_all (vlib_main_t *vm, vlib_node_runtime_t *node,
426                          ip4_full_reass_t *reass, u32 *n_left_to_next,
427                          u32 **to_next)
428 {
429   u32 range_bi = reass->first_bi;
430   vlib_buffer_t *range_b;
431   vnet_buffer_opaque_t *range_vnb;
432   u32 *to_free = NULL;
433
434   while (~0 != range_bi)
435     {
436       range_b = vlib_get_buffer (vm, range_bi);
437       range_vnb = vnet_buffer (range_b);
438
439       if (~0 != range_bi)
440         {
441           vec_add1 (to_free, range_bi);
442         }
443
444       range_bi = range_vnb->ip.reass.next_range_bi;
445     }
446
447   /* send to next_error_index */
448   if (~0 != reass->error_next_index &&
449       reass->error_next_index < node->n_next_nodes)
450     {
451       u32 next_index;
452
453       next_index = reass->error_next_index;
454       u32 bi = ~0;
455
456       /* record number of packets sent to custom app */
457       vlib_node_increment_counter (vm, node->node_index,
458                                    IP4_ERROR_REASS_TO_CUSTOM_APP,
459                                    vec_len (to_free));
460
461       while (vec_len (to_free) > 0)
462         {
463           vlib_get_next_frame (vm, node, next_index, *to_next,
464                                (*n_left_to_next));
465
466           while (vec_len (to_free) > 0 && (*n_left_to_next) > 0)
467             {
468               bi = vec_pop (to_free);
469
470               if (~0 != bi)
471                 {
472                   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
473                   if ((b->flags & VLIB_BUFFER_IS_TRACED))
474                     {
475                       ip4_full_reass_add_trace (vm, node, reass, bi,
476                                                 RANGE_DISCARD, 0, ~0);
477                     }
478                   *to_next[0] = bi;
479                   (*to_next) += 1;
480                   (*n_left_to_next) -= 1;
481                 }
482             }
483           vlib_put_next_frame (vm, node, next_index, (*n_left_to_next));
484         }
485     }
486   else
487     {
488       vlib_buffer_free (vm, to_free, vec_len (to_free));
489     }
490   vec_free (to_free);
491 }
492
/* Walk the reassembly's range chain and repair buffer linkage: for any
 * buffer whose vlib chain runs into the next range's head, cut the chain
 * there (clear NEXT_PRESENT) so each range owns only its own buffers.
 * If the fragment in *bi0 is not already part of the chain, link it in
 * at the head; *bi0 is set to ~0 once the buffer is accounted for. */
always_inline void
sanitize_reass_buffers_add_missing (vlib_main_t *vm, ip4_full_reass_t *reass,
                                    u32 *bi0)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;

  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      // always true here - the loop condition already checked range_bi
      if (~0 != bi)
        {
          // the incoming buffer is already part of the chain
          if (bi == *bi0)
            *bi0 = ~0;
          if (range_b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              u32 _bi = bi;
              vlib_buffer_t *_b = vlib_get_buffer (vm, _bi);
              // follow the vlib chain; cut it where it meets the next range
              while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  if (_b->next_buffer != range_vnb->ip.reass.next_range_bi)
                    {
                      _bi = _b->next_buffer;
                      _b = vlib_get_buffer (vm, _bi);
                    }
                  else
                    {
                      _b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                      break;
                    }
                }
            }
          range_bi = range_vnb->ip.reass.next_range_bi;
        }
    }
  // *bi0 was not found in the chain - link it in at the head
  if (*bi0 != ~0)
    {
      vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
      vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
      if (~0 != reass->first_bi)
        {
          fvnb->ip.reass.next_range_bi = reass->first_bi;
          reass->first_bi = *bi0;
        }
      else
        {
          reass->first_bi = *bi0;
          fvnb->ip.reass.next_range_bi = ~0;
        }
      *bi0 = ~0;
    }
}
548
549 always_inline void
550 ip4_full_reass_init (ip4_full_reass_t * reass)
551 {
552   reass->first_bi = ~0;
553   reass->last_packet_octet = ~0;
554   reass->data_len = 0;
555   reass->next_index = ~0;
556   reass->error_next_index = ~0;
557 }
558
/* Look up the reassembly context for *kv, or create a fresh one.
 *
 * Sets *do_handoff and returns NULL when the context is owned by another
 * thread.  A context idle longer than the configured timeout is dropped
 * and a new one created.  Returns NULL (without handoff) when the
 * per-thread context limit is reached.  If the bihash insert loses a race
 * with another worker (-2), the lookup is retried from the top. */
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t *vm, vlib_node_runtime_t *node,
                               ip4_full_reass_main_t *rm,
                               ip4_full_reass_per_thread_t *rt,
                               ip4_full_reass_kv_t *kv, u8 *do_handoff,
                               u32 *n_left_to_next, u32 **to_next)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      // existing entry - but the context may live on another thread
      if (vm->thread_index != kv->v.memory_owner_thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);

      // expire a stale context and fall through to create a new one
      if (now > reass->last_heard + rm->timeout)
        {
          vlib_node_increment_counter (vm, node->node_index,
                                       IP4_ERROR_REASS_TIMEOUT, 1);
          ip4_full_reass_drop_all (vm, node, reass, n_left_to_next, to_next);
          ip4_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  // refuse to create more contexts than configured for this thread
  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      // id unique across threads: thread index in the high decimal digits
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  // is_add == 2: add, but fail with -2 if the key already exists
  int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
635
/* Glue all collected fragment ranges into one buffer chain, trimming the
 * per-fragment IPv4 headers and any overlapping octets, then rewrite the
 * first buffer's IPv4 header (fragmentation fields cleared, total length
 * and checksum recomputed) and linearize the chain.  On success, stores
 * the result in *bi0/*next0/*error0 and frees the reassembly context. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  // walk each range ("sub chain") in order and splice its payload in
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      // drop the per-fragment IPv4 header plus any already-covered octets
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // walk the vlib chain of this range, trimming and relinking buffers
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // append this buffer to the reassembled chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // last buffer of this range - truncate to the kept octets
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              // nothing left to keep - free the remainder of the chain
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  // total_length_not_including_first_buffer, as the name says
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the header: no longer a fragment, new length + checksum
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, reass, reass->first_bi, FINALIZE, 0,
                                ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      // custom apps supply their own next index
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;

  /* Keep track of number of successfully reassembled packets and number of
   * fragments reassembled */
  vlib_node_increment_counter (vm, node->node_index, IP4_ERROR_REASS_SUCCESS,
                               1);

  vlib_node_increment_counter (vm, node->node_index,
                               IP4_ERROR_REASS_FRAGMENTS_REASSEMBLED,
                               reass->fragments_n);

  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
849
/* Link the range headed by new_next_bi into the reassembly chain after
 * prev_range_bi (or at the head when prev_range_bi is ~0) and add its
 * payload length to reass->data_len. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
                                      ip4_full_reass_t * reass,
                                      u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      // splice in after the given predecessor
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      // no predecessor - the new range becomes the chain head
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  // sanity-check the range bounds before accounting the data
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
881
/* Unlink the range headed by discard_bi from the reassembly chain,
 * subtract its payload from reass->data_len, and free every buffer of
 * that range (tracing each discard when the buffer is traced). */
always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
                                        ip4_full_reass_t * reass,
                                        u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      // predecessor must actually point at the range being removed
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      // removing the head range
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  // sanity-check the range bounds before un-accounting the data
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  // free the whole vlib chain of the discarded range, buffer by buffer
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, reass, discard_bi, RANGE_DISCARD,
                                    0, ~0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
936
/**
 * @brief Add one fragment to a reassembly context.
 *
 * Inserts fragment buffer *bi0 into the reassembly's sorted chain of byte
 * ranges, trimming or discarding ranges that overlap it.  When the fragment
 * completes the datagram, the reassembly is finalized and *bi0 is replaced
 * by the reassembled buffer chain head.
 *
 * @param bi0 in/out - fragment buffer index; set to ~0 when the fragment is
 *        consumed into the chain, replaced on successful finalize
 * @param next0, error0 out - disposition for *bi0 when it is enqueued
 * @param is_custom true for the custom node; (error_)next_index then come
 *        from the buffer opaque and are saved on the reassembly
 * @param handoff_thread_idx out - target thread when RC_HANDOFF is returned
 * @return IP4_REASS_RC_OK, RC_HANDOFF, RC_TOO_MANY_FRAGMENTS, RC_NO_BUF or
 *         RC_INTERNAL_ERROR
 */
always_inline ip4_full_reass_rc_t
ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                       ip4_full_reass_main_t * rm,
                       ip4_full_reass_per_thread_t * rt,
                       ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
                       u32 * error0, bool is_custom, u32 * handoff_thread_idx)
{
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  if (is_custom)
    {
      // store (error_)next_index before it's overwritten
      reass->next_index = fvnb->ip.reass.next_index;
      reass->error_next_index = fvnb->ip.reass.error_next_index;
    }
  ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  // a fresh fragment initially covers exactly its own bytes
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      // last fragment - the total datagram length is now known
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
        ip4_full_reass_insert_range_in_chain (vm, reass, prev_range_bi, *bi0);
      if (IP4_REASS_RC_OK != rc)
        {
          return rc;
        }
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, reass, *bi0, RANGE_NEW, 0, ~0);
        }
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      reass->fragments_n = 1;
      return IP4_REASS_RC_OK;
    }
  reass->min_fragment_length =
    clib_min (clib_net_to_host_u16 (fip->length),
              fvnb->ip.reass.estimated_mtu);
  // walk the sorted range chain to find where this fragment belongs
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
        {
          // this fragment starts after the candidate range
          prev_range_bi = candidate_range_bi;
          candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
          if (candidate_vnb->ip.reass.range_last < fragment_last &&
              ~0 == candidate_range_bi)
            {
              // special case - this fragment falls beyond all known ranges
              rc = ip4_full_reass_insert_range_in_chain (vm, reass,
                                                         prev_range_bi, *bi0);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              consumed = 1;
              break;
            }
          continue;
        }
      if (fragment_last < candidate_vnb->ip.reass.range_first)
        {
          // this fragment ends before candidate range without any overlap
          rc = ip4_full_reass_insert_range_in_chain (vm, reass, prev_range_bi,
                                                     *bi0);
          if (IP4_REASS_RC_OK != rc)
            {
              return rc;
            }
          consumed = 1;
        }
      else
        {
          // some overlap with the candidate range exists
          if (fragment_first >= candidate_vnb->ip.reass.range_first &&
              fragment_last <= candidate_vnb->ip.reass.range_last)
            {
              // this fragment is a (sub)part of existing range, ignore it
              if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_full_reass_add_trace (vm, node, reass, *bi0,
                                            RANGE_OVERLAP, 0, ~0);
                }
              break;
            }
          int discard_candidate = 0;
          if (fragment_first < candidate_vnb->ip.reass.range_first)
            {
              // partial overlap on the candidate's left edge
              u32 overlap =
                fragment_last - candidate_vnb->ip.reass.range_first + 1;
              if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
                {
                  // shrink the candidate from the left, then insert ours
                  candidate_vnb->ip.reass.range_first += overlap;
                  if (reass->data_len < overlap)
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  reass->data_len -= overlap;
                  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (vm, node, reass,
                                                candidate_range_bi,
                                                RANGE_SHRINK, 0, ~0);
                    }
                  rc = ip4_full_reass_insert_range_in_chain (
                    vm, reass, prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
              else
                {
                  // fragment fully covers the candidate - drop the candidate
                  discard_candidate = 1;
                }
            }
          else if (fragment_last > candidate_vnb->ip.reass.range_last)
            {
              // partial overlap on the candidate's right edge
              u32 overlap =
                candidate_vnb->ip.reass.range_last - fragment_first + 1;
              if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
                {
                  // shrink *our* range from the left and keep probing
                  fvnb->ip.reass.range_first += overlap;
                  if (~0 != candidate_vnb->ip.reass.next_range_bi)
                    {
                      prev_range_bi = candidate_range_bi;
                      candidate_range_bi =
                        candidate_vnb->ip.reass.next_range_bi;
                      continue;
                    }
                  else
                    {
                      // special case - last range discarded
                      rc = ip4_full_reass_insert_range_in_chain (
                        vm, reass, candidate_range_bi, *bi0);
                      if (IP4_REASS_RC_OK != rc)
                        {
                          return rc;
                        }
                      consumed = 1;
                    }
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else
            {
              // candidate is fully contained in this fragment
              discard_candidate = 1;
            }
          if (discard_candidate)
            {
              u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
              // discard candidate range, probe next range
              rc = ip4_full_reass_remove_range_from_chain (
                vm, node, reass, prev_range_bi, candidate_range_bi);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              if (~0 != next_range_bi)
                {
                  candidate_range_bi = next_range_bi;
                  continue;
                }
              else
                {
                  // special case - last range discarded
                  rc = ip4_full_reass_insert_range_in_chain (
                    vm, reass, prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
            }
        }
      break;
    }
  ++reass->fragments_n;
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, reass, *bi0, RANGE_NEW, 0, ~0);
        }
    }
  // complete when the last fragment was seen and all bytes are accounted for
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      *handoff_thread_idx = reass->sendout_thread_index;
      int handoff =
        reass->memory_owner_thread_index != reass->sendout_thread_index;
      rc =
        ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
                                 is_custom);
      if (IP4_REASS_RC_OK == rc && handoff)
        {
          rc = IP4_REASS_RC_HANDOFF;
        }
    }
  else
    {
      if (consumed)
        {
          *bi0 = ~0;
          if (reass->fragments_n > rm->max_reass_len)
            {
              rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
            }
        }
      else
        {
          // not consumed and not finalized => duplicate fragment, drop it
          *next0 = IP4_FULL_REASS_NEXT_DROP;
          *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
        }
    }
  return rc;
}
1177
/**
 * @brief Common per-frame worker for all full-reassembly graph nodes.
 *
 * Passes non-fragmented packets straight through, hands fragments off to the
 * owning thread when needed, and otherwise feeds them to
 * ip4_full_reass_update() under the per-thread spinlock.
 *
 * @param type NORMAL / FEATURE / CUSTOM node flavor (selects next-node logic)
 * @param is_local true for the ip4-local variant; fragments are dropped when
 *        local reassembly is disabled
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_full_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                       vlib_frame_t *frame, ip4_full_reass_node_type_t type,
                       bool is_local)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  // the whole frame is processed under this thread's reassembly lock
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (CUSTOM != type)
                {
                  next0 = IP4_FULL_REASS_NEXT_INPUT;
                }
              else
                {
                  // custom node: caller supplied next index in the opaque
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              ip4_full_reass_add_trace (vm, node, NULL, bi0, PASSTHROUGH, 0,
                                        ~0);
              goto packet_enqueue;
            }

          if (is_local && !rm->is_local_reass_enabled)
            {
              // local reassembly administratively disabled - drop fragments
              next0 = IP4_FULL_REASS_NEXT_DROP;
              goto packet_enqueue;
            }

          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;

          /* Keep track of received fragments */
          vlib_node_increment_counter (vm, node->node_index,
                                       IP4_ERROR_REASS_FRAGMENTS_RCVD, 1);

          // reject fragments whose offset/length cannot form a valid datagram
          if (fragment_first > fragment_last ||
              fragment_first + fragment_length > UINT16_MAX - 20 ||
              (fragment_length < 8 && // 8 is minimum frag length per RFC 791
               ip4_get_fragment_more (ip0)))
            {
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              goto packet_enqueue;
            }
          ip4_full_reass_kv_t kv;
          u8 do_handoff = 0;

          // hash key: fib index + src address / dst address + frag id + proto
          kv.k.as_u64[0] =
            (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                           vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
            (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] =
            (u64) ip0->dst_address.
            as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

          ip4_full_reass_t *reass = ip4_full_reass_find_or_create (
            vm, node, rm, rt, &kv, &do_handoff, &n_left_to_next, &to_next);

          if (reass)
            {
              // first fragment decides which thread sends the result out
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              if (0 == fragment_first)
                {
                  reass->sendout_thread_index = vm->thread_index;
                }
            }

          if (PREDICT_FALSE (do_handoff))
            {
              // reassembly owned by another thread - hand the buffer over
              next0 = IP4_FULL_REASS_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.memory_owner_thread_index;
            }
          else if (reass)
            {
              u32 handoff_thread_idx;
              u32 counter = ~0; // error counter to bump; ~0 == no error
              switch (ip4_full_reass_update
                      (vm, node, rm, rt, reass, &bi0, &next0,
                       &error0, CUSTOM == type, &handoff_thread_idx))
                {
                case IP4_REASS_RC_OK:
                  /* nothing to do here */
                  break;
                case IP4_REASS_RC_HANDOFF:
                  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_buffer (b0)->ip.reass.owner_thread_index =
                    handoff_thread_idx;
                  break;
                case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                  counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
                  break;
                case IP4_REASS_RC_NO_BUF:
                  counter = IP4_ERROR_REASS_NO_BUF;
                  break;
                case IP4_REASS_RC_INTERNAL_ERROR:
                  counter = IP4_ERROR_REASS_INTERNAL_ERROR;
                  /* Sanitization is needed in internal error cases only, as
                   * the incoming packet is already dropped in other cases,
                   * also adding bi0 back to the reassembly list, fixes the
                   * leaking of buffers during internal errors.
                   *
                   * Also it doesnt make sense to send these buffers custom
                   * app, these fragments are with internal errors */
                  sanitize_reass_buffers_add_missing (vm, reass, &bi0);
                  reass->error_next_index = ~0;
                  break;
                }

              if (~0 != counter)
                {
                  // error path: count it and tear the reassembly down
                  vlib_node_increment_counter (vm, node->node_index, counter,
                                               1);
                  ip4_full_reass_drop_all (vm, node, reass, &n_left_to_next,
                                           &to_next);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                }
            }
          else
            {
              // no reassembly context could be allocated
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
            }


        packet_enqueue:

          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;

              /* bi0 might have been updated by reass_finalize, reload */
              b0 = vlib_get_buffer (vm, bi0);
              if (IP4_ERROR_NONE != error0)
                {
                  b0->error = node->errors[error0];
                }

              if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
                {
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (
                        vm, node, NULL, bi0, HANDOFF, 0,
                        vnet_buffer (b0)->ip.reass.owner_thread_index);
                    }
                }
              else if (FEATURE == type && IP4_ERROR_NONE == error0)
                {
                  // continue along the feature arc
                  vnet_feature_next (&next0, b0);
                }

              /* Increment the counter to-custom-app also as this fragment is
               * also going to application */
              if (CUSTOM == type)
                {
                  vlib_node_increment_counter (
                    vm, node->node_index, IP4_ERROR_REASS_TO_CUSTOM_APP, 1);
                }

              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1383
/* Error counter strings for the reassembly nodes, expanded from the
 * shared foreach_ip4_error list. */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1389
1390 VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
1391                                     vlib_node_runtime_t * node,
1392                                     vlib_frame_t * frame)
1393 {
1394   return ip4_full_reass_inline (vm, node, frame, NORMAL, false /* is_local */);
1395 }
1396
/* Graph node registration for the plain ip4 full-reassembly node. */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
1412
1413 VLIB_NODE_FN (ip4_local_full_reass_node)
1414 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
1415 {
1416   return ip4_full_reass_inline (vm, node, frame, NORMAL, true /* is_local */);
1417 }
1418
/* Graph node registration for the ip4-local full-reassembly node. */
VLIB_REGISTER_NODE (ip4_local_full_reass_node) = {
    .name = "ip4-local-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-local-full-reassembly-handoff",

        },
};
1434
1435 VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
1436                                             vlib_node_runtime_t * node,
1437                                             vlib_frame_t * frame)
1438 {
1439   return ip4_full_reass_inline (vm, node, frame, FEATURE,
1440                                 false /* is_local */);
1441 }
1442
/* Graph node registration for the feature-arc full-reassembly node. */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
1457
/* Enable the reassembly feature node on the ip4-unicast arc, running
 * before forwarding lookup and IPsec input. */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
1465
1466 VLIB_NODE_FN (ip4_full_reass_node_custom) (vlib_main_t * vm,
1467                                            vlib_node_runtime_t * node,
1468                                            vlib_frame_t * frame)
1469 {
1470   return ip4_full_reass_inline (vm, node, frame, CUSTOM, false /* is_local */);
1471 }
1472
/* Graph node registration for the custom-next full-reassembly node. */
VLIB_REGISTER_NODE (ip4_full_reass_node_custom) = {
    .name = "ip4-full-reassembly-custom",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-custom-hoff",
        },
};
1487
/* NOTE(review): despite the "custom" name this registers the *feature*
 * node name ("ip4-full-reassembly-feature") again, identically to
 * ip4_full_reass_feature above - looks like a copy-paste duplicate;
 * confirm whether this registration is intentional. */
VNET_FEATURE_INIT (ip4_full_reass_custom, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
1495
1496
1497 #ifndef CLIB_MARCH_VARIANT
1498 uword
1499 ip4_full_reass_custom_register_next_node (uword node_index)
1500 {
1501   return vlib_node_add_next (vlib_get_main (),
1502                              ip4_full_reass_node_custom.index, node_index);
1503 }
1504
1505 always_inline u32
1506 ip4_full_reass_get_nbuckets ()
1507 {
1508   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1509   u32 nbuckets;
1510   u8 i;
1511
1512   /* need more mem with more workers */
1513   nbuckets = (u32) (rm->max_reass_n * (vlib_num_workers () + 1) /
1514                     IP4_REASS_HT_LOAD_FACTOR);
1515
1516   for (i = 0; i < 31; i++)
1517     if ((1 << i) >= nbuckets)
1518       break;
1519   nbuckets = 1 << i;
1520
1521   return nbuckets;
1522 }
1523 #endif /* CLIB_MARCH_VARIANT */
1524
/* Events delivered to the ip4-full-reassembly-expire-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1, // parameters changed via ip4_full_reass_set
} ip4_full_reass_event_t;
1529
/* Walk context used when migrating entries into a resized hash table
 * (see ip4_rehash_cb / ip4_full_reass_set). */
typedef struct
{
  int failure;                  /* set when an insert into new_hash fails */
  clib_bihash_16_8_t *new_hash; /* destination table being populated */
} ip4_rehash_cb_ctx;
1535
1536 #ifndef CLIB_MARCH_VARIANT
1537 static int
1538 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1539 {
1540   ip4_rehash_cb_ctx *ctx = _ctx;
1541   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1542     {
1543       ctx->failure = 1;
1544     }
1545   return (BIHASH_WALK_CONTINUE);
1546 }
1547
1548 static void
1549 ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1550                            u32 max_reassembly_length,
1551                            u32 expire_walk_interval_ms)
1552 {
1553   ip4_full_reass_main.timeout_ms = timeout_ms;
1554   ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1555   ip4_full_reass_main.max_reass_n = max_reassemblies;
1556   ip4_full_reass_main.max_reass_len = max_reassembly_length;
1557   ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1558 }
1559
/**
 * @brief API handler: update reassembly parameters at runtime.
 *
 * Applies the new parameters, wakes the expiry-walk process, and - when the
 * required bucket count grew - migrates all existing entries into a larger
 * hash table before swapping it in.
 *
 * @return 0 on success, -1 when rehashing into the larger table fails
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
                    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  // capture the bucket count before the change so growth can be detected
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
                             max_reassembly_length, expire_walk_interval_ms);
  // nudge the expiry process so it picks up the new walk interval
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
                             ip4_full_reass_main.ip4_full_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      // grow: build a fresh table, copy every entry, then swap it in
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          // migration failed - keep the old table, free the new one
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
          clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
                            sizeof (ip4_full_reass_main.hash));
          clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1597
1598 vnet_api_error_t
1599 ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1600                     u32 * max_reassembly_length,
1601                     u32 * expire_walk_interval_ms)
1602 {
1603   *timeout_ms = ip4_full_reass_main.timeout_ms;
1604   *max_reassemblies = ip4_full_reass_main.max_reass_n;
1605   *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1606   *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1607   return 0;
1608 }
1609
/**
 * @brief Plugin init: set defaults, allocate per-thread state, create the
 * hash table and the handoff frame queues.
 *
 * @return NULL on success (no failure paths set the error here)
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  // one entry per worker plus the main thread
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  // resolve the expiry process node so set() can signal it later
  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  // defaults must be in place before sizing the hash table below
  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                             IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                             IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                             IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  // handoff frame queues, one per node flavor
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_local_index =
    vlib_frame_queue_main_init (ip4_local_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);
  rm->fq_custom_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_custom.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  rm->is_local_reass_enabled = 1;

  return error;
}
1653
1654 VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1655 #endif /* CLIB_MARCH_VARIANT */
1656
/**
 * @brief Process node: periodically drop reassemblies that timed out.
 *
 * Wakes up every expire_walk_interval_ms (or on a config-changed event),
 * then for each thread collects reassemblies whose last_heard timestamp is
 * older than the timeout and frees them under that thread's lock.
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
                             CLIB_UNUSED (vlib_frame_t *f))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:
          /* no events => timeout */
          /* fallthrough */
        case IP4_EVENT_CONFIG_CHANGED:
          /* nothing to do here */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      /* NOTE(review): n_left_to_next/to_next are passed to drop_all
       * uninitialized - presumably drop_all only writes through them in
       * this context; confirm against its definition. */
      u32 n_left_to_next, *to_next;

      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_full_reass_per_thread_t *rt =
            &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          vec_reset_length (pool_indexes_to_free);

          // collect first, free after: don't mutate the pool mid-walk
          pool_foreach_index (index, rt->pool)
            {
              reass = pool_elt_at_index (rt->pool, index);
              if (now > reass->last_heard + rm->timeout)
                {
                  vec_add1 (pool_indexes_to_free, index);
                }
            }

          if (vec_len (pool_indexes_to_free))
            vlib_node_increment_counter (vm, node->node_index,
                                         IP4_ERROR_REASS_TIMEOUT,
                                         vec_len (pool_indexes_to_free));
          int *i;
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, reass, &n_left_to_next,
                                     &to_next);
            ip4_full_reass_free (rm, rt, reass);
          }

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          vec_set_len (event_data, 0);
        }
    }

  // never reached - process loops forever
  return 0;
}
1736
/* Registration of the expiry walker as a process-type node. */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
1746
1747 static u8 *
1748 format_ip4_full_reass_key (u8 * s, va_list * args)
1749 {
1750   ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1751   s =
1752     format (s,
1753             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1754             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1755             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1756   return s;
1757 }
1758
1759 static u8 *
1760 format_ip4_reass (u8 * s, va_list * args)
1761 {
1762   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1763   ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1764
1765   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1766               "last_packet_octet: %u, trace_op_counter: %u\n",
1767               reass->id, format_ip4_full_reass_key, &reass->key,
1768               reass->first_bi, reass->data_len,
1769               reass->last_packet_octet, reass->trace_op_counter);
1770
1771   u32 bi = reass->first_bi;
1772   u32 counter = 0;
1773   while (~0 != bi)
1774     {
1775       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1776       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1777       s =
1778         format (s,
1779                 "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1780                 "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1781                 vnb->ip.reass.range_last, bi,
1782                 ip4_full_reass_buffer_get_data_offset (b),
1783                 ip4_full_reass_buffer_get_data_len (b),
1784                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1785       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1786         {
1787           bi = b->next_buffer;
1788         }
1789       else
1790         {
1791           bi = ~0;
1792         }
1793     }
1794   return s;
1795 }
1796
1797 static clib_error_t *
1798 show_ip4_reass (vlib_main_t * vm,
1799                 unformat_input_t * input,
1800                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1801 {
1802   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1803
1804   vlib_cli_output (vm, "---------------------");
1805   vlib_cli_output (vm, "IP4 reassembly status");
1806   vlib_cli_output (vm, "---------------------");
1807   bool details = false;
1808   if (unformat (input, "details"))
1809     {
1810       details = true;
1811     }
1812
1813   u32 sum_reass_n = 0;
1814   ip4_full_reass_t *reass;
1815   uword thread_index;
1816   const uword nthreads = vlib_num_workers () + 1;
1817   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1818     {
1819       ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1820       clib_spinlock_lock (&rt->lock);
1821       if (details)
1822         {
1823           pool_foreach (reass, rt->pool) {
1824             vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
1825           }
1826         }
1827       sum_reass_n += rt->reass_n;
1828       clib_spinlock_unlock (&rt->lock);
1829     }
1830   vlib_cli_output (vm, "---------------------");
1831   vlib_cli_output (vm, "Current full IP4 reassemblies count: %lu\n",
1832                    (long unsigned) sum_reass_n);
1833   vlib_cli_output (vm,
1834                    "Maximum configured concurrent full IP4 reassemblies per worker-thread: %lu\n",
1835                    (long unsigned) rm->max_reass_n);
1836   vlib_cli_output (vm,
1837                    "Maximum configured amount of fragments "
1838                    "per full IP4 reassembly: %lu\n",
1839                    (long unsigned) rm->max_reass_len);
1840   vlib_cli_output (vm,
1841                    "Maximum configured full IP4 reassembly timeout: %lums\n",
1842                    (long unsigned) rm->timeout_ms);
1843   vlib_cli_output (vm,
1844                    "Maximum configured full IP4 reassembly expire walk interval: %lums\n",
1845                    (long unsigned) rm->expire_walk_interval_ms);
1846   return 0;
1847 }
1848
/* CLI command registration for the reassembly status dump above. */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
1854
1855 #ifndef CLIB_MARCH_VARIANT
1856 vnet_api_error_t
1857 ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1858 {
1859   return vnet_feature_enable_disable ("ip4-unicast",
1860                                       "ip4-full-reassembly-feature",
1861                                       sw_if_index, enable_disable, 0, 0);
1862 }
1863 #endif /* CLIB_MARCH_VARIANT */
1864
1865
/* Errors countable by the handoff nodes; the only possible error is a drop
 * caused by congestion of the destination thread's frame queue. */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


/* Error code enum generated from the foreach macro above. */
typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;

/* Human-readable counterparts of the error enum, in matching order. */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1883
/* Trace record captured by the handoff nodes for traced buffers. */
typedef struct
{
  u32 next_worker_index;	/* thread the buffer was handed off to */
} ip4_full_reass_handoff_trace_t;
1888
1889 static u8 *
1890 format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1891 {
1892   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1893   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1894   ip4_full_reass_handoff_trace_t *t =
1895     va_arg (*args, ip4_full_reass_handoff_trace_t *);
1896
1897   s =
1898     format (s, "ip4-full-reassembly-handoff: next-worker %d",
1899             t->next_worker_index);
1900
1901   return s;
1902 }
1903
1904 always_inline uword
1905 ip4_full_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
1906                                     vlib_frame_t *frame,
1907                                     ip4_full_reass_node_type_t type,
1908                                     bool is_local)
1909 {
1910   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1911
1912   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1913   u32 n_enq, n_left_from, *from;
1914   u16 thread_indices[VLIB_FRAME_SIZE], *ti;
1915   u32 fq_index;
1916
1917   from = vlib_frame_vector_args (frame);
1918   n_left_from = frame->n_vectors;
1919   vlib_get_buffers (vm, from, bufs, n_left_from);
1920
1921   b = bufs;
1922   ti = thread_indices;
1923
1924   switch (type)
1925     {
1926     case NORMAL:
1927       if (is_local)
1928         {
1929           fq_index = rm->fq_local_index;
1930         }
1931       else
1932         {
1933           fq_index = rm->fq_index;
1934         }
1935       break;
1936     case FEATURE:
1937       fq_index = rm->fq_feature_index;
1938       break;
1939     case CUSTOM:
1940       fq_index = rm->fq_custom_index;
1941       break;
1942     default:
1943       clib_warning ("Unexpected `type' (%d)!", type);
1944     }
1945
1946   while (n_left_from > 0)
1947     {
1948       ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
1949
1950       if (PREDICT_FALSE
1951           ((node->flags & VLIB_NODE_FLAG_TRACE)
1952            && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
1953         {
1954           ip4_full_reass_handoff_trace_t *t =
1955             vlib_add_trace (vm, node, b[0], sizeof (*t));
1956           t->next_worker_index = ti[0];
1957         }
1958
1959       n_left_from -= 1;
1960       ti += 1;
1961       b += 1;
1962     }
1963   n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
1964                                          thread_indices, frame->n_vectors, 1);
1965
1966   if (n_enq < frame->n_vectors)
1967     vlib_node_increment_counter (vm, node->node_index,
1968                                  IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
1969                                  frame->n_vectors - n_enq);
1970   return frame->n_vectors;
1971 }
1972
/* Handoff node for the normal (non-local) reassembly path. */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, NORMAL,
                                             false /* is_local */);
}
1980
1981
/* Node registration for the normal-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
1995
/* Handoff node for the ip4-local reassembly path. */
VLIB_NODE_FN (ip4_local_full_reass_handoff_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, NORMAL,
                                             true /* is_local */);
}
2002
/* Node registration for the ip4-local-path handoff node. */
VLIB_REGISTER_NODE (ip4_local_full_reass_handoff_node) = {
  .name = "ip4-local-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
2016
/* Handoff node for the feature-arc reassembly path. */
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, FEATURE,
                                             false /* is_local */);
}
2025
/* Node registration for the feature-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
2039
/* Handoff node for the custom (application-driven) reassembly path. */
VLIB_NODE_FN (ip4_full_reass_custom_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, CUSTOM,
                                             false /* is_local */);
}
2048
/* Node registration for the custom-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_custom_handoff_node) = {
  .name = "ip4-full-reass-custom-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
2062
2063 #ifndef CLIB_MARCH_VARIANT
2064 int
2065 ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
2066 {
2067   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
2068   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
2069   if (is_enable)
2070     {
2071       if (!rm->feature_use_refcount_per_intf[sw_if_index])
2072         {
2073           ++rm->feature_use_refcount_per_intf[sw_if_index];
2074           return vnet_feature_enable_disable ("ip4-unicast",
2075                                               "ip4-full-reassembly-feature",
2076                                               sw_if_index, 1, 0, 0);
2077         }
2078       ++rm->feature_use_refcount_per_intf[sw_if_index];
2079     }
2080   else
2081     {
2082       --rm->feature_use_refcount_per_intf[sw_if_index];
2083       if (!rm->feature_use_refcount_per_intf[sw_if_index])
2084         return vnet_feature_enable_disable ("ip4-unicast",
2085                                             "ip4-full-reassembly-feature",
2086                                             sw_if_index, 0, 0, 0);
2087     }
2088   return -1;
2089 }
2090
2091 void
2092 ip4_local_full_reass_enable_disable (int enable)
2093 {
2094   if (enable)
2095     {
2096       ip4_full_reass_main.is_local_reass_enabled = 1;
2097     }
2098   else
2099     {
2100       ip4_full_reass_main.is_local_reass_enabled = 0;
2101     }
2102 }
2103
2104 int
2105 ip4_local_full_reass_enabled ()
2106 {
2107   return ip4_full_reass_main.is_local_reass_enabled;
2108 }
2109
2110 #endif
2111
2112 /*
2113  * fd.io coding-style-patch-verification: ON
2114  *
2115  * Local Variables:
2116  * eval: (c-set-style "gnu")
2117  * End:
2118  */