ip: Use .api declared error counters
src/vnet/ip/reass/ip4_full_reass.c
/*
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @file
 * @brief IPv4 Full Reassembly.
 *
 * This file contains the source code for IPv4 full reassembly.
 */

#include <vppinfra/vec.h>
#include <vnet/vnet.h>
#include <vnet/ip/ip.h>
#include <vnet/ip/ip.api_enum.h>
#include <vppinfra/fifo.h>
#include <vppinfra/bihash_16_8.h>
#include <vnet/ip/reass/ip4_full_reass.h>
#include <stddef.h>

#define MSEC_PER_SEC 1000
#define IP4_REASS_TIMEOUT_DEFAULT_MS 200

/* As there are only 1024 reassembly contexts per thread, either DDoS attacks
 * or a fraction of real-world timeouts would quickly consume these contexts,
 * exhausting the context space and leaving us unable to perform reassembly */
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 50 // 50 ms default
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT   3
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif

typedef enum
{
  IP4_REASS_RC_OK,
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  IP4_REASS_RC_INTERNAL_ERROR,
  IP4_REASS_RC_NO_BUF,
  IP4_REASS_RC_HANDOFF,
} ip4_full_reass_rc_t;

typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;

typedef union
{
  struct
  {
    u32 reass_index;
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_full_reass_val_t;

typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
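
/* The key is filled from the fragment's headers in ip4_full_reass_inline():
 * xx_id carries the RX fib index, so on little-endian hosts
 *   k.as_u64[0] = fib_index | (u64) src_address << 32
 *   k.as_u64[1] = dst_address | (u64) fragment_id << 32 | (u64) protocol << 48
 * matches the anonymous struct layout above. */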

always_inline u32
ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
{
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
}

always_inline u16
ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
{
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
    (vnb->ip.reass.fragment_first +
     ip4_full_reass_buffer_get_data_offset (b)) + 1;
}
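
/* fragment_first/fragment_last describe where this fragment's payload sits
 * within the original packet; range_first/range_last describe the sub-range
 * of those bytes still contributing to the reassembly after overlap
 * trimming. The accessors above derive from these four values the number of
 * payload bytes to skip at the front and the number of bytes to keep. */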

typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;

typedef struct
{
  ip4_full_reass_t *pool;
  u32 reass_n;
  u32 id_counter;
  // for pacing the main thread timeouts
  u32 last_id;
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;

typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_local_index;
  u32 fq_feature_index;
  u32 fq_custom_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // whether local fragmented packets are reassembled or not
  int is_local_reass_enabled;
} ip4_full_reass_main_t;

extern ip4_full_reass_main_t ip4_full_reass_main;

#ifndef CLIB_MARCH_VARIANT
ip4_full_reass_main_t ip4_full_reass_main;
#endif /* CLIB_MARCH_VARIANT */

typedef enum
{
  IP4_FULL_REASS_NEXT_INPUT,
  IP4_FULL_REASS_NEXT_DROP,
  IP4_FULL_REASS_NEXT_HANDOFF,
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;

typedef enum
{
  NORMAL,
  FEATURE,
  CUSTOM
} ip4_full_reass_node_type_t;

typedef enum
{
  RANGE_NEW,
  RANGE_SHRINK,
  RANGE_DISCARD,
  RANGE_OVERLAP,
  FINALIZE,
  HANDOFF,
  PASSTHROUGH,
} ip4_full_reass_trace_operation_e;

typedef struct
{
  u16 range_first;
  u16 range_last;
  u32 range_bi;
  i32 data_offset;
  u32 data_len;
  u32 first_bi;
} ip4_full_reass_range_trace_t;

typedef struct
{
  ip4_full_reass_trace_operation_e action;
  u32 reass_id;
  ip4_full_reass_range_trace_t trace_range;
  u32 size_diff;
  u32 op_id;
  u32 thread_id;
  u32 thread_id_to;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
  bool is_after_handoff;
  ip4_header_t ip4_header;
} ip4_full_reass_trace_t;

extern vlib_node_registration_t ip4_full_reass_node;
extern vlib_node_registration_t ip4_full_reass_node_feature;
extern vlib_node_registration_t ip4_full_reass_node_custom;

static void
ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
                              ip4_full_reass_range_trace_t * trace)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  trace->range_first = vnb->ip.reass.range_first;
  trace->range_last = vnb->ip.reass.range_last;
  trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
  trace->data_len = ip4_full_reass_buffer_get_data_len (b);
  trace->range_bi = bi;
}

static u8 *
format_ip4_full_reass_range_trace (u8 * s, va_list * args)
{
  ip4_full_reass_range_trace_t *trace =
    va_arg (*args, ip4_full_reass_range_trace_t *);
  s =
    format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
            trace->range_last, trace->data_offset, trace->data_len,
            trace->range_bi);
  return s;
}

static u8 *
format_ip4_full_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
  u32 indent = 0;
  if (~0 != t->reass_id)
    {
      if (t->is_after_handoff)
        {
          s =
            format (s, "%U\n", format_ip4_header, &t->ip4_header,
                    sizeof (t->ip4_header));
          indent = 2;
        }
      s =
        format (s, "%Ureass id: %u, op id: %u, ", format_white_space, indent,
                t->reass_id, t->op_id);
      indent = format_get_indent (s);
      s =
        format (s,
                "first bi: %u, data len: %u, ip/fragment[%u, %u]",
                t->trace_range.first_bi, t->total_data_len, t->fragment_first,
                t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range,
                  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
        format (s, "handoff from thread #%u to thread #%u", t->thread_id,
                t->thread_id_to);
      break;
    case PASSTHROUGH:
      s = format (s, "passthrough - not a fragment");
      break;
    }
  return s;
}

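/* Add a trace record for the given buffer. If the buffer was handed off from
 * another thread, its original trace data lives on that thread, so the IP
 * header is copied into the trace record to keep the output self-contained. */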
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_full_reass_t * reass, u32 bi,
                          ip4_full_reass_trace_operation_e action,
                          u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  if (pool_is_free_index
      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  bool is_after_handoff = false;
  if (vlib_buffer_get_trace_thread (b) != vm->thread_index)
    {
      is_after_handoff = true;
    }
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->is_after_handoff = is_after_handoff;
  if (t->is_after_handoff)
    {
      clib_memcpy (&t->ip4_header, vlib_buffer_get_current (b),
                   clib_min (sizeof (t->ip4_header), b->current_length));
    }
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      ++reass->trace_op_counter;
    }
  else
    {
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}

always_inline void
ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass)
{
  pool_put (rt->pool, reass);
  --rt->reass_n;
}

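/* Remove the reassembly's key from the hash and return the context to the
 * owning thread's pool. The buffers held by the reassembly are not freed
 * here - callers first finalize the reassembly or drop all its buffers. */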
always_inline void
ip4_full_reass_free (ip4_full_reass_main_t * rm,
                     ip4_full_reass_per_thread_t * rt,
                     ip4_full_reass_t * reass)
{
  clib_bihash_kv_16_8_t kv;
  kv.key[0] = reass->key.as_u64[0];
  kv.key[1] = reass->key.as_u64[1];
  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
  return ip4_full_reass_free_ctx (rt, reass);
}

/* n_left_to_next and to_next are taken as input params because this function
 * may be called from a graph node which manages local copies of these
 * variables; ignoring those and enqueueing the buffers via fresh local
 * variables would cause either a buffer leak or corruption */
always_inline void
ip4_full_reass_drop_all (vlib_main_t *vm, vlib_node_runtime_t *node,
                         ip4_full_reass_t *reass, u32 *n_left_to_next,
                         u32 **to_next)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;

  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);

      if (~0 != range_bi)
        {
          vec_add1 (to_free, range_bi);
        }

      range_bi = range_vnb->ip.reass.next_range_bi;
    }

  /* send to error_next_index */
  if (~0 != reass->error_next_index &&
      reass->error_next_index < node->n_next_nodes)
    {
      u32 next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      /* record number of packets sent to custom app */
      vlib_node_increment_counter (vm, node->node_index,
                                   IP4_ERROR_REASS_TO_CUSTOM_APP,
                                   vec_len (to_free));

      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, *to_next,
                               (*n_left_to_next));

          while (vec_len (to_free) > 0 && (*n_left_to_next) > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
                  if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (vm, node, reass, bi,
                                                RANGE_DISCARD, 0, ~0);
                    }
                  *to_next[0] = bi;
                  (*to_next) += 1;
                  (*n_left_to_next) -= 1;
                }
            }
          vlib_put_next_frame (vm, node, next_index, (*n_left_to_next));
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
  vec_free (to_free);
}

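/* Walk the reassembly's range chain and sever any next_buffer link that
 * crosses into the following range, so each range owns exactly one buffer
 * chain. If bi0 is not already part of the chain, prepend it, ensuring a
 * subsequent ip4_full_reass_drop_all() frees every buffer exactly once. */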
always_inline void
sanitize_reass_buffers_add_missing (vlib_main_t *vm, ip4_full_reass_t *reass,
                                    u32 *bi0)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;

  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      if (~0 != bi)
        {
          if (bi == *bi0)
            *bi0 = ~0;
          if (range_b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              u32 _bi = bi;
              vlib_buffer_t *_b = vlib_get_buffer (vm, _bi);
              while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  if (_b->next_buffer != range_vnb->ip.reass.next_range_bi)
                    {
                      _bi = _b->next_buffer;
                      _b = vlib_get_buffer (vm, _bi);
                    }
                  else
                    {
                      _b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                      break;
                    }
                }
            }
          range_bi = range_vnb->ip.reass.next_range_bi;
        }
    }
  if (*bi0 != ~0)
    {
      vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
      vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
      if (~0 != reass->first_bi)
        {
          fvnb->ip.reass.next_range_bi = reass->first_bi;
          reass->first_bi = *bi0;
        }
      else
        {
          reass->first_bi = *bi0;
          fvnb->ip.reass.next_range_bi = ~0;
        }
      *bi0 = ~0;
    }
}

always_inline void
ip4_full_reass_init (ip4_full_reass_t * reass)
{
  reass->first_bi = ~0;
  reass->last_packet_octet = ~0;
  reass->data_len = 0;
  reass->next_index = ~0;
  reass->error_next_index = ~0;
}

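/* Look up the reassembly context for the given key, creating one if needed.
 * If the context is owned by another thread, *do_handoff is set instead.
 * A context which has timed out is dropped and recreated. Creation may race
 * with another worker - the bihash add with is_add=2 fails with -2 when the
 * key appeared in the meantime, in which case the lookup is retried. */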
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t *vm, vlib_node_runtime_t *node,
                               ip4_full_reass_main_t *rm,
                               ip4_full_reass_per_thread_t *rt,
                               ip4_full_reass_kv_t *kv, u8 *do_handoff,
                               u32 *n_left_to_next, u32 **to_next)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      if (vm->thread_index != kv->v.memory_owner_thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);

      if (now > reass->last_heard + rm->timeout)
        {
          vlib_node_increment_counter (vm, node->node_index,
                                       IP4_ERROR_REASS_TIMEOUT, 1);
          ip4_full_reass_drop_all (vm, node, reass, n_left_to_next, to_next);
          ip4_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if another worker already created a context, work with that copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}

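/* Glue the sorted ranges into a single buffer chain: trim each fragment's IP
 * header and any overlap-trimmed bytes, rewrite the first buffer's IP header
 * (fragment bits cleared, length and checksum recomputed) and linearize the
 * result before handing it downstream. */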
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, reass, reass->first_bi, FINALIZE, 0,
                                ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;

  /* Keep track of number of successfully reassembled packets and number of
   * fragments reassembled */
  vlib_node_increment_counter (vm, node->node_index, IP4_ERROR_REASS_SUCCESS,
                               1);

  vlib_node_increment_counter (vm, node->node_index,
                               IP4_ERROR_REASS_FRAGMENTS_REASSEMBLED,
                               reass->fragments_n);

  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}

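/* Link a new fragment into the reassembly's range chain, which is kept
 * sorted by fragment offset via the per-buffer next_range_bi field, and
 * account for its payload in reass->data_len. */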
always_inline ip4_full_reass_rc_t
ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
                                      ip4_full_reass_t * reass,
                                      u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}

always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
                                        ip4_full_reass_t * reass,
                                        u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, reass, discard_bi, RANGE_DISCARD,
                                    0, ~0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}

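/* Fit the incoming fragment into the reassembly's range chain, trimming or
 * discarding overlapping ranges as needed. Once data_len covers the last
 * packet octet, the reassembly is finalized; if the buffer memory owner
 * differs from the thread which saw offset 0, a handoff is requested. */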
always_inline ip4_full_reass_rc_t
ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                       ip4_full_reass_main_t * rm,
                       ip4_full_reass_per_thread_t * rt,
                       ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
                       u32 * error0, bool is_custom, u32 * handoff_thread_idx)
{
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  if (is_custom)
    {
      // store (error_)next_index before it's overwritten
      reass->next_index = fvnb->ip.reass.next_index;
      reass->error_next_index = fvnb->ip.reass.error_next_index;
    }
  ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
        ip4_full_reass_insert_range_in_chain (vm, reass, prev_range_bi, *bi0);
      if (IP4_REASS_RC_OK != rc)
        {
          return rc;
        }
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, reass, *bi0, RANGE_NEW, 0, ~0);
        }
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      reass->fragments_n = 1;
      return IP4_REASS_RC_OK;
    }
  reass->min_fragment_length =
    clib_min (clib_net_to_host_u16 (fip->length),
              fvnb->ip.reass.estimated_mtu);
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
        {
          // this fragment starts after the candidate range
          prev_range_bi = candidate_range_bi;
          candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
          if (candidate_vnb->ip.reass.range_last < fragment_last &&
              ~0 == candidate_range_bi)
            {
              // special case - this fragment falls beyond all known ranges
              rc = ip4_full_reass_insert_range_in_chain (vm, reass,
                                                         prev_range_bi, *bi0);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              consumed = 1;
              break;
            }
          continue;
        }
      if (fragment_last < candidate_vnb->ip.reass.range_first)
        {
          // this fragment ends before candidate range without any overlap
          rc = ip4_full_reass_insert_range_in_chain (vm, reass, prev_range_bi,
                                                     *bi0);
          if (IP4_REASS_RC_OK != rc)
            {
              return rc;
            }
          consumed = 1;
        }
      else
        {
          if (fragment_first >= candidate_vnb->ip.reass.range_first &&
              fragment_last <= candidate_vnb->ip.reass.range_last)
            {
              // this fragment is a (sub)part of existing range, ignore it
              if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_full_reass_add_trace (vm, node, reass, *bi0,
                                            RANGE_OVERLAP, 0, ~0);
                }
              break;
            }
          int discard_candidate = 0;
          if (fragment_first < candidate_vnb->ip.reass.range_first)
            {
              u32 overlap =
                fragment_last - candidate_vnb->ip.reass.range_first + 1;
              if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
                {
                  candidate_vnb->ip.reass.range_first += overlap;
                  if (reass->data_len < overlap)
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  reass->data_len -= overlap;
                  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (vm, node, reass,
                                                candidate_range_bi,
                                                RANGE_SHRINK, 0, ~0);
                    }
                  rc = ip4_full_reass_insert_range_in_chain (
                    vm, reass, prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else if (fragment_last > candidate_vnb->ip.reass.range_last)
            {
              u32 overlap =
                candidate_vnb->ip.reass.range_last - fragment_first + 1;
              if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
                {
                  fvnb->ip.reass.range_first += overlap;
                  if (~0 != candidate_vnb->ip.reass.next_range_bi)
                    {
                      prev_range_bi = candidate_range_bi;
                      candidate_range_bi =
                        candidate_vnb->ip.reass.next_range_bi;
                      continue;
                    }
                  else
                    {
                      // special case - last range discarded
                      rc = ip4_full_reass_insert_range_in_chain (
                        vm, reass, candidate_range_bi, *bi0);
                      if (IP4_REASS_RC_OK != rc)
                        {
                          return rc;
                        }
                      consumed = 1;
                    }
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else
            {
              discard_candidate = 1;
            }
          if (discard_candidate)
            {
              u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
              // discard candidate range, probe next range
              rc = ip4_full_reass_remove_range_from_chain (
                vm, node, reass, prev_range_bi, candidate_range_bi);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              if (~0 != next_range_bi)
                {
                  candidate_range_bi = next_range_bi;
                  continue;
                }
              else
                {
                  // special case - last range discarded
                  rc = ip4_full_reass_insert_range_in_chain (
                    vm, reass, prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
            }
        }
      break;
    }
  ++reass->fragments_n;
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, reass, *bi0, RANGE_NEW, 0, ~0);
        }
    }
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      *handoff_thread_idx = reass->sendout_thread_index;
      int handoff =
        reass->memory_owner_thread_index != reass->sendout_thread_index;
      rc =
        ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
                                 is_custom);
      if (IP4_REASS_RC_OK == rc && handoff)
        {
          rc = IP4_REASS_RC_HANDOFF;
        }
    }
  else
    {
      if (consumed)
        {
          *bi0 = ~0;
          if (reass->fragments_n > rm->max_reass_len)
            {
              rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
            }
        }
      else
        {
          *next0 = IP4_FULL_REASS_NEXT_DROP;
          *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
        }
    }
  return rc;
}

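/* Fast path shared by all node flavors: non-fragments pass straight through,
 * malformed fragments are dropped (a non-last fragment must carry at least 8
 * bytes of payload per RFC 791), and everything else is matched to a context
 * via the bihash and either handed off to the owning thread or merged via
 * ip4_full_reass_update(). */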
always_inline uword
ip4_full_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                       vlib_frame_t *frame, ip4_full_reass_node_type_t type,
                       bool is_local)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (CUSTOM != type)
                {
                  next0 = IP4_FULL_REASS_NEXT_INPUT;
                }
              else
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              ip4_full_reass_add_trace (vm, node, NULL, bi0, PASSTHROUGH, 0,
                                        ~0);
              goto packet_enqueue;
            }

          if (is_local && !rm->is_local_reass_enabled)
            {
              next0 = IP4_FULL_REASS_NEXT_DROP;
              goto packet_enqueue;
            }

          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;

          /* Keep track of received fragments */
          vlib_node_increment_counter (vm, node->node_index,
                                       IP4_ERROR_REASS_FRAGMENTS_RCVD, 1);

          if (fragment_first > fragment_last ||
              fragment_first + fragment_length > UINT16_MAX - 20 ||
              (fragment_length < 8 && // 8 is minimum frag length per RFC 791
               ip4_get_fragment_more (ip0)))
            {
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              goto packet_enqueue;
            }
          ip4_full_reass_kv_t kv;
          u8 do_handoff = 0;

          kv.k.as_u64[0] =
            (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                           vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
            (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] =
            (u64) ip0->dst_address.
            as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

          ip4_full_reass_t *reass = ip4_full_reass_find_or_create (
            vm, node, rm, rt, &kv, &do_handoff, &n_left_to_next, &to_next);

          if (reass)
            {
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              if (0 == fragment_first)
                {
                  reass->sendout_thread_index = vm->thread_index;
                }
            }

          if (PREDICT_FALSE (do_handoff))
            {
              next0 = IP4_FULL_REASS_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.memory_owner_thread_index;
            }
          else if (reass)
            {
              u32 handoff_thread_idx;
              u32 counter = ~0;
              switch (ip4_full_reass_update
                      (vm, node, rm, rt, reass, &bi0, &next0,
                       &error0, CUSTOM == type, &handoff_thread_idx))
                {
                case IP4_REASS_RC_OK:
                  /* nothing to do here */
                  break;
                case IP4_REASS_RC_HANDOFF:
                  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_buffer (b0)->ip.reass.owner_thread_index =
                    handoff_thread_idx;
                  break;
                case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                  counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
                  break;
                case IP4_REASS_RC_NO_BUF:
                  counter = IP4_ERROR_REASS_NO_BUF;
                  break;
                case IP4_REASS_RC_INTERNAL_ERROR:
                  counter = IP4_ERROR_REASS_INTERNAL_ERROR;
                  /* Sanitization is needed only in the internal error case,
                   * as the incoming packet is already dropped in the other
                   * cases; adding bi0 back to the reassembly list also fixes
                   * buffer leaks during internal errors.
                   *
                   * It also makes no sense to send these buffers to the
                   * custom app, as these fragments hit internal errors */
                  sanitize_reass_buffers_add_missing (vm, reass, &bi0);
                  reass->error_next_index = ~0;
                  break;
                }

              if (~0 != counter)
                {
                  vlib_node_increment_counter (vm, node->node_index, counter,
                                               1);
                  ip4_full_reass_drop_all (vm, node, reass, &n_left_to_next,
                                           &to_next);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                }
            }
          else
            {
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
            }


        packet_enqueue:

          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;

              /* bi0 might have been updated by reass_finalize, reload */
              b0 = vlib_get_buffer (vm, bi0);
              if (IP4_ERROR_NONE != error0)
                {
                  b0->error = node->errors[error0];
                }

              if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
                {
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (
                        vm, node, NULL, bi0, HANDOFF, 0,
                        vnet_buffer (b0)->ip.reass.owner_thread_index);
                    }
                }
              else if (FEATURE == type && IP4_ERROR_NONE == error0)
                {
                  vnet_feature_next (&next0, b0);
                }

              /* Also increment the to-custom-app counter, as this fragment
               * is going to the application as well */
              if (CUSTOM == type)
                {
                  vlib_node_increment_counter (
                    vm, node->node_index, IP4_ERROR_REASS_TO_CUSTOM_APP, 1);
                }

              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}

VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, NORMAL, false /* is_local */);
}

VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};

VLIB_NODE_FN (ip4_local_full_reass_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip4_full_reass_inline (vm, node, frame, NORMAL, true /* is_local */);
}

VLIB_REGISTER_NODE (ip4_local_full_reass_node) = {
    .name = "ip4-local-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-local-full-reassembly-handoff",

        },
};

VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, FEATURE,
                                false /* is_local */);
}

VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};

VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};

VLIB_NODE_FN (ip4_full_reass_node_custom) (vlib_main_t * vm,
                                           vlib_node_runtime_t * node,
                                           vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, CUSTOM, false /* is_local */);
}

VLIB_REGISTER_NODE (ip4_full_reass_node_custom) = {
    .name = "ip4-full-reassembly-custom",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-custom-hoff",
        },
};

#ifndef CLIB_MARCH_VARIANT
uword
ip4_full_reass_custom_register_next_node (uword node_index)
{
  return vlib_node_add_next (vlib_get_main (),
                             ip4_full_reass_node_custom.index, node_index);
}

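/* Size the hash so that it stays below IP4_REASS_HT_LOAD_FACTOR even if all
 * threads fill their pools, rounded up to a power of two. For example, with
 * the default 1024 contexts and 3 workers: 1024 * 4 / 0.75 = 5461, rounded
 * up to 8192 buckets. */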
always_inline u32
ip4_full_reass_get_nbuckets ()
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  u32 nbuckets;
  u8 i;

  /* need more mem with more workers */
  nbuckets = (u32) (rm->max_reass_n * (vlib_num_workers () + 1) /
                    IP4_REASS_HT_LOAD_FACTOR);

  for (i = 0; i < 31; i++)
    if ((1 << i) >= nbuckets)
      break;
  nbuckets = 1 << i;

  return nbuckets;
}
#endif /* CLIB_MARCH_VARIANT */

typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;

typedef struct
{
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;

#ifndef CLIB_MARCH_VARIANT
static int
ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
{
  ip4_rehash_cb_ctx *ctx = _ctx;
  if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
    {
      ctx->failure = 1;
    }
  return (BIHASH_WALK_CONTINUE);
}

static void
ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
                           u32 max_reassembly_length,
                           u32 expire_walk_interval_ms)
{
  ip4_full_reass_main.timeout_ms = timeout_ms;
  ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
  ip4_full_reass_main.max_reass_n = max_reassemblies;
  ip4_full_reass_main.max_reass_len = max_reassembly_length;
  ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
}

vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
                    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
                             max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
                             ip4_full_reass_main.ip4_full_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
          clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
                            sizeof (ip4_full_reass_main.hash));
          clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1598
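/*
 * Illustrative use (values are the defaults defined at the top of this
 * file), e.g. from a test harness or an API handler:
 *
 *   vnet_api_error_t rv = ip4_full_reass_set (200, 1024, 3, 50);
 *
 * i.e. timeout_ms = 200, max_reassemblies = 1024,
 * max_reassembly_length = 3 and expire_walk_interval_ms = 50; if the
 * required bucket count grew, the hash table is rehashed as above.
 */
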
vnet_api_error_t
ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
                    u32 * max_reassembly_length,
                    u32 * expire_walk_interval_ms)
{
  *timeout_ms = ip4_full_reass_main.timeout_ms;
  *max_reassemblies = ip4_full_reass_main.max_reass_n;
  *max_reassembly_length = ip4_full_reass_main.max_reass_len;
  *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
  return 0;
}

static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                             IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                             IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                             IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_local_index =
    vlib_frame_queue_main_init (ip4_local_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);
  rm->fq_custom_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_custom.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  rm->is_local_reass_enabled = 1;

  return error;
}

VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
#endif /* CLIB_MARCH_VARIANT */

static uword
ip4_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
                             CLIB_UNUSED (vlib_frame_t *f))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:
          /* no events => timeout */
          /* fallthrough */
        case IP4_EVENT_CONFIG_CHANGED:
          /* nothing to do here */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      u32 n_left_to_next, *to_next;

      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_full_reass_per_thread_t *rt =
            &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          vec_reset_length (pool_indexes_to_free);

          /* Pace the number of timeouts handled per thread, to avoid
           * barrier sync issues in real-world scenarios */

          u32 beg = rt->last_id;
          /* to ensure we walk each context at least once per second */
          u32 end =
            beg + (IP4_REASS_MAX_REASSEMBLIES_DEFAULT *
                     IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS / MSEC_PER_SEC +
                   1);
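          /* With the defaults: 1024 * 50 / 1000 + 1 = 52 entries per walk,
           * at one walk per 50 ms that is ~1040 entries per second - enough
           * to cover the whole 1024-entry pool at least once per second. */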
          if (end > vec_len (rt->pool))
            {
              end = vec_len (rt->pool);
              rt->last_id = 0;
            }
          else
            {
              rt->last_id = end;
            }

          pool_foreach_stepping_index (index, beg, end, rt->pool)
          {
            reass = pool_elt_at_index (rt->pool, index);
            if (now > reass->last_heard + rm->timeout)
              {
                vec_add1 (pool_indexes_to_free, index);
              }
          }

          if (vec_len (pool_indexes_to_free))
            vlib_node_increment_counter (vm, node->node_index,
                                         IP4_ERROR_REASS_TIMEOUT,
                                         vec_len (pool_indexes_to_free));
          int *i;
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, reass, &n_left_to_next,
                                     &to_next);
            ip4_full_reass_free (rm, rt, reass);
          }

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          vec_set_len (event_data, 0);
        }
    }

  return 0;
}

VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
  .function = ip4_full_reass_walk_expired,
  .type = VLIB_NODE_TYPE_PROCESS,
  .name = "ip4-full-reassembly-expire-walk",
  .format_trace = format_ip4_full_reass_trace,
  .n_errors = IP4_N_ERROR,
  .error_counters = ip4_error_counters,
};

static u8 *
format_ip4_full_reass_key (u8 * s, va_list * args)
{
  ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
  s =
    format (s,
            "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
            key->xx_id, format_ip4_address, &key->src, format_ip4_address,
            &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}

static u8 *
format_ip4_reass (u8 * s, va_list * args)
{
  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
  ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);

  s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
              "last_packet_octet: %u, trace_op_counter: %u\n",
              reass->id, format_ip4_full_reass_key, &reass->key,
              reass->first_bi, reass->data_len,
              reass->last_packet_octet, reass->trace_op_counter);

  u32 bi = reass->first_bi;
  u32 counter = 0;
  while (~0 != bi)
    {
      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
      vnet_buffer_opaque_t *vnb = vnet_buffer (b);
      s =
        format (s,
                "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
                "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
                vnb->ip.reass.range_last, bi,
                ip4_full_reass_buffer_get_data_offset (b),
                ip4_full_reass_buffer_get_data_len (b),
                vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
      /* advance the per-fragment index; it was previously never
       * incremented, so every line printed as #000 */
      counter++;
      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          bi = b->next_buffer;
        }
      else
        {
          bi = ~0;
        }
    }
  return s;
}

static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * cmd))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_full_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          pool_foreach (reass, rt->pool) {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          }
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current full IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent full IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured amount of fragments "
                   "per full IP4 reassembly: %lu\n",
                   (long unsigned) rm->max_reass_len);
  vlib_cli_output (vm,
                   "Maximum configured full IP4 reassembly timeout: %lums\n",
                   (long unsigned) rm->timeout_ms);
  vlib_cli_output (vm,
                   "Maximum configured full IP4 reassembly expire walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
  return 0;
}

VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};

#ifndef CLIB_MARCH_VARIANT
vnet_api_error_t
ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-full-reassembly-feature",
                                      sw_if_index, enable_disable, 0, 0);
}
#endif /* CLIB_MARCH_VARIANT */

#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")

typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;

static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};

typedef struct
{
  u32 next_worker_index;
} ip4_full_reass_handoff_trace_t;

static u8 *
format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_handoff_trace_t *t =
    va_arg (*args, ip4_full_reass_handoff_trace_t *);

  s =
    format (s, "ip4-full-reassembly-handoff: next-worker %d",
            t->next_worker_index);

  return s;
}

always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                                    vlib_frame_t *frame,
                                    ip4_full_reass_node_type_t type,
                                    bool is_local)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  switch (type)
    {
    case NORMAL:
      if (is_local)
        {
          fq_index = rm->fq_local_index;
        }
      else
        {
          fq_index = rm->fq_index;
        }
      break;
    case FEATURE:
      fq_index = rm->fq_feature_index;
      break;
    case CUSTOM:
      fq_index = rm->fq_custom_index;
      break;
    default:
      clib_warning ("Unexpected `type' (%d)!", type);
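      /* note: fq_index is left unassigned here; all call sites pass a
       * compile-time constant NORMAL, FEATURE or CUSTOM, so this branch
       * is never taken */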
    }

  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_full_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
                                         thread_indices, frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}

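/*
 * Thin wrappers below: each reassembly node variant has a matching
 * handoff node, which enqueues buffers to the frame queue created for
 * that variant in ip4_full_reass_init_function.
 */
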
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, NORMAL,
                                             false /* is_local */);
}

VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

VLIB_NODE_FN (ip4_local_full_reass_handoff_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, NORMAL,
                                             true /* is_local */);
}

VLIB_REGISTER_NODE (ip4_local_full_reass_handoff_node) = {
  .name = "ip4-local-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, FEATURE,
                                             false /* is_local */);
}

VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

VLIB_NODE_FN (ip4_full_reass_custom_handoff_node) (vlib_main_t * vm,
                                                   vlib_node_runtime_t *
                                                   node,
                                                   vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame, CUSTOM,
                                             false /* is_local */);
}

VLIB_REGISTER_NODE (ip4_full_reass_custom_handoff_node) = {
  .name = "ip4-full-reass-custom-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

#ifndef CLIB_MARCH_VARIANT
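/* Reference-counted enable/disable of the reassembly feature: the
 * feature is turned on at the first reference and off at the last.
 * Calls which only adjust the refcount return -1 without touching the
 * feature arc. */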
int
ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
  if (is_enable)
    {
      if (!rm->feature_use_refcount_per_intf[sw_if_index])
        {
          ++rm->feature_use_refcount_per_intf[sw_if_index];
          return vnet_feature_enable_disable ("ip4-unicast",
                                              "ip4-full-reassembly-feature",
                                              sw_if_index, 1, 0, 0);
        }
      ++rm->feature_use_refcount_per_intf[sw_if_index];
    }
  else
    {
      --rm->feature_use_refcount_per_intf[sw_if_index];
      if (!rm->feature_use_refcount_per_intf[sw_if_index])
        return vnet_feature_enable_disable ("ip4-unicast",
                                            "ip4-full-reassembly-feature",
                                            sw_if_index, 0, 0, 0);
    }
  return -1;
}

void
ip4_local_full_reass_enable_disable (int enable)
{
  if (enable)
    {
      ip4_full_reass_main.is_local_reass_enabled = 1;
    }
  else
    {
      ip4_full_reass_main.is_local_reass_enabled = 0;
    }
}

int
ip4_local_full_reass_enabled ()
{
  return ip4_full_reass_main.is_local_reass_enabled;
}

#endif

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */