ip: add shallow virtual reassembly functionality
[vpp.git] / src / vnet / ip / reass / ip4_full_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Full Reassembly.
19  *
20  * This file contains the source code for IPv4 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/fifo.h>
27 #include <vppinfra/bihash_16_8.h>
28 #include <vnet/ip/reass/ip4_full_reass.h>
29 #include <stddef.h>
30
/* conversion factor for ms-based config values */
#define MSEC_PER_SEC 1000
/* how long to wait for missing fragments before a reassembly expires */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* default cap on concurrent reassembly contexts per thread pool */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* default cap on fragments accepted into a single reassembly */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* target load factor used when sizing the bihash */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

/* set to 1 to printf-dump buffer chains at interesting points */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
/* Return codes used internally by the reassembly helpers. */
typedef enum
{
  IP4_REASS_RC_OK,                 // operation succeeded
  IP4_REASS_RC_TOO_MANY_FRAGMENTS, // per-reassembly fragment cap exceeded
  IP4_REASS_RC_INTERNAL_ERROR,     // invariant violated - drop everything
  IP4_REASS_RC_NO_BUF,             // buffer allocation failure
  IP4_REASS_RC_HANDOFF,            // fragment must be handed to another thread
} ip4_full_reass_rc_t;
68
/* 16-byte bihash key identifying one reassembly:
 * addresses + fragment id + protocol (per RFC 791 reassembly tuple). */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id; // presumably fib/interface discriminator - TODO confirm
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused; // padding so the key is exactly 2 x u64
    };
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;

/* 8-byte bihash value: where the reassembly context lives. */
typedef union
{
  struct
  {
    u32 reass_index;               // index into owner thread's pool
    u32 memory_owner_thread_index; // thread whose pool holds the context
  };
  u64 as_u64;
} ip4_full_reass_val_t;

/* Convenience overlay of key+value onto a clib_bihash_kv_16_8_t. */
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106 always_inline u32
107 ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108 {
109   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111 }
112
113 always_inline u16
114 ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115 {
116   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118     (vnb->ip.reass.fragment_first +
119      ip4_full_reass_buffer_get_data_offset (b)) + 1;
120 }
121
/* One in-progress reassembly: a chain of fragment ranges plus bookkeeping. */
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
/* Per-worker-thread reassembly state; lock guards cross-thread access. */
typedef struct
{
  ip4_full_reass_t *pool; // pool of reassembly contexts owned by this thread
  u32 reass_n;            // number of contexts currently in use
  u32 id_counter;         // source of unique per-thread reassembly ids
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
160
/* Global state for the IPv4 full reassembly feature. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;              // reassembly timeout as configured
  f64 timeout;                 // same timeout, pre-converted to seconds
  u32 expire_walk_interval_ms; // how often the expiry process walks contexts
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash; // key -> (owner thread, pool index) lookup
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;         // frame queue for the non-feature node
  u32 fq_feature_index; // frame queue for the feature node

} ip4_full_reass_main_t;
189
extern ip4_full_reass_main_t ip4_full_reass_main;

#ifndef CLIB_MARCH_VARIANT
/* single definition of the global; march variants only see the extern */
ip4_full_reass_main_t ip4_full_reass_main;
#endif /* CLIB_MARCH_VARIANT */
195
/* Next-node dispositions for the reassembly graph nodes. */
typedef enum
{
  IP4_FULL_REASS_NEXT_INPUT,   // reassembled packet continues down ip4 input
  IP4_FULL_REASS_NEXT_DROP,
  IP4_FULL_REASS_NEXT_HANDOFF, // fragment belongs to another worker thread
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
203
/* Operations recorded in packet traces. */
typedef enum
{
  RANGE_NEW,     // fragment created a new range
  RANGE_SHRINK,  // existing range trimmed due to partial overlap
  RANGE_DISCARD, // existing range dropped entirely
  RANGE_OVERLAP, // fragment fully covered by existing data - ignored
  FINALIZE,      // reassembly completed
  HANDOFF,       // fragment handed to owning thread
} ip4_full_reass_trace_operation_e;
213
/* Snapshot of a single fragment range, for tracing. */
typedef struct
{
  u16 range_first; // first payload octet covered by this range
  u16 range_last;  // last payload octet covered by this range
  u32 range_bi;    // buffer index holding the range
  i32 data_offset; // bytes skipped at the head of the fragment
  u32 data_len;    // bytes contributed by this range
  u32 first_bi;    // head buffer of the whole reassembly
} ip4_full_reass_range_trace_t;
223
/* One trace record emitted by the reassembly nodes. */
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  u32 reass_id; // ~0 when no reassembly context was involved
  ip4_full_reass_range_trace_t trace_range;
  u32 size_diff;    // bytes trimmed on RANGE_SHRINK
  u32 op_id;        // per-reassembly operation sequence number
  u32 thread_id;    // thread that performed the operation
  u32 thread_id_to; // handoff destination thread (HANDOFF only)
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
} ip4_full_reass_trace_t;
237
/* graph node registrations, defined later in this file */
extern vlib_node_registration_t ip4_full_reass_node;
extern vlib_node_registration_t ip4_full_reass_node_feature;
240
241 static void
242 ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
243                               ip4_full_reass_range_trace_t * trace)
244 {
245   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
246   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
247   trace->range_first = vnb->ip.reass.range_first;
248   trace->range_last = vnb->ip.reass.range_last;
249   trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
250   trace->data_len = ip4_full_reass_buffer_get_data_len (b);
251   trace->range_bi = bi;
252 }
253
254 static u8 *
255 format_ip4_full_reass_range_trace (u8 * s, va_list * args)
256 {
257   ip4_full_reass_range_trace_t *trace =
258     va_arg (*args, ip4_full_reass_range_trace_t *);
259   s =
260     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
261             trace->range_last, trace->data_offset, trace->data_len,
262             trace->range_bi);
263   return s;
264 }
265
/* format() helper for ip4_full_reass_trace_t records shown by
 * packet tracing. */
static u8 *
format_ip4_full_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
  u32 indent = 0;
  /* reass_id == ~0 means no reassembly context (e.g. pure handoff) */
  if (~0 != t->reass_id)
    {
      s = format (s, "reass id: %u, op id: %u, ", t->reass_id, t->op_id);
      indent = format_get_indent (s);
      s =
        format (s,
                "first bi: %u, data len: %u, ip/fragment[%u, %u]",
                t->trace_range.first_bi, t->total_data_len, t->fragment_first,
                t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range,
                  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
        format (s, "handoff from thread #%u to thread #%u", t->thread_id,
                t->thread_id_to);
      break;
    }
  return s;
}
313
/* Record one reassembly operation in the packet trace of buffer @bi.
 * @reass may be NULL (e.g. for handoffs before a context exists) -
 * the reassembly-specific fields are then zeroed/invalidated. */
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_full_reass_main_t * rm,
                          ip4_full_reass_t * reass, u32 bi,
                          ip4_full_reass_trace_operation_e action,
                          u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      /* each operation on a reassembly gets a monotonically increasing id */
      ++reass->trace_op_counter;
    }
  else
    {
      t->reass_id = ~0; /* sentinel - format function skips reass info */
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
354
355 always_inline void
356 ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
357                          ip4_full_reass_t * reass)
358 {
359   pool_put (rt->pool, reass);
360   --rt->reass_n;
361 }
362
363 always_inline void
364 ip4_full_reass_free (ip4_full_reass_main_t * rm,
365                      ip4_full_reass_per_thread_t * rt,
366                      ip4_full_reass_t * reass)
367 {
368   clib_bihash_kv_16_8_t kv;
369   kv.key[0] = reass->key.as_u64[0];
370   kv.key[1] = reass->key.as_u64[1];
371   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
372   return ip4_full_reass_free_ctx (rt, reass);
373 }
374
/* Drop every buffer held by a reassembly. Walks the range chain, then
 * each range's buffer chain, unlinking next pointers as it goes. The
 * collected buffers are either enqueued to the custom error next node
 * (custom apps) or freed outright. */
always_inline void
ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  /* outer loop: one iteration per fragment range */
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      /* inner loop: collect the whole buffer chain of this range */
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              /* clear chaining so buffers are freed individually */
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      /* enqueue collected buffers to the custom error node, one frame
       * at a time */
      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (vec_len (to_free) > 0 && n_left_to_next > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  to_next[0] = bi;
                  to_next += 1;
                  n_left_to_next -= 1;
                }
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
}
435
436 always_inline void
437 ip4_full_reass_init (ip4_full_reass_t * reass)
438 {
439   reass->first_bi = ~0;
440   reass->last_packet_octet = ~0;
441   reass->data_len = 0;
442   reass->next_index = ~0;
443   reass->error_next_index = ~0;
444 }
445
/* Look up the reassembly context for @kv, creating one if needed.
 * Returns NULL when the per-thread limit is reached or on hash failure.
 * Sets *do_handoff when the context is owned by a different thread -
 * the caller must then hand the fragment off instead of processing it.
 * Expired contexts found during lookup are dropped and recreated. */
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                               ip4_full_reass_main_t * rm,
                               ip4_full_reass_per_thread_t * rt,
                               ip4_full_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  /* search overwrites kv's value part with the stored value on hit */
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);
      /* context lives in another thread's pool - caller must hand off */
      if (vm->thread_index != reass->memory_owner_thread_index)
        {
          *do_handoff = 1;
          return reass;
        }

      /* lazily expire a stale context instead of reusing it */
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_full_reass_drop_all (vm, node, rm, reass);
          ip4_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  /* enforce per-thread cap on concurrent reassemblies */
  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      /* id unique across threads: thread index in the high decimal digits */
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  /* is_add = 2: add only if not present, fail with -2 on collision */
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
521
/* Stitch all collected fragment ranges into one buffer chain forming the
 * reassembled packet, fix up the IP header (length, checksum, cleared
 * fragment bits) and hand the result back via *bi0/*next0/*error0.
 * Trims overlap bytes and frees buffers that contribute no data.
 * On success the reassembly context is freed. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  /* walk the range chain; each range is itself a buffer sub-chain */
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* sanity-check range invariants before trusting the arithmetic */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      /* bytes to cut from the front: ip header + overlap already covered */
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      /* walk the buffers of this range, trimming and linking */
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              /* append this buffer to the reassembled chain */
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  /* last contributing buffer - clip to keep_data */
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              /* no more data wanted from this range - free the rest */
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  /* convert to "length not including first buffer" convention */
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  /* clear MF flag and fragment offset - packet is whole now */
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
                                FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      /* custom apps supplied their own next node */
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
725
/* Link buffer @new_next_bi into the reassembly's sorted range chain,
 * after @prev_range_bi (or at the head when prev is ~0), and account
 * for its data length. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
                                      ip4_full_reass_main_t * rm,
                                      ip4_full_reass_per_thread_t * rt,
                                      ip4_full_reass_t * reass,
                                      u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      /* splice between prev and prev's former successor */
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      /* insert at the head of the chain */
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  /* verify range invariants before adjusting the running total */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
759
/* Unlink range @discard_bi from the chain (prev is ~0 when it is the
 * head), subtract its data from the running total, and free all buffers
 * of its sub-chain. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
                                        ip4_full_reass_main_t * rm,
                                        ip4_full_reass_t * reass,
                                        u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      /* prev must actually point at the range being removed */
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      /* removing the head range */
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  /* free the whole buffer sub-chain of the discarded range */
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
                                    RANGE_DISCARD, 0, ~0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
815
816 always_inline ip4_full_reass_rc_t
817 ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
818                        ip4_full_reass_main_t * rm,
819                        ip4_full_reass_per_thread_t * rt,
820                        ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
821                        u32 * error0, bool is_custom_app,
822                        u32 * handoff_thread_idx)
823 {
824   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
825   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
826   if (is_custom_app)
827     {
828       // store (error_)next_index before it's overwritten
829       reass->next_index = fvnb->ip.reass.next_index;
830       reass->error_next_index = fvnb->ip.reass.error_next_index;
831     }
832   ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
833   int consumed = 0;
834   ip4_header_t *fip = vlib_buffer_get_current (fb);
835   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
836   const u32 fragment_length =
837     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
838   const u32 fragment_last = fragment_first + fragment_length - 1;
839   fvnb->ip.reass.fragment_first = fragment_first;
840   fvnb->ip.reass.fragment_last = fragment_last;
841   int more_fragments = ip4_get_fragment_more (fip);
842   u32 candidate_range_bi = reass->first_bi;
843   u32 prev_range_bi = ~0;
844   fvnb->ip.reass.range_first = fragment_first;
845   fvnb->ip.reass.range_last = fragment_last;
846   fvnb->ip.reass.next_range_bi = ~0;
847   if (!more_fragments)
848     {
849       reass->last_packet_octet = fragment_last;
850     }
851   if (~0 == reass->first_bi)
852     {
853       // starting a new reassembly
854       rc =
855         ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
856                                               prev_range_bi, *bi0);
857       if (IP4_REASS_RC_OK != rc)
858         {
859           return rc;
860         }
861       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
862         {
863           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
864                                     ~0);
865         }
866       *bi0 = ~0;
867       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
868       reass->fragments_n = 1;
869       return IP4_REASS_RC_OK;
870     }
871   reass->min_fragment_length =
872     clib_min (clib_net_to_host_u16 (fip->length),
873               fvnb->ip.reass.estimated_mtu);
874   while (~0 != candidate_range_bi)
875     {
876       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
877       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
878       if (fragment_first > candidate_vnb->ip.reass.range_last)
879         {
880           // this fragments starts after candidate range
881           prev_range_bi = candidate_range_bi;
882           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
883           if (candidate_vnb->ip.reass.range_last < fragment_last &&
884               ~0 == candidate_range_bi)
885             {
886               // special case - this fragment falls beyond all known ranges
887               rc =
888                 ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
889                                                       prev_range_bi, *bi0);
890               if (IP4_REASS_RC_OK != rc)
891                 {
892                   return rc;
893                 }
894               consumed = 1;
895               break;
896             }
897           continue;
898         }
899       if (fragment_last < candidate_vnb->ip.reass.range_first)
900         {
901           // this fragment ends before candidate range without any overlap
902           rc =
903             ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
904                                                   prev_range_bi, *bi0);
905           if (IP4_REASS_RC_OK != rc)
906             {
907               return rc;
908             }
909           consumed = 1;
910         }
911       else
912         {
913           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
914               fragment_last <= candidate_vnb->ip.reass.range_last)
915             {
916               // this fragment is a (sub)part of existing range, ignore it
917               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
918                 {
919                   ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
920                                             RANGE_OVERLAP, 0, ~0);
921                 }
922               break;
923             }
924           int discard_candidate = 0;
925           if (fragment_first < candidate_vnb->ip.reass.range_first)
926             {
927               u32 overlap =
928                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
929               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
930                 {
931                   candidate_vnb->ip.reass.range_first += overlap;
932                   if (reass->data_len < overlap)
933                     {
934                       return IP4_REASS_RC_INTERNAL_ERROR;
935                     }
936                   reass->data_len -= overlap;
937                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
938                     {
939                       ip4_full_reass_add_trace (vm, node, rm, reass,
940                                                 candidate_range_bi,
941                                                 RANGE_SHRINK, 0, ~0);
942                     }
943                   rc =
944                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
945                                                           prev_range_bi,
946                                                           *bi0);
947                   if (IP4_REASS_RC_OK != rc)
948                     {
949                       return rc;
950                     }
951                   consumed = 1;
952                 }
953               else
954                 {
955                   discard_candidate = 1;
956                 }
957             }
958           else if (fragment_last > candidate_vnb->ip.reass.range_last)
959             {
960               u32 overlap =
961                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
962               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
963                 {
964                   fvnb->ip.reass.range_first += overlap;
965                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
966                     {
967                       prev_range_bi = candidate_range_bi;
968                       candidate_range_bi =
969                         candidate_vnb->ip.reass.next_range_bi;
970                       continue;
971                     }
972                   else
973                     {
974                       // special case - last range discarded
975                       rc =
976                         ip4_full_reass_insert_range_in_chain (vm, rm, rt,
977                                                               reass,
978                                                               candidate_range_bi,
979                                                               *bi0);
980                       if (IP4_REASS_RC_OK != rc)
981                         {
982                           return rc;
983                         }
984                       consumed = 1;
985                     }
986                 }
987               else
988                 {
989                   discard_candidate = 1;
990                 }
991             }
992           else
993             {
994               discard_candidate = 1;
995             }
996           if (discard_candidate)
997             {
998               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
999               // discard candidate range, probe next range
1000               rc =
1001                 ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
1002                                                         prev_range_bi,
1003                                                         candidate_range_bi);
1004               if (IP4_REASS_RC_OK != rc)
1005                 {
1006                   return rc;
1007                 }
1008               if (~0 != next_range_bi)
1009                 {
1010                   candidate_range_bi = next_range_bi;
1011                   continue;
1012                 }
1013               else
1014                 {
1015                   // special case - last range discarded
1016                   rc =
1017                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
1018                                                           prev_range_bi,
1019                                                           *bi0);
1020                   if (IP4_REASS_RC_OK != rc)
1021                     {
1022                       return rc;
1023                     }
1024                   consumed = 1;
1025                 }
1026             }
1027         }
1028       break;
1029     }
1030   ++reass->fragments_n;
1031   if (consumed)
1032     {
1033       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1034         {
1035           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
1036                                     ~0);
1037         }
1038     }
1039   if (~0 != reass->last_packet_octet &&
1040       reass->data_len == reass->last_packet_octet + 1)
1041     {
1042       *handoff_thread_idx = reass->sendout_thread_index;
1043       rc =
1044         ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1045                                  is_custom_app);
1046       if (IP4_REASS_RC_OK == rc
1047           && reass->memory_owner_thread_index != reass->sendout_thread_index)
1048         {
1049           rc = IP4_REASS_RC_HANDOFF;
1050         }
1051     }
1052   else
1053     {
1054       if (consumed)
1055         {
1056           *bi0 = ~0;
1057           if (reass->fragments_n > rm->max_reass_len)
1058             {
1059               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1060             }
1061         }
1062       else
1063         {
1064           *next0 = IP4_FULL_REASS_NEXT_DROP;
1065           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1066         }
1067     }
1068   return rc;
1069 }
1070
/**
 * @brief Shared dispatch function for the full-reassembly graph nodes.
 *
 * Walks the input frame one buffer at a time under the per-thread
 * reassembly lock. Whole (unfragmented) packets are passed through;
 * fragments are keyed into the reassembly hash and fed to
 * ip4_full_reass_update. Buffers may be consumed (bi0 set to ~0),
 * forwarded, dropped, or handed off to the owning worker thread.
 *
 * @param is_feature    compiled-in flag - node runs as an ip4-unicast
 *                      feature (uses vnet_feature_next on success)
 * @param is_custom_app compiled-in flag - next node comes from the buffer
 *                      opaque instead of IP4_FULL_REASS_NEXT_INPUT
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature,
                       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serialize against the expire walk and handoffs touching this
   * thread's reassembly pool */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (!is_custom_app)
                {
                  next0 = IP4_FULL_REASS_NEXT_INPUT;
                }
              else
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              goto packet_enqueue;
            }
          /* sanity-check the fragment geometry before touching state */
          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
          if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
            {
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              goto packet_enqueue;
            }
          ip4_full_reass_kv_t kv;
          u8 do_handoff = 0;

          /* lookup key: word 0 = RX fib index | src address,
           * word 1 = dst address | fragment id | protocol */
          kv.k.as_u64[0] =
            (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                           vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
            (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] =
            (u64) ip0->dst_address.
            as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

          ip4_full_reass_t *reass =
            ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
                                           &do_handoff);

          if (reass)
            {
              /* NOTE(review): shadows the fragment_first computed above -
               * same expression, same value */
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              if (0 == fragment_first)
                {
                  /* the thread that saw the first fragment sends the
                   * finished packet out */
                  reass->sendout_thread_index = vm->thread_index;
                }
            }

          if (PREDICT_FALSE (do_handoff))
            {
              next0 = IP4_FULL_REASS_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.memory_owner_thread_index;
            }
          else if (reass)
            {
              u32 handoff_thread_idx;
              switch (ip4_full_reass_update
                      (vm, node, rm, rt, reass, &bi0, &next0,
                       &error0, is_custom_app, &handoff_thread_idx))
                {
                case IP4_REASS_RC_OK:
                  /* nothing to do here */
                  break;
                case IP4_REASS_RC_HANDOFF:
                  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_buffer (b0)->ip.reass.owner_thread_index =
                    handoff_thread_idx;
                  break;
                case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                               1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                case IP4_REASS_RC_NO_BUF:
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_NO_BUF, 1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                case IP4_REASS_RC_INTERNAL_ERROR:
                  /* drop everything and start with a clean slate */
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_INTERNAL_ERROR,
                                               1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                }
            }
          else
            {
              /* find_or_create failed - reassembly context limit hit */
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
            }


        packet_enqueue:
          b0->error = node->errors[error0];

          /* bi0 == ~0 means the fragment was consumed into a reassembly
           * context and must not be enqueued */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
                {
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (vm, node, rm, NULL, bi0,
                                                HANDOFF, 0,
                                                vnet_buffer (b0)->ip.
                                                reass.owner_thread_index);
                    }
                }
              else if (is_feature && IP4_ERROR_NONE == error0)
                {
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1241
/* Error counter strings, expanded from the shared ip4 error list so the
 * indices line up with the IP4_ERROR_* enum values used above. */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1247
/* Standalone (non-feature) reassembly node entry point. */
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
                                false /* is_custom_app */ );
}
1255
/* *INDENT-OFF* */
/* Graph node registration for the standalone reassembly node. */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1273
/* Feature-arc variant of the reassembly node entry point. */
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
                                false /* is_custom_app */ );
}
1281
/* *INDENT-OFF* */
/* Graph node registration for the feature-arc reassembly node; hands
 * off to a dedicated feature handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1298
/* *INDENT-OFF* */
/* Register the feature on the ip4-unicast arc, before lookup and IPsec
 * input so those see only whole packets. */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1308
1309 #ifndef CLIB_MARCH_VARIANT
1310 always_inline u32
1311 ip4_full_reass_get_nbuckets ()
1312 {
1313   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1314   u32 nbuckets;
1315   u8 i;
1316
1317   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1318
1319   for (i = 0; i < 31; i++)
1320     if ((1 << i) >= nbuckets)
1321       break;
1322   nbuckets = 1 << i;
1323
1324   return nbuckets;
1325 }
1326 #endif /* CLIB_MARCH_VARIANT */
1327
/* Events delivered to the expire-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;
1332
/* Context for copying hash entries into a resized table; failure is set
 * when any insertion into new_hash fails. */
typedef struct
{
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;
1338
1339 #ifndef CLIB_MARCH_VARIANT
1340 static void
1341 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1342 {
1343   ip4_rehash_cb_ctx *ctx = _ctx;
1344   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1345     {
1346       ctx->failure = 1;
1347     }
1348 }
1349
1350 static void
1351 ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1352                            u32 max_reassembly_length,
1353                            u32 expire_walk_interval_ms)
1354 {
1355   ip4_full_reass_main.timeout_ms = timeout_ms;
1356   ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1357   ip4_full_reass_main.max_reass_n = max_reassemblies;
1358   ip4_full_reass_main.max_reass_len = max_reassembly_length;
1359   ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1360 }
1361
/**
 * @brief API handler - apply new reassembly parameters and grow the hash
 * table if the new maximum requires more buckets.
 *
 * On growth, entries are copied into a fresh table; on copy failure the
 * new table is freed and the old one kept. The table is never shrunk.
 *
 * @return 0 on success, -1 if rehashing failed
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
                    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
                             max_reassembly_length, expire_walk_interval_ms);
  /* wake the expire walk so it picks up the new interval */
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
                             ip4_full_reass_main.ip4_full_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy every live entry into the bigger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          /* success - replace the old table with the new one */
          clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
          clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
                            sizeof (ip4_full_reass_main.hash));
          clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1399
1400 vnet_api_error_t
1401 ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1402                     u32 * max_reassembly_length,
1403                     u32 * expire_walk_interval_ms)
1404 {
1405   *timeout_ms = ip4_full_reass_main.timeout_ms;
1406   *max_reassemblies = ip4_full_reass_main.max_reass_n;
1407   *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1408   *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1409   return 0;
1410 }
1411
/**
 * @brief Module init - allocate per-thread state, set default parameters,
 * create the reassembly hash table and the handoff frame queues.
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one pool + lock per worker thread, plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                             IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                             IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                             IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes to move buffers between
   * worker threads */
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);

  return error;
}

VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1454 #endif /* CLIB_MARCH_VARIANT */
1455
/**
 * @brief Process node - periodically walks every thread's reassembly pool
 * and drops contexts that have not been heard from within the timeout.
 *
 * Wakes either on the configured walk interval or on a
 * IP4_EVENT_CONFIG_CHANGED event (which re-reads the interval).
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
                             vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* interval is re-read at the top of the loop */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_full_reass_per_thread_t *rt =
            &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect indexes first - freeing while iterating the pool
           * would invalidate the walk */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, rm, reass);
            ip4_full_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1528
/* *INDENT-OFF* */
/* Registration for the expire-walk process node. */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1540
1541 static u8 *
1542 format_ip4_full_reass_key (u8 * s, va_list * args)
1543 {
1544   ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1545   s =
1546     format (s,
1547             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1548             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1549             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1550   return s;
1551 }
1552
1553 static u8 *
1554 format_ip4_reass (u8 * s, va_list * args)
1555 {
1556   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1557   ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1558
1559   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1560               "last_packet_octet: %u, trace_op_counter: %u\n",
1561               reass->id, format_ip4_full_reass_key, &reass->key,
1562               reass->first_bi, reass->data_len,
1563               reass->last_packet_octet, reass->trace_op_counter);
1564
1565   u32 bi = reass->first_bi;
1566   u32 counter = 0;
1567   while (~0 != bi)
1568     {
1569       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1570       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1571       s =
1572         format (s,
1573                 "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1574                 "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1575                 vnb->ip.reass.range_last, bi,
1576                 ip4_full_reass_buffer_get_data_offset (b),
1577                 ip4_full_reass_buffer_get_data_len (b),
1578                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1579       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1580         {
1581           bi = b->next_buffer;
1582         }
1583       else
1584         {
1585           bi = ~0;
1586         }
1587     }
1588   return s;
1589 }
1590
/**
 * @brief CLI handler for "show ip4-full-reassembly [details]" - prints
 * the per-thread reassembly counts and, with "details", every context.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_full_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      /* lock out the datapath while reading this thread's pool */
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1634
/* *INDENT-OFF* */
/* CLI command registration for the show handler above. */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1642
1643 #ifndef CLIB_MARCH_VARIANT
1644 vnet_api_error_t
1645 ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1646 {
1647   return vnet_feature_enable_disable ("ip4-unicast",
1648                                       "ip4-full-reassembly-feature",
1649                                       sw_if_index, enable_disable, 0, 0);
1650 }
1651 #endif /* CLIB_MARCH_VARIANT */
1652
1653
/* Error list for the handoff nodes - only congestion drops can occur. */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")
1656
1657
/* Handoff error counter indices, generated from the list above. */
typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;
1665
/* Handoff error counter strings, indices matching the enum above. */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1671
/* Trace record for handoff nodes - the worker a buffer was sent to. */
typedef struct
{
  u32 next_worker_index;
} ip4_full_reass_handoff_trace_t;
1676
1677 static u8 *
1678 format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1679 {
1680   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1681   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1682   ip4_full_reass_handoff_trace_t *t =
1683     va_arg (*args, ip4_full_reass_handoff_trace_t *);
1684
1685   s =
1686     format (s, "ip4-full-reassembly-handoff: next-worker %d",
1687             t->next_worker_index);
1688
1689   return s;
1690 }
1691
/**
 * @brief Shared dispatch for the handoff nodes - enqueue every buffer in
 * the frame to the worker thread recorded in its reass opaque.
 *
 * @param is_feature selects the feature-arc frame queue vs. the plain one
 * @return number of vectors processed; buffers that could not be
 *         enqueued are counted as congestion drops
 */
always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* build the per-buffer destination thread array, tracing as we go */
  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_full_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything not enqueued was dropped by the frame queue - count it */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1740
/* Handoff node entry point for the standalone reassembly path. */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1748
1749
/* *INDENT-OFF* */
/* Registration for the standalone-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1765
1766
/* *INDENT-OFF* */
/* Handoff node entry point for the feature-arc reassembly path. */
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1777
1778
/* *INDENT-OFF* */
/* Registration for the feature-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1794
1795 /*
1796  * fd.io coding-style-patch-verification: ON
1797  *
1798  * Local Variables:
1799  * eval: (c-set-style "gnu")
1800  * End:
1801  */