ip: refactor reassembly
[vpp.git] / src / vnet / ip / reass / ip4_full_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Full Reassembly.
19  *
20  * This file contains the source code for IPv4 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/fifo.h>
27 #include <vppinfra/bihash_16_8.h>
28 #include <vnet/ip/reass/ip4_full_reass.h>
29 #include <stddef.h>
30
/* reassembly tunables and their defaults */
#define MSEC_PER_SEC 1000
/* reassembly context expires if no fragment seen for this long */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* max number of concurrent reassembly contexts per thread pool */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* max number of fragments accepted into a single reassembly */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* bihash sizing factor used when configuring the lookup table */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)
37
/* compile-time debug aid: when enabled, prints the buffer chain starting at
 * `bi` to stdout; compiles to nothing when IP4_REASS_DEBUG_BUFFERS is 0 */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
/* return codes used internally by the reassembly helpers */
typedef enum
{
  IP4_REASS_RC_OK,		   // operation succeeded
  IP4_REASS_RC_TOO_MANY_FRAGMENTS, // per-reassembly fragment limit exceeded
  IP4_REASS_RC_INTERNAL_ERROR,	   // internal consistency check failed
  IP4_REASS_RC_NO_BUF,		   // ran out of buffers (e.g. linearize)
  IP4_REASS_RC_HANDOFF,		   // fragment must go to another thread
} ip4_full_reass_rc_t;
68
/* bihash key identifying one reassembly; the anonymous struct view packs
 * the identifying fields, the as_u64 view is what the 16_8 bihash hashes */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;		// context id — presumably fib/sw_if index, TODO confirm
      ip4_address_t src;	// source address of the fragments
      ip4_address_t dst;	// destination address of the fragments
      u16 frag_id;		// IPv4 fragment (identification) field
      u8 proto;			// IPv4 protocol field
      u8 unused;		// padding to fill 16 bytes
    };
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;
85
/* bihash value: locates the reassembly context — pool index plus the
 * thread whose per-thread pool owns it */
typedef union
{
  struct
  {
    u32 reass_index;		      // index into owner thread's pool
    u32 memory_owner_thread_index;    // thread owning the context memory
  };
  u64 as_u64;
} ip4_full_reass_val_t;
95
/* convenience overlay: typed key/value view on top of the raw bihash kv */
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106 always_inline u32
107 ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108 {
109   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111 }
112
113 always_inline u16
114 ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115 {
116   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118     (vnb->ip.reass.fragment_first +
119      ip4_full_reass_buffer_get_data_offset (b)) + 1;
120 }
121
/* one in-progress reassembly; lives in a per-thread pool, found via bihash */
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
/* per-thread reassembly state */
typedef struct
{
  ip4_full_reass_t *pool;	// pool of reassembly contexts owned by thread
  u32 reass_n;			// number of contexts currently in use
  u32 id_counter;		// monotonic source for reass->id
  clib_spinlock_t lock;		// protects pool against cross-thread access
} ip4_full_reass_per_thread_t;
160
/* global reassembly state (configuration + runtime) */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;		// reassembly timeout as configured
  f64 timeout;			// same timeout pre-converted to seconds
  u32 expire_walk_interval_ms;	// period of the expiry process walk
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;	// key -> (thread, pool index) lookup
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;			// frame queue for the non-feature node
  u32 fq_feature_index;		// frame queue for the feature node

} ip4_full_reass_main_t;
189
extern ip4_full_reass_main_t ip4_full_reass_main;

/* single definition in the default march variant only, so multi-arch
 * builds do not emit duplicate symbols */
#ifndef CLIB_MARCH_VARIANT
ip4_full_reass_main_t ip4_full_reass_main;
#endif /* CLIB_MARCH_VARIANT */
195
/* next-node indices for the reassembly graph nodes */
typedef enum
{
  IP4_FULL_REASS_NEXT_INPUT,	// continue down ip4-input path
  IP4_FULL_REASS_NEXT_DROP,	// drop the buffer
  IP4_FULL_REASS_NEXT_HANDOFF,	// hand off to owning worker thread
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
203
/* operations recorded in packet traces */
typedef enum
{
  RANGE_NEW,			// new range inserted into the chain
  RANGE_SHRINK,			// existing range trimmed due to overlap
  RANGE_DISCARD,		// existing range dropped entirely
  RANGE_OVERLAP,		// incoming fragment fully overlapped, ignored
  FINALIZE,			// reassembly completed
  HANDOFF,			// fragment handed to another thread
} ip4_full_reass_trace_operation_e;
213
/* snapshot of a single range for tracing purposes */
typedef struct
{
  u16 range_first;	// first octet covered by the range
  u16 range_last;	// last octet covered by the range
  u32 range_bi;		// buffer index heading the range
  i32 data_offset;	// payload offset within the fragment
  u32 data_len;		// usable payload length of the range
  u32 first_bi;		// first buffer of the whole reassembly
} ip4_full_reass_range_trace_t;
223
/* per-packet trace record for the reassembly nodes */
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  u32 reass_id;			// reass->id, ~0 if no context involved
  ip4_full_reass_range_trace_t trace_range;
  u32 size_diff;		// octets trimmed (RANGE_SHRINK only)
  u32 op_id;			// per-reassembly operation sequence number
  u32 thread_id;		// thread recording the trace
  u32 thread_id_to;		// destination thread (HANDOFF only)
  u32 fragment_first;		// first octet of the traced fragment
  u32 fragment_last;		// last octet of the traced fragment
  u32 total_data_len;		// total octets collected so far
} ip4_full_reass_trace_t;
237
238 extern vlib_node_registration_t ip4_full_reass_node;
239 extern vlib_node_registration_t ip4_full_reass_node_feature;
240
241 static void
242 ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
243                               ip4_full_reass_range_trace_t * trace)
244 {
245   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
246   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
247   trace->range_first = vnb->ip.reass.range_first;
248   trace->range_last = vnb->ip.reass.range_last;
249   trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
250   trace->data_len = ip4_full_reass_buffer_get_data_len (b);
251   trace->range_bi = bi;
252 }
253
254 static u8 *
255 format_ip4_full_reass_range_trace (u8 * s, va_list * args)
256 {
257   ip4_full_reass_range_trace_t *trace =
258     va_arg (*args, ip4_full_reass_range_trace_t *);
259   s =
260     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
261             trace->range_last, trace->data_offset, trace->data_len,
262             trace->range_bi);
263   return s;
264 }
265
266 static u8 *
267 format_ip4_full_reass_trace (u8 * s, va_list * args)
268 {
269   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
270   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
271   ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
272   u32 indent = 0;
273   if (~0 != t->reass_id)
274     {
275       s = format (s, "reass id: %u, op id: %u, ", t->reass_id, t->op_id);
276       indent = format_get_indent (s);
277       s =
278         format (s,
279                 "first bi: %u, data len: %u, ip/fragment[%u, %u]",
280                 t->trace_range.first_bi, t->total_data_len, t->fragment_first,
281                 t->fragment_last);
282     }
283   switch (t->action)
284     {
285     case RANGE_SHRINK:
286       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
287                   format_ip4_full_reass_range_trace, &t->trace_range,
288                   t->size_diff);
289       break;
290     case RANGE_DISCARD:
291       s = format (s, "\n%Udiscard %U", format_white_space, indent,
292                   format_ip4_full_reass_range_trace, &t->trace_range);
293       break;
294     case RANGE_NEW:
295       s = format (s, "\n%Unew %U", format_white_space, indent,
296                   format_ip4_full_reass_range_trace, &t->trace_range);
297       break;
298     case RANGE_OVERLAP:
299       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
300                   format_ip4_full_reass_range_trace, &t->trace_range);
301       break;
302     case FINALIZE:
303       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
304       break;
305     case HANDOFF:
306       s =
307         format (s, "handoff from thread #%u to thread #%u", t->thread_id,
308                 t->thread_id_to);
309       break;
310     }
311   return s;
312 }
313
/* Record a packet-trace entry describing `action` performed on buffer `bi`.
 * `reass` may be NULL (e.g. handoff before a context exists) — in that case
 * the record is tagged with reass_id ~0.  `size_diff` is meaningful only for
 * RANGE_SHRINK, `thread_id_to` only for HANDOFF. */
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_full_reass_main_t * rm,
                          ip4_full_reass_t * reass, u32 bi,
                          ip4_full_reass_trace_operation_e action,
                          u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      /* bump per-reassembly sequence so trace records can be ordered */
      ++reass->trace_op_counter;
    }
  else
    {
      /* no context — mark record as context-less */
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  /* disabled debug aid: dump the formatted trace to stdout */
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
354
355 always_inline void
356 ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
357                          ip4_full_reass_t * reass)
358 {
359   pool_put (rt->pool, reass);
360   --rt->reass_n;
361 }
362
363 always_inline void
364 ip4_full_reass_free (ip4_full_reass_main_t * rm,
365                      ip4_full_reass_per_thread_t * rt,
366                      ip4_full_reass_t * reass)
367 {
368   clib_bihash_kv_16_8_t kv;
369   kv.key[0] = reass->key.as_u64[0];
370   kv.key[1] = reass->key.as_u64[1];
371   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
372   return ip4_full_reass_free_ctx (rt, reass);
373 }
374
/* Dispose of every buffer held by a reassembly.  Walks the range chain and,
 * within each range, the vlib buffer chain, collecting all buffer indices.
 * If a custom app registered an error_next_index the buffers are forwarded
 * to that node; otherwise they are freed outright. */
always_inline void
ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      /* collect every buffer chained to this range, unlinking as we go */
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      /* enqueue collected buffers to the custom error node, one frame at
       * a time until the vector is drained */
      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (vec_len (to_free) > 0 && n_left_to_next > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  to_next[0] = bi;
                  to_next += 1;
                  n_left_to_next -= 1;
                }
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
}
435
436 always_inline void
437 ip4_full_reass_init (ip4_full_reass_t * reass)
438 {
439   reass->first_bi = ~0;
440   reass->last_packet_octet = ~0;
441   reass->data_len = 0;
442   reass->next_index = ~0;
443   reass->error_next_index = ~0;
444 }
445
/* Look up the reassembly context for kv's key, creating one if needed.
 * Sets *do_handoff when the context is owned by a different thread.
 * Returns NULL when the per-thread context limit is hit or on hash
 * failure; an expired context is dropped and replaced. */
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                               ip4_full_reass_main_t * rm,
                               ip4_full_reass_per_thread_t * rt,
                               ip4_full_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      /* hash hit — context lives in the owner thread's pool */
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);
      if (vm->thread_index != reass->memory_owner_thread_index)
        {
          /* not ours — caller must hand the fragment off */
          *do_handoff = 1;
          return reass;
        }

      if (now > reass->last_heard + rm->timeout)
        {
          /* context expired — drop its buffers and start fresh below */
          ip4_full_reass_drop_all (vm, node, rm, reass);
          ip4_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      /* per-thread context limit reached */
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      /* id is globally unique: thread index scaled out of collision range */
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  /* add-but-don't-overwrite (is_add == 2): fails with -2 if another
   * worker inserted the same key concurrently */
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
521
/* Stitch the collected ranges into one packet.  Walks each range's buffer
 * chain, trimming the per-fragment IP header and any overlap from the
 * front and surplus bytes from the back, links the kept buffers into a
 * single chain headed by the first fragment (whose IP header is kept),
 * rewrites length/checksum, and hands the result back via *bi0/*next0.
 * Frees the reassembly context on success. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* NOTE(review): errors only when BOTH invariants fail — looks like
       * this was meant to be ||; confirm against upstream intent */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      /* trim_front: IP header plus any overlap-trimmed payload;
       * trim_end: whatever remains beyond the kept payload */
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  /* partial trim — advance past the unwanted bytes */
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              /* link this buffer onto the reassembled chain */
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  /* last buffer of this range — clip to kept payload */
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              /* trailing buffers past the kept payload — free them */
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  /* total_length excludes the first buffer per vlib chain convention */
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  /* clear fragmentation bits and fix up length + checksum */
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
                                FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
725
726 always_inline ip4_full_reass_rc_t
727 ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
728                                       ip4_full_reass_main_t * rm,
729                                       ip4_full_reass_per_thread_t * rt,
730                                       ip4_full_reass_t * reass,
731                                       u32 prev_range_bi, u32 new_next_bi)
732 {
733   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
734   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
735   if (~0 != prev_range_bi)
736     {
737       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
738       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
739       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
740       prev_vnb->ip.reass.next_range_bi = new_next_bi;
741     }
742   else
743     {
744       if (~0 != reass->first_bi)
745         {
746           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
747         }
748       reass->first_bi = new_next_bi;
749     }
750   vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
751   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
752       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
753     {
754       return IP4_REASS_RC_INTERNAL_ERROR;
755     }
756   reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
757   return IP4_REASS_RC_OK;
758 }
759
/* Unlink range `discard_bi` from the reassembly's range chain (predecessor
 * is `prev_range_bi`, or ~0 when removing the head), subtract its payload
 * contribution, and free every buffer chained to it, tracing each discard
 * when the buffer is traced. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
                                        ip4_full_reass_main_t * rm,
                                        ip4_full_reass_t * reass,
                                        u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      /* predecessor must actually point at the range being removed */
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      /* removing the head — successor becomes the new first range */
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  /* NOTE(review): errors only when BOTH invariants fail — possibly meant
   * to be ||; confirm against upstream intent */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  /* free the whole buffer chain belonging to this range */
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
                                    RANGE_DISCARD, 0, ~0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
815
816 always_inline ip4_full_reass_rc_t
817 ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
818                        ip4_full_reass_main_t * rm,
819                        ip4_full_reass_per_thread_t * rt,
820                        ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
821                        u32 * error0, bool is_custom_app,
822                        u32 * handoff_thread_idx)
823 {
824   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
825   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
826   if (is_custom_app)
827     {
828       // store (error_)next_index before it's overwritten
829       reass->next_index = fvnb->ip.reass.next_index;
830       reass->error_next_index = fvnb->ip.reass.error_next_index;
831     }
832   ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
833   int consumed = 0;
834   ip4_header_t *fip = vlib_buffer_get_current (fb);
835   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
836   const u32 fragment_length =
837     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
838   const u32 fragment_last = fragment_first + fragment_length - 1;
839   fvnb->ip.reass.fragment_first = fragment_first;
840   fvnb->ip.reass.fragment_last = fragment_last;
841   int more_fragments = ip4_get_fragment_more (fip);
842   u32 candidate_range_bi = reass->first_bi;
843   u32 prev_range_bi = ~0;
844   fvnb->ip.reass.range_first = fragment_first;
845   fvnb->ip.reass.range_last = fragment_last;
846   fvnb->ip.reass.next_range_bi = ~0;
847   if (!more_fragments)
848     {
849       reass->last_packet_octet = fragment_last;
850     }
851   if (~0 == reass->first_bi)
852     {
853       // starting a new reassembly
854       rc =
855         ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
856                                               prev_range_bi, *bi0);
857       if (IP4_REASS_RC_OK != rc)
858         {
859           return rc;
860         }
861       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
862         {
863           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
864                                     ~0);
865         }
866       *bi0 = ~0;
867       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
868       reass->fragments_n = 1;
869       return IP4_REASS_RC_OK;
870     }
871   reass->min_fragment_length =
872     clib_min (clib_net_to_host_u16 (fip->length),
873               fvnb->ip.reass.estimated_mtu);
874   while (~0 != candidate_range_bi)
875     {
876       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
877       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
878       if (fragment_first > candidate_vnb->ip.reass.range_last)
879         {
880           // this fragments starts after candidate range
881           prev_range_bi = candidate_range_bi;
882           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
883           if (candidate_vnb->ip.reass.range_last < fragment_last &&
884               ~0 == candidate_range_bi)
885             {
886               // special case - this fragment falls beyond all known ranges
887               rc =
888                 ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
889                                                       prev_range_bi, *bi0);
890               if (IP4_REASS_RC_OK != rc)
891                 {
892                   return rc;
893                 }
894               consumed = 1;
895               break;
896             }
897           continue;
898         }
899       if (fragment_last < candidate_vnb->ip.reass.range_first)
900         {
901           // this fragment ends before candidate range without any overlap
902           rc =
903             ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
904                                                   prev_range_bi, *bi0);
905           if (IP4_REASS_RC_OK != rc)
906             {
907               return rc;
908             }
909           consumed = 1;
910         }
911       else
912         {
913           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
914               fragment_last <= candidate_vnb->ip.reass.range_last)
915             {
916               // this fragment is a (sub)part of existing range, ignore it
917               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
918                 {
919                   ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
920                                             RANGE_OVERLAP, 0, ~0);
921                 }
922               break;
923             }
924           int discard_candidate = 0;
925           if (fragment_first < candidate_vnb->ip.reass.range_first)
926             {
927               u32 overlap =
928                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
929               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
930                 {
931                   candidate_vnb->ip.reass.range_first += overlap;
932                   if (reass->data_len < overlap)
933                     {
934                       return IP4_REASS_RC_INTERNAL_ERROR;
935                     }
936                   reass->data_len -= overlap;
937                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
938                     {
939                       ip4_full_reass_add_trace (vm, node, rm, reass,
940                                                 candidate_range_bi,
941                                                 RANGE_SHRINK, 0, ~0);
942                     }
943                   rc =
944                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
945                                                           prev_range_bi,
946                                                           *bi0);
947                   if (IP4_REASS_RC_OK != rc)
948                     {
949                       return rc;
950                     }
951                   consumed = 1;
952                 }
953               else
954                 {
955                   discard_candidate = 1;
956                 }
957             }
958           else if (fragment_last > candidate_vnb->ip.reass.range_last)
959             {
960               u32 overlap =
961                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
962               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
963                 {
964                   fvnb->ip.reass.range_first += overlap;
965                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
966                     {
967                       prev_range_bi = candidate_range_bi;
968                       candidate_range_bi =
969                         candidate_vnb->ip.reass.next_range_bi;
970                       continue;
971                     }
972                   else
973                     {
974                       // special case - last range discarded
975                       rc =
976                         ip4_full_reass_insert_range_in_chain (vm, rm, rt,
977                                                               reass,
978                                                               candidate_range_bi,
979                                                               *bi0);
980                       if (IP4_REASS_RC_OK != rc)
981                         {
982                           return rc;
983                         }
984                       consumed = 1;
985                     }
986                 }
987               else
988                 {
989                   discard_candidate = 1;
990                 }
991             }
992           else
993             {
994               discard_candidate = 1;
995             }
996           if (discard_candidate)
997             {
998               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
999               // discard candidate range, probe next range
1000               rc =
1001                 ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
1002                                                         prev_range_bi,
1003                                                         candidate_range_bi);
1004               if (IP4_REASS_RC_OK != rc)
1005                 {
1006                   return rc;
1007                 }
1008               if (~0 != next_range_bi)
1009                 {
1010                   candidate_range_bi = next_range_bi;
1011                   continue;
1012                 }
1013               else
1014                 {
1015                   // special case - last range discarded
1016                   rc =
1017                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
1018                                                           prev_range_bi,
1019                                                           *bi0);
1020                   if (IP4_REASS_RC_OK != rc)
1021                     {
1022                       return rc;
1023                     }
1024                   consumed = 1;
1025                 }
1026             }
1027         }
1028       break;
1029     }
1030   ++reass->fragments_n;
1031   if (consumed)
1032     {
1033       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1034         {
1035           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
1036                                     ~0);
1037         }
1038     }
1039   if (~0 != reass->last_packet_octet &&
1040       reass->data_len == reass->last_packet_octet + 1)
1041     {
1042       *handoff_thread_idx = reass->sendout_thread_index;
1043       rc =
1044         ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1045                                  is_custom_app);
1046       if (IP4_REASS_RC_OK == rc
1047           && reass->memory_owner_thread_index != reass->sendout_thread_index)
1048         {
1049           rc = IP4_REASS_RC_HANDOFF;
1050         }
1051     }
1052   else
1053     {
1054       if (consumed)
1055         {
1056           *bi0 = ~0;
1057           if (reass->fragments_n > rm->max_reass_len)
1058             {
1059               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1060             }
1061         }
1062       else
1063         {
1064           *next0 = IP4_FULL_REASS_NEXT_DROP;
1065           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1066         }
1067     }
1068   return rc;
1069 }
1070
/**
 * @brief Main IPv4 full reassembly node function, shared by all flavors.
 *
 * Processes one frame of buffers under the per-thread spinlock: whole
 * (unfragmented) packets pass straight through; fragments are keyed by
 * (fib index, src, dst, fragment id, protocol) and fed into their
 * reassembly context; buffers whose context lives on another thread are
 * handed off; malformed fragments are dropped.
 *
 * @param vm             vlib main
 * @param node           this node's runtime
 * @param frame          frame of buffer indices to process
 * @param is_feature     true when running as an ip4-unicast feature node
 * @param is_custom_app  true when the app supplies the next node index
 * @return number of vectors processed (always frame->n_vectors)
 */
always_inline uword
ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
		       vlib_frame_t * frame, bool is_feature,
		       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serialize against the expiry walk touching this thread's pool */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0;
	  u32 error0 = IP4_ERROR_NONE;

	  bi0 = from[0];
	  b0 = vlib_get_buffer (vm, bi0);

	  ip4_header_t *ip0 = vlib_buffer_get_current (b0);
	  if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
	    {
	      // this is a whole packet - no fragmentation
	      if (!is_custom_app)
		{
		  next0 = IP4_FULL_REASS_NEXT_INPUT;
		}
	      else
		{
		  next0 = vnet_buffer (b0)->ip.reass.next_index;
		}
	      goto packet_enqueue;
	    }
	  const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
	  const u32 fragment_length =
	    clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
	  const u32 fragment_last = fragment_first + fragment_length - 1;
	  /* sanity checks: non-empty payload, total length fits an IPv4
	   * packet, and non-last fragments carry at least the minimum */
	  if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))	// 8 is minimum frag length per RFC 791
	    {
	      next0 = IP4_FULL_REASS_NEXT_DROP;
	      error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
	      goto packet_enqueue;
	    }
	  ip4_full_reass_kv_t kv;
	  u8 do_handoff = 0;

	  /* reassembly key: fib index + src address in word 0,
	   * dst address + fragment id + protocol in word 1 */
	  kv.k.as_u64[0] =
	    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
			   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
	    (u64) ip0->src_address.as_u32 << 32;
	  kv.k.as_u64[1] =
	    (u64) ip0->dst_address.
	    as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

	  ip4_full_reass_t *reass =
	    ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
					   &do_handoff);

	  if (reass)
	    {
	      /* the thread receiving the first fragment sends out the
	       * reassembled packet */
	      const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
	      if (0 == fragment_first)
		{
		  reass->sendout_thread_index = vm->thread_index;
		}
	    }

	  if (PREDICT_FALSE (do_handoff))
	    {
	      /* context is owned by another thread - hand the buffer off */
	      next0 = IP4_FULL_REASS_NEXT_HANDOFF;
	      if (is_feature)
		vnet_buffer (b0)->ip.reass.owner_feature_thread_index =
		  kv.v.memory_owner_thread_index;
	      else
		vnet_buffer (b0)->ip.reass.owner_thread_index =
		  kv.v.memory_owner_thread_index;
	    }
	  else if (reass)
	    {
	      u32 handoff_thread_idx;
	      switch (ip4_full_reass_update
		      (vm, node, rm, rt, reass, &bi0, &next0,
		       &error0, is_custom_app, &handoff_thread_idx))
		{
		case IP4_REASS_RC_OK:
		  /* nothing to do here */
		  break;
		case IP4_REASS_RC_HANDOFF:
		  /* packet finalized but must be sent by another thread;
		   * bi0 may have changed, re-fetch the buffer */
		  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
		  b0 = vlib_get_buffer (vm, bi0);
		  if (is_feature)
		    vnet_buffer (b0)->ip.reass.owner_feature_thread_index =
		      handoff_thread_idx;
		  else
		    vnet_buffer (b0)->ip.reass.owner_thread_index =
		      handoff_thread_idx;
		  break;
		case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
		  /* error paths: count, drop all fragments, free context */
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
					       1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		case IP4_REASS_RC_NO_BUF:
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_NO_BUF, 1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		case IP4_REASS_RC_INTERNAL_ERROR:
		  /* drop everything and start with a clean slate */
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_INTERNAL_ERROR,
					       1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		}
	    }
	  else
	    {
	      /* no context could be allocated - configured limit reached */
	      next0 = IP4_FULL_REASS_NEXT_DROP;
	      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
	    }


	packet_enqueue:
	  b0->error = node->errors[error0];

	  /* bi0 == ~0 means the fragment was consumed by the context */
	  if (bi0 != ~0)
	    {
	      to_next[0] = bi0;
	      to_next += 1;
	      n_left_to_next -= 1;
	      if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
		{
		  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
		    {
		      if (is_feature)
			ip4_full_reass_add_trace (vm, node, rm, NULL,
						  bi0, HANDOFF, 0,
						  vnet_buffer (b0)->ip.
						  reass.owner_feature_thread_index);
		      else
			ip4_full_reass_add_trace (vm, node, rm, NULL,
						  bi0, HANDOFF, 0,
						  vnet_buffer (b0)->ip.
						  reass.owner_thread_index);
		    }
		}
	      else if (is_feature && IP4_ERROR_NONE == error0)
		{
		  /* continue along the feature arc */
		  b0 = vlib_get_buffer (vm, bi0);
		  vnet_feature_next (&next0, b0);
		}
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       bi0, next0);
	      IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
	    }

	next_packet:
	  from += 1;
	  n_left_from -= 1;
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1255
/* Error counter strings generated from the shared ip4 error list so the
 * indices line up with the ip4 error codes stored in b0->error. */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1261
/* Non-feature, non-custom-app flavor of the reassembly node. */
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
				    vlib_node_runtime_t * node,
				    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
				false /* is_custom_app */ );
}
1269
/* *INDENT-OFF* */
/* Graph node registration for the standalone reassembly node. */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1287
/* Feature-arc flavor of the reassembly node. */
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
				false /* is_custom_app */ );
}
1295
/* *INDENT-OFF* */
/* Graph node registration for the feature-arc reassembly node; note the
 * feature flavor hands off to its own handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1312
/* *INDENT-OFF* */
/* Register the feature node on the ip4-unicast arc; it must run before
 * lookup and IPsec so those see only whole packets. */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1322
1323 #ifndef CLIB_MARCH_VARIANT
1324 always_inline u32
1325 ip4_full_reass_get_nbuckets ()
1326 {
1327   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1328   u32 nbuckets;
1329   u8 i;
1330
1331   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1332
1333   for (i = 0; i < 31; i++)
1334     if ((1 << i) >= nbuckets)
1335       break;
1336   nbuckets = 1 << i;
1337
1338   return nbuckets;
1339 }
1340 #endif /* CLIB_MARCH_VARIANT */
1341
/* Events delivered to the expiry-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,	/* reassembly parameters were updated */
} ip4_full_reass_event_t;
1346
/* Context passed to ip4_rehash_cb while copying entries into a bigger
 * hash table. */
typedef struct
{
  int failure;			/* set when any insertion into new_hash fails */
  clib_bihash_16_8_t *new_hash;	/* destination table */
} ip4_rehash_cb_ctx;
1352
1353 #ifndef CLIB_MARCH_VARIANT
1354 static void
1355 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1356 {
1357   ip4_rehash_cb_ctx *ctx = _ctx;
1358   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1359     {
1360       ctx->failure = 1;
1361     }
1362 }
1363
1364 static void
1365 ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1366                            u32 max_reassembly_length,
1367                            u32 expire_walk_interval_ms)
1368 {
1369   ip4_full_reass_main.timeout_ms = timeout_ms;
1370   ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1371   ip4_full_reass_main.max_reass_n = max_reassemblies;
1372   ip4_full_reass_main.max_reass_len = max_reassembly_length;
1373   ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1374 }
1375
/**
 * @brief API entry point to (re)configure IPv4 full reassembly.
 *
 * Applies the new parameters, wakes the expiry-walk process so it picks
 * up the new walk interval, and, if the configuration requires a bigger
 * hash table, migrates all existing entries into a freshly sized one.
 *
 * @return 0 on success, -1 if migrating entries to the new table failed
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
		    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
			     max_reassembly_length, expire_walk_interval_ms);
  /* notify the expiry process that the configuration changed */
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
			     ip4_full_reass_main.ip4_full_reass_expire_node_idx,
			     IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  /* only grow the table - shrinking would require eviction */
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
			     new_nbuckets * 1024);
      /* copy every live entry into the new table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
					       ip4_rehash_cb, &ctx);
      if (ctx.failure)
	{
	  /* keep the old table intact on failure */
	  clib_bihash_free_16_8 (&new_hash);
	  return -1;
	}
      else
	{
	  /* swap in the new table and fix up internal pointers */
	  clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
	  clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
			    sizeof (ip4_full_reass_main.hash));
	  clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
	}
    }
  return 0;
}
1413
1414 vnet_api_error_t
1415 ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1416                     u32 * max_reassembly_length,
1417                     u32 * expire_walk_interval_ms)
1418 {
1419   *timeout_ms = ip4_full_reass_main.timeout_ms;
1420   *max_reassemblies = ip4_full_reass_main.max_reass_n;
1421   *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1422   *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1423   return 0;
1424 }
1425
/**
 * @brief Plugin/node initialization for IPv4 full reassembly.
 *
 * Allocates per-thread state (one lock + pool per worker plus main
 * thread), caches the indices of helper nodes, applies the default
 * configuration, sizes the hash table and sets up the handoff frame
 * queues.
 *
 * @return NULL on success (clib error convention)
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one entry per worker thread plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  /* defaults must be set before sizing the hash table below */
  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
			     IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
			     IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
			     IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes */
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);

  return error;
}
1466
1467 VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1468 #endif /* CLIB_MARCH_VARIANT */
1469
/**
 * @brief Process node that periodically expires stale reassemblies.
 *
 * Sleeps for the configured walk interval (or until a config-changed
 * event), then walks every thread's pool under that thread's lock,
 * collects contexts whose last_heard timestamp is older than the
 * timeout, and drops their fragments and frees them.
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
			     vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
					    (f64)
					    rm->expire_walk_interval_ms /
					    (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
	{
	case ~0:		/* no events => timeout */
	  /* nothing to do here */
	  break;
	case IP4_EVENT_CONFIG_CHANGED:
	  /* wake-up only; the new interval takes effect on the next wait */
	  break;
	default:
	  clib_warning ("BUG: event type 0x%wx", event_type);
	  break;
	}
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
	{
	  ip4_full_reass_per_thread_t *rt =
	    &rm->per_thread_data[thread_index];
	  clib_spinlock_lock (&rt->lock);

	  /* collect first, free second - freeing inside pool_foreach_index
	   * would invalidate the iteration */
	  vec_reset_length (pool_indexes_to_free);
	  /* *INDENT-OFF* */
	  pool_foreach_index (index, rt->pool, ({
				reass = pool_elt_at_index (rt->pool, index);
				if (now > reass->last_heard + rm->timeout)
				  {
				    vec_add1 (pool_indexes_to_free, index);
				  }
			      }));
	  /* *INDENT-ON* */
	  int *i;
	  /* *INDENT-OFF* */
	  vec_foreach (i, pool_indexes_to_free)
	  {
	    ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
	    ip4_full_reass_drop_all (vm, node, rm, reass);
	    ip4_full_reass_free (rm, rt, reass);
	  }
	  /* *INDENT-ON* */

	  clib_spinlock_unlock (&rt->lock);
	}

      vec_free (pool_indexes_to_free);
      if (event_data)
	{
	  /* keep the event vector allocated, just empty it for reuse */
	  _vec_len (event_data) = 0;
	}
    }

  return 0;
}
1542
/* *INDENT-OFF* */
/* Registration of the expiry walk as a process node. */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1554
1555 static u8 *
1556 format_ip4_full_reass_key (u8 * s, va_list * args)
1557 {
1558   ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1559   s =
1560     format (s,
1561             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1562             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1563             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1564   return s;
1565 }
1566
1567 static u8 *
1568 format_ip4_reass (u8 * s, va_list * args)
1569 {
1570   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1571   ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1572
1573   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1574               "last_packet_octet: %u, trace_op_counter: %u\n",
1575               reass->id, format_ip4_full_reass_key, &reass->key,
1576               reass->first_bi, reass->data_len,
1577               reass->last_packet_octet, reass->trace_op_counter);
1578
1579   u32 bi = reass->first_bi;
1580   u32 counter = 0;
1581   while (~0 != bi)
1582     {
1583       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1584       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1585       s =
1586         format (s,
1587                 "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1588                 "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1589                 vnb->ip.reass.range_last, bi,
1590                 ip4_full_reass_buffer_get_data_offset (b),
1591                 ip4_full_reass_buffer_get_data_len (b),
1592                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1593       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1594         {
1595           bi = b->next_buffer;
1596         }
1597       else
1598         {
1599           bi = ~0;
1600         }
1601     }
1602   return s;
1603 }
1604
/**
 * @brief CLI handler for "show ip4-full-reassembly [details]".
 *
 * Prints the per-thread reassembly count summed over all threads; with
 * "details" also dumps every in-progress reassembly context.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
		unformat_input_t * input,
		CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_full_reass_t *reass;
  uword thread_index;
  /* walk all worker threads plus the main thread */
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      /* lock out the datapath and expiry walk while reading the pool */
      clib_spinlock_lock (&rt->lock);
      if (details)
	{
	  /* *INDENT-OFF* */
	  pool_foreach (reass, rt->pool, {
	    vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
	  });
	  /* *INDENT-ON* */
	}
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
		   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
		   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
		   (long unsigned) rm->max_reass_n);
  return 0;
}
1648
/* *INDENT-OFF* */
/* CLI command registration for the status display above. */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1656
1657 #ifndef CLIB_MARCH_VARIANT
1658 vnet_api_error_t
1659 ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1660 {
1661   return vnet_feature_enable_disable ("ip4-unicast",
1662                                       "ip4-full-reassembly-feature",
1663                                       sw_if_index, enable_disable, 0, 0);
1664 }
1665 #endif /* CLIB_MARCH_VARIANT */
1666
1667
/* Error counters specific to the handoff nodes. */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")
1670
1671
/* Handoff error codes generated from the list above. */
typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;
1679
/* Handoff error strings, index-aligned with the enum above. */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1685
/* Trace record captured by the handoff nodes. */
typedef struct
{
  u32 next_worker_index;	/* thread the buffer is handed to */
} ip4_full_reass_handoff_trace_t;
1690
1691 static u8 *
1692 format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1693 {
1694   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1695   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1696   ip4_full_reass_handoff_trace_t *t =
1697     va_arg (*args, ip4_full_reass_handoff_trace_t *);
1698
1699   s =
1700     format (s, "ip4-full-reassembly-handoff: next-worker %d",
1701             t->next_worker_index);
1702
1703   return s;
1704 }
1705
/**
 * @brief Shared handoff node function.
 *
 * For every buffer in the frame, reads the owner thread index stamped
 * into the buffer's opaque data (the feature and non-feature nodes use
 * different fields) and enqueues the buffer to the matching per-thread
 * frame queue. Buffers that cannot be enqueued due to congestion are
 * counted as drops.
 *
 * @param is_feature selects the feature-flavor frame queue and owner field
 * @return number of vectors processed (always frame->n_vectors)
 */
always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
				    vlib_node_runtime_t * node,
				    vlib_frame_t * frame, bool is_feature)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* build the per-buffer destination thread index array */
  while (n_left_from > 0)
    {
      ti[0] =
	(is_feature) ? vnet_buffer (b[0])->ip.
	reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
	reass.owner_thread_index;

      if (PREDICT_FALSE
	  ((node->flags & VLIB_NODE_FLAG_TRACE)
	   && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
	{
	  ip4_full_reass_handoff_trace_t *t =
	    vlib_add_trace (vm, node, b[0], sizeof (*t));
	  t->next_worker_index = ti[0];
	}

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
				   frame->n_vectors, 1);

  /* anything not enqueued was dropped due to full frame queues */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
				 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
				 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1757
/* Non-feature-arc handoff node: dispatch to the shared inline worker
 * with is_feature = false (selects rm->fq_index and the
 * owner_thread_index metadata field). */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
					     false /* is_feature */ );
}
1765
1766
/* *INDENT-OFF* */
/* Graph-node registration for the non-feature handoff node.  Buffers
 * dropped on frame-queue congestion go to error-drop (the only next
 * node). */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1782
1783
/* *INDENT-OFF* */
/* Feature-arc handoff node: dispatch to the shared inline worker with
 * is_feature = true (selects rm->fq_feature_index and the
 * owner_feature_thread_index metadata field). */
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1794
1795
/* *INDENT-OFF* */
/* Graph-node registration for the feature-arc handoff node.  Shares
 * the error strings and trace formatter with the non-feature node;
 * congestion drops go to error-drop. */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1811
1812 /*
1813  * fd.io coding-style-patch-verification: ON
1814  *
1815  * Local Variables:
1816  * eval: (c-set-style "gnu")
1817  * End:
1818  */