ip: fix use-after-free in reassembly
[vpp.git] / src / vnet / ip / reass / ip4_full_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Full Reassembly.
19  *
20  * This file contains the source code for IPv4 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/fifo.h>
27 #include <vppinfra/bihash_16_8.h>
28 #include <vnet/ip/reass/ip4_full_reass.h>
29 #include <stddef.h>
30
// milliseconds per second - used to convert configured ms values to f64 seconds
#define MSEC_PER_SEC 1000
// default time to wait for all fragments before expiring a reassembly
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
// default cap on the number of concurrent reassembly contexts (per thread pool)
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
// default cap on the number of fragments accepted into one reassembly
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
// sizing factor for the bihash lookup table
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

// set to 1 to enable printf-based dumping of buffer chains (debugging aid)
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
// print the buffer index of bi and every chained buffer that follows it
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
// internal return codes of the reassembly helpers
typedef enum
{
  IP4_REASS_RC_OK,
  // configured max_reass_len fragment count was exceeded
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  // inconsistent internal state was detected
  IP4_REASS_RC_INTERNAL_ERROR,
  // buffer (chain) operation failed - e.g. linearization could not allocate
  IP4_REASS_RC_NO_BUF,
  // fragment belongs to a context owned by another thread - hand it off
  IP4_REASS_RC_HANDOFF,
} ip4_full_reass_rc_t;
68
// 16-byte lookup key identifying one reassembly:
// context id + src/dst address + fragment id + protocol
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    // flat view used when copying the key in/out of the bihash
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;
85
// bihash value: locates the reassembly context in the per-thread pools
typedef union
{
  struct
  {
    // index into the owning thread's context pool
    u32 reass_index;
    // thread whose pool holds the context
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_full_reass_val_t;
95
// convenience overlay of key + value onto a raw bihash kv pair
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106 always_inline u32
107 ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108 {
109   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111 }
112
113 always_inline u16
114 ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115 {
116   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118     (vnb->ip.reass.fragment_first +
119      ip4_full_reass_buffer_get_data_offset (b)) + 1;
120 }
121
// per-reassembly context - tracks all fragments of one original packet
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context (~0 if none yet)
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
// per-worker-thread reassembly state
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_full_reass_t *pool;
  // number of contexts currently in use
  u32 reass_n;
  // source of locally unique reassembly ids
  u32 id_counter;
  // protects the pool against cross-thread access
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
160
// global state of the IPv4 full reassembly feature
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout_ms converted to seconds (f64), compared against last_heard
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  // maps ip4_full_reass_key_t -> ip4_full_reass_val_t
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

} ip4_full_reass_main_t;
189
190 extern ip4_full_reass_main_t ip4_full_reass_main;
191
192 #ifndef CLIB_MARCH_VARIANT
193 ip4_full_reass_main_t ip4_full_reass_main;
194 #endif /* CLIB_MARCH_VARIANT */
195
// next-node dispositions used by the reassembly graph nodes
typedef enum
{
  IP4_FULL_REASS_NEXT_INPUT,
  IP4_FULL_REASS_NEXT_DROP,
  IP4_FULL_REASS_NEXT_HANDOFF,
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
203
// kinds of operations recorded in packet traces
typedef enum
{
  RANGE_NEW,
  RANGE_SHRINK,
  RANGE_DISCARD,
  RANGE_OVERLAP,
  FINALIZE,
  HANDOFF,
} ip4_full_reass_trace_operation_e;
213
// snapshot of one fragment range for tracing purposes
typedef struct
{
  // first/last byte offsets covered by this range
  u16 range_first;
  u16 range_last;
  // buffer index of the range's head buffer
  u32 range_bi;
  // offset of usable data within the fragment
  i32 data_offset;
  // usable data length of this range
  u32 data_len;
  // head buffer of the whole reassembly chain
  u32 first_bi;
} ip4_full_reass_range_trace_t;
223
// one trace record - describes a single operation on a reassembly context
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  // ~0 when the operation was not tied to an existing context
  u32 reass_id;
  ip4_full_reass_range_trace_t trace_range;
  // bytes trimmed in a RANGE_SHRINK operation
  u32 size_diff;
  u32 op_id;
  // executing thread and (for HANDOFF) the destination thread
  u32 thread_id;
  u32 thread_id_to;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
} ip4_full_reass_trace_t;
237
238 extern vlib_node_registration_t ip4_full_reass_node;
239 extern vlib_node_registration_t ip4_full_reass_node_feature;
240
241 static void
242 ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
243                               ip4_full_reass_range_trace_t * trace)
244 {
245   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
246   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
247   trace->range_first = vnb->ip.reass.range_first;
248   trace->range_last = vnb->ip.reass.range_last;
249   trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
250   trace->data_len = ip4_full_reass_buffer_get_data_len (b);
251   trace->range_bi = bi;
252 }
253
254 static u8 *
255 format_ip4_full_reass_range_trace (u8 * s, va_list * args)
256 {
257   ip4_full_reass_range_trace_t *trace =
258     va_arg (*args, ip4_full_reass_range_trace_t *);
259   s =
260     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
261             trace->range_last, trace->data_offset, trace->data_len,
262             trace->range_bi);
263   return s;
264 }
265
266 static u8 *
267 format_ip4_full_reass_trace (u8 * s, va_list * args)
268 {
269   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
270   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
271   ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
272   u32 indent = 0;
273   if (~0 != t->reass_id)
274     {
275       s = format (s, "reass id: %u, op id: %u, ", t->reass_id, t->op_id);
276       indent = format_get_indent (s);
277       s =
278         format (s,
279                 "first bi: %u, data len: %u, ip/fragment[%u, %u]",
280                 t->trace_range.first_bi, t->total_data_len, t->fragment_first,
281                 t->fragment_last);
282     }
283   switch (t->action)
284     {
285     case RANGE_SHRINK:
286       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
287                   format_ip4_full_reass_range_trace, &t->trace_range,
288                   t->size_diff);
289       break;
290     case RANGE_DISCARD:
291       s = format (s, "\n%Udiscard %U", format_white_space, indent,
292                   format_ip4_full_reass_range_trace, &t->trace_range);
293       break;
294     case RANGE_NEW:
295       s = format (s, "\n%Unew %U", format_white_space, indent,
296                   format_ip4_full_reass_range_trace, &t->trace_range);
297       break;
298     case RANGE_OVERLAP:
299       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
300                   format_ip4_full_reass_range_trace, &t->trace_range);
301       break;
302     case FINALIZE:
303       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
304       break;
305     case HANDOFF:
306       s =
307         format (s, "handoff from thread #%u to thread #%u", t->thread_id,
308                 t->thread_id_to);
309       break;
310     }
311   return s;
312 }
313
/* Record one reassembly operation (action) on buffer bi into the packet
 * trace. reass may be NULL for operations not tied to an existing context,
 * in which case sentinel values are recorded. Increments the context's
 * trace_op_counter as a side effect when reass is non-NULL. */
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_full_reass_main_t * rm,
                          ip4_full_reass_t * reass, u32 bi,
                          ip4_full_reass_trace_operation_e action,
                          u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      ++reass->trace_op_counter;
    }
  else
    {
      // no context - mark the record so the formatter skips the header line
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
354
355 always_inline void
356 ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
357                          ip4_full_reass_t * reass)
358 {
359   pool_put (rt->pool, reass);
360   --rt->reass_n;
361 }
362
363 always_inline void
364 ip4_full_reass_free (ip4_full_reass_main_t * rm,
365                      ip4_full_reass_per_thread_t * rt,
366                      ip4_full_reass_t * reass)
367 {
368   clib_bihash_kv_16_8_t kv;
369   kv.key[0] = reass->key.as_u64[0];
370   kv.key[1] = reass->key.as_u64[1];
371   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
372   return ip4_full_reass_free_ctx (rt, reass);
373 }
374
/* Release every buffer held by the reassembly context. Walks the range
 * chain, unlinking each per-range buffer chain as it goes, then either
 * forwards the buffers to the context's error_next_index (custom apps) or
 * frees them outright. Does not free the context itself. */
always_inline void
ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      // collect every buffer of this range's chain, breaking the chain links
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      // enqueue collected buffers to the error next node, frame by frame
      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (vec_len (to_free) > 0 && n_left_to_next > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  to_next[0] = bi;
                  to_next += 1;
                  n_left_to_next -= 1;
                }
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
}
435
436 always_inline void
437 ip4_full_reass_init (ip4_full_reass_t * reass)
438 {
439   reass->first_bi = ~0;
440   reass->last_packet_octet = ~0;
441   reass->data_len = 0;
442   reass->next_index = ~0;
443   reass->error_next_index = ~0;
444 }
445
/* Look up the reassembly context for *kv, creating one if absent.
 * Returns NULL when the context limit is reached or creation raced and
 * failed; sets *do_handoff when the context is owned by another thread.
 * A context idle longer than the configured timeout is dropped and
 * recreated. */
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                               ip4_full_reass_main_t * rm,
                               ip4_full_reass_per_thread_t * rt,
                               ip4_full_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      // hash hit - resolve the context in the owner thread's pool
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);
      if (vm->thread_index != reass->memory_owner_thread_index)
        {
          // another thread owns this context - caller must hand off
          *do_handoff = 1;
          return reass;
        }

      if (now > reass->last_heard + rm->timeout)
        {
          // stale context - drop its buffers and start over
          ip4_full_reass_drop_all (vm, node, rm, reass);
          ip4_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      // context limit reached - caller will drop the fragment
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  // is_add == 2: add only if not already present (detects creation races)
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
521
/* Splice all collected ranges into a single buffer chain forming the
 * reassembled packet: trim overlap/duplicate bytes and the per-fragment IP
 * headers, fix up the first buffer's IP header (length, offset, checksum),
 * linearize the chain and release the context. On success *bi0/*next0/
 * *error0 describe the finished packet. On any non-OK return the context is
 * left in a partially-modified state - caller treats it as fatal. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  // walk the sorted range chain; each range is itself a buffer sub-chain
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      // bytes to cut from the front (fragment's IP header + shrunk overlap)
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // walk this range's buffer chain, trimming and splicing as we go
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // link this buffer to the end of the reassembled chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // last buffer contributing data - truncate tail trim
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              // no more data wanted from this range - free trailing buffers
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the IP header of the now-complete packet
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
                                FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  // context is done - remove from hash and release it
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
725
726 always_inline ip4_full_reass_rc_t
727 ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
728                                       ip4_full_reass_main_t * rm,
729                                       ip4_full_reass_per_thread_t * rt,
730                                       ip4_full_reass_t * reass,
731                                       u32 prev_range_bi, u32 new_next_bi)
732 {
733   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
734   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
735   if (~0 != prev_range_bi)
736     {
737       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
738       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
739       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
740       prev_vnb->ip.reass.next_range_bi = new_next_bi;
741     }
742   else
743     {
744       if (~0 != reass->first_bi)
745         {
746           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
747         }
748       reass->first_bi = new_next_bi;
749     }
750   vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
751   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
752       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
753     {
754       return IP4_REASS_RC_INTERNAL_ERROR;
755     }
756   reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
757   return IP4_REASS_RC_OK;
758 }
759
/* Unlink the range headed by discard_bi from the reassembly's range chain
 * (prev_range_bi == ~0 means it is the head), subtract its contribution
 * from data_len, and free every buffer in the range's chain. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
                                        ip4_full_reass_main_t * rm,
                                        ip4_full_reass_t * reass,
                                        u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      // predecessor must actually point at the range being removed
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      // removing the head range
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  // free the whole buffer chain of the discarded range, one buffer at a time
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
                                    RANGE_DISCARD, 0, ~0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
815
816 always_inline ip4_full_reass_rc_t
817 ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
818                        ip4_full_reass_main_t * rm,
819                        ip4_full_reass_per_thread_t * rt,
820                        ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
821                        u32 * error0, bool is_custom_app,
822                        u32 * handoff_thread_idx)
823 {
824   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
825   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
826   if (is_custom_app)
827     {
828       // store (error_)next_index before it's overwritten
829       reass->next_index = fvnb->ip.reass.next_index;
830       reass->error_next_index = fvnb->ip.reass.error_next_index;
831     }
832   ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
833   int consumed = 0;
834   ip4_header_t *fip = vlib_buffer_get_current (fb);
835   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
836   const u32 fragment_length =
837     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
838   const u32 fragment_last = fragment_first + fragment_length - 1;
839   fvnb->ip.reass.fragment_first = fragment_first;
840   fvnb->ip.reass.fragment_last = fragment_last;
841   int more_fragments = ip4_get_fragment_more (fip);
842   u32 candidate_range_bi = reass->first_bi;
843   u32 prev_range_bi = ~0;
844   fvnb->ip.reass.range_first = fragment_first;
845   fvnb->ip.reass.range_last = fragment_last;
846   fvnb->ip.reass.next_range_bi = ~0;
847   if (!more_fragments)
848     {
849       reass->last_packet_octet = fragment_last;
850     }
851   if (~0 == reass->first_bi)
852     {
853       // starting a new reassembly
854       rc =
855         ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
856                                               prev_range_bi, *bi0);
857       if (IP4_REASS_RC_OK != rc)
858         {
859           return rc;
860         }
861       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
862         {
863           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
864                                     ~0);
865         }
866       *bi0 = ~0;
867       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
868       reass->fragments_n = 1;
869       return IP4_REASS_RC_OK;
870     }
871   reass->min_fragment_length =
872     clib_min (clib_net_to_host_u16 (fip->length),
873               fvnb->ip.reass.estimated_mtu);
874   while (~0 != candidate_range_bi)
875     {
876       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
877       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
878       if (fragment_first > candidate_vnb->ip.reass.range_last)
879         {
880           // this fragments starts after candidate range
881           prev_range_bi = candidate_range_bi;
882           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
883           if (candidate_vnb->ip.reass.range_last < fragment_last &&
884               ~0 == candidate_range_bi)
885             {
886               // special case - this fragment falls beyond all known ranges
887               rc =
888                 ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
889                                                       prev_range_bi, *bi0);
890               if (IP4_REASS_RC_OK != rc)
891                 {
892                   return rc;
893                 }
894               consumed = 1;
895               break;
896             }
897           continue;
898         }
899       if (fragment_last < candidate_vnb->ip.reass.range_first)
900         {
901           // this fragment ends before candidate range without any overlap
902           rc =
903             ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
904                                                   prev_range_bi, *bi0);
905           if (IP4_REASS_RC_OK != rc)
906             {
907               return rc;
908             }
909           consumed = 1;
910         }
911       else
912         {
913           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
914               fragment_last <= candidate_vnb->ip.reass.range_last)
915             {
916               // this fragment is a (sub)part of existing range, ignore it
917               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
918                 {
919                   ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
920                                             RANGE_OVERLAP, 0, ~0);
921                 }
922               break;
923             }
924           int discard_candidate = 0;
925           if (fragment_first < candidate_vnb->ip.reass.range_first)
926             {
927               u32 overlap =
928                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
929               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
930                 {
931                   candidate_vnb->ip.reass.range_first += overlap;
932                   if (reass->data_len < overlap)
933                     {
934                       return IP4_REASS_RC_INTERNAL_ERROR;
935                     }
936                   reass->data_len -= overlap;
937                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
938                     {
939                       ip4_full_reass_add_trace (vm, node, rm, reass,
940                                                 candidate_range_bi,
941                                                 RANGE_SHRINK, 0, ~0);
942                     }
943                   rc =
944                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
945                                                           prev_range_bi,
946                                                           *bi0);
947                   if (IP4_REASS_RC_OK != rc)
948                     {
949                       return rc;
950                     }
951                   consumed = 1;
952                 }
953               else
954                 {
955                   discard_candidate = 1;
956                 }
957             }
958           else if (fragment_last > candidate_vnb->ip.reass.range_last)
959             {
960               u32 overlap =
961                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
962               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
963                 {
964                   fvnb->ip.reass.range_first += overlap;
965                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
966                     {
967                       prev_range_bi = candidate_range_bi;
968                       candidate_range_bi =
969                         candidate_vnb->ip.reass.next_range_bi;
970                       continue;
971                     }
972                   else
973                     {
974                       // special case - last range discarded
975                       rc =
976                         ip4_full_reass_insert_range_in_chain (vm, rm, rt,
977                                                               reass,
978                                                               candidate_range_bi,
979                                                               *bi0);
980                       if (IP4_REASS_RC_OK != rc)
981                         {
982                           return rc;
983                         }
984                       consumed = 1;
985                     }
986                 }
987               else
988                 {
989                   discard_candidate = 1;
990                 }
991             }
992           else
993             {
994               discard_candidate = 1;
995             }
996           if (discard_candidate)
997             {
998               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
999               // discard candidate range, probe next range
1000               rc =
1001                 ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
1002                                                         prev_range_bi,
1003                                                         candidate_range_bi);
1004               if (IP4_REASS_RC_OK != rc)
1005                 {
1006                   return rc;
1007                 }
1008               if (~0 != next_range_bi)
1009                 {
1010                   candidate_range_bi = next_range_bi;
1011                   continue;
1012                 }
1013               else
1014                 {
1015                   // special case - last range discarded
1016                   rc =
1017                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
1018                                                           prev_range_bi,
1019                                                           *bi0);
1020                   if (IP4_REASS_RC_OK != rc)
1021                     {
1022                       return rc;
1023                     }
1024                   consumed = 1;
1025                 }
1026             }
1027         }
1028       break;
1029     }
1030   ++reass->fragments_n;
1031   if (consumed)
1032     {
1033       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1034         {
1035           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
1036                                     ~0);
1037         }
1038     }
1039   if (~0 != reass->last_packet_octet &&
1040       reass->data_len == reass->last_packet_octet + 1)
1041     {
1042       *handoff_thread_idx = reass->sendout_thread_index;
1043       int handoff =
1044         reass->memory_owner_thread_index != reass->sendout_thread_index;
1045       rc =
1046         ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1047                                  is_custom_app);
1048       if (IP4_REASS_RC_OK == rc && handoff)
1049         {
1050           rc = IP4_REASS_RC_HANDOFF;
1051         }
1052     }
1053   else
1054     {
1055       if (consumed)
1056         {
1057           *bi0 = ~0;
1058           if (reass->fragments_n > rm->max_reass_len)
1059             {
1060               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1061             }
1062         }
1063       else
1064         {
1065           *next0 = IP4_FULL_REASS_NEXT_DROP;
1066           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1067         }
1068     }
1069   return rc;
1070 }
1071
1072 always_inline uword
1073 ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
1074                        vlib_frame_t * frame, bool is_feature,
1075                        bool is_custom_app)
1076 {
1077   u32 *from = vlib_frame_vector_args (frame);
1078   u32 n_left_from, n_left_to_next, *to_next, next_index;
1079   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1080   ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
1081   clib_spinlock_lock (&rt->lock);
1082
1083   n_left_from = frame->n_vectors;
1084   next_index = node->cached_next_index;
1085   while (n_left_from > 0)
1086     {
1087       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1088
1089       while (n_left_from > 0 && n_left_to_next > 0)
1090         {
1091           u32 bi0;
1092           vlib_buffer_t *b0;
1093           u32 next0;
1094           u32 error0 = IP4_ERROR_NONE;
1095
1096           bi0 = from[0];
1097           b0 = vlib_get_buffer (vm, bi0);
1098
1099           ip4_header_t *ip0 = vlib_buffer_get_current (b0);
1100           if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
1101             {
1102               // this is a whole packet - no fragmentation
1103               if (!is_custom_app)
1104                 {
1105                   next0 = IP4_FULL_REASS_NEXT_INPUT;
1106                 }
1107               else
1108                 {
1109                   next0 = vnet_buffer (b0)->ip.reass.next_index;
1110                 }
1111               goto packet_enqueue;
1112             }
1113           const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
1114           const u32 fragment_length =
1115             clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
1116           const u32 fragment_last = fragment_first + fragment_length - 1;
1117           if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
1118             {
1119               next0 = IP4_FULL_REASS_NEXT_DROP;
1120               error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
1121               goto packet_enqueue;
1122             }
1123           ip4_full_reass_kv_t kv;
1124           u8 do_handoff = 0;
1125
1126           kv.k.as_u64[0] =
1127             (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
1128                            vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
1129             (u64) ip0->src_address.as_u32 << 32;
1130           kv.k.as_u64[1] =
1131             (u64) ip0->dst_address.
1132             as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
1133
1134           ip4_full_reass_t *reass =
1135             ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
1136                                            &do_handoff);
1137
1138           if (reass)
1139             {
1140               const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
1141               if (0 == fragment_first)
1142                 {
1143                   reass->sendout_thread_index = vm->thread_index;
1144                 }
1145             }
1146
1147           if (PREDICT_FALSE (do_handoff))
1148             {
1149               next0 = IP4_FULL_REASS_NEXT_HANDOFF;
1150               vnet_buffer (b0)->ip.reass.owner_thread_index =
1151                 kv.v.memory_owner_thread_index;
1152             }
1153           else if (reass)
1154             {
1155               u32 handoff_thread_idx;
1156               switch (ip4_full_reass_update
1157                       (vm, node, rm, rt, reass, &bi0, &next0,
1158                        &error0, is_custom_app, &handoff_thread_idx))
1159                 {
1160                 case IP4_REASS_RC_OK:
1161                   /* nothing to do here */
1162                   break;
1163                 case IP4_REASS_RC_HANDOFF:
1164                   next0 = IP4_FULL_REASS_NEXT_HANDOFF;
1165                   b0 = vlib_get_buffer (vm, bi0);
1166                   vnet_buffer (b0)->ip.reass.owner_thread_index =
1167                     handoff_thread_idx;
1168                   break;
1169                 case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
1170                   vlib_node_increment_counter (vm, node->node_index,
1171                                                IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
1172                                                1);
1173                   ip4_full_reass_drop_all (vm, node, rm, reass);
1174                   ip4_full_reass_free (rm, rt, reass);
1175                   goto next_packet;
1176                   break;
1177                 case IP4_REASS_RC_NO_BUF:
1178                   vlib_node_increment_counter (vm, node->node_index,
1179                                                IP4_ERROR_REASS_NO_BUF, 1);
1180                   ip4_full_reass_drop_all (vm, node, rm, reass);
1181                   ip4_full_reass_free (rm, rt, reass);
1182                   goto next_packet;
1183                   break;
1184                 case IP4_REASS_RC_INTERNAL_ERROR:
1185                   /* drop everything and start with a clean slate */
1186                   vlib_node_increment_counter (vm, node->node_index,
1187                                                IP4_ERROR_REASS_INTERNAL_ERROR,
1188                                                1);
1189                   ip4_full_reass_drop_all (vm, node, rm, reass);
1190                   ip4_full_reass_free (rm, rt, reass);
1191                   goto next_packet;
1192                   break;
1193                 }
1194             }
1195           else
1196             {
1197               next0 = IP4_FULL_REASS_NEXT_DROP;
1198               error0 = IP4_ERROR_REASS_LIMIT_REACHED;
1199             }
1200
1201
1202         packet_enqueue:
1203           b0->error = node->errors[error0];
1204
1205           if (bi0 != ~0)
1206             {
1207               to_next[0] = bi0;
1208               to_next += 1;
1209               n_left_to_next -= 1;
1210               if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
1211                 {
1212                   if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1213                     {
1214                       ip4_full_reass_add_trace (vm, node, rm, NULL, bi0,
1215                                                 HANDOFF, 0,
1216                                                 vnet_buffer (b0)->ip.
1217                                                 reass.owner_thread_index);
1218                     }
1219                 }
1220               else if (is_feature && IP4_ERROR_NONE == error0)
1221                 {
1222                   b0 = vlib_get_buffer (vm, bi0);
1223                   vnet_feature_next (&next0, b0);
1224                 }
1225               vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
1226                                                to_next, n_left_to_next,
1227                                                bi0, next0);
1228               IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
1229             }
1230
1231         next_packet:
1232           from += 1;
1233           n_left_from -= 1;
1234         }
1235
1236       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1237     }
1238
1239   clib_spinlock_unlock (&rt->lock);
1240   return frame->n_vectors;
1241 }
1242
/* One human-readable string per entry of foreach_ip4_error, indexed by
 * the ip4 error enum - used for node error counters below. */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1248
/* Entry point for the standalone (non-feature) reassembly node. */
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
                                false /* is_custom_app */ );
}
1256
/* *INDENT-OFF* */
/* Graph node registration for the standalone reassembly path. */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1274
/* Entry point for the ip4-unicast feature-arc variant of the node. */
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
                                false /* is_custom_app */ );
}
1282
/* *INDENT-OFF* */
/* Graph node registration for the feature-arc reassembly path. */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1299
/* *INDENT-OFF* */
/* Register the feature node on the ip4-unicast arc, before lookup and
 * ipsec input so those see only whole packets. */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1309
1310 #ifndef CLIB_MARCH_VARIANT
1311 always_inline u32
1312 ip4_full_reass_get_nbuckets ()
1313 {
1314   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1315   u32 nbuckets;
1316   u8 i;
1317
1318   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1319
1320   for (i = 0; i < 31; i++)
1321     if ((1 << i) >= nbuckets)
1322       break;
1323   nbuckets = 1 << i;
1324
1325   return nbuckets;
1326 }
1327 #endif /* CLIB_MARCH_VARIANT */
1328
/* Events delivered to the expire-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;
1333
/* Context passed to ip4_rehash_cb while copying entries into a new,
 * larger hash table. */
typedef struct
{
  int failure;                  /* set to 1 if any insert fails */
  clib_bihash_16_8_t *new_hash; /* destination table */
} ip4_rehash_cb_ctx;
1339
1340 #ifndef CLIB_MARCH_VARIANT
1341 static void
1342 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1343 {
1344   ip4_rehash_cb_ctx *ctx = _ctx;
1345   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1346     {
1347       ctx->failure = 1;
1348     }
1349 }
1350
1351 static void
1352 ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1353                            u32 max_reassembly_length,
1354                            u32 expire_walk_interval_ms)
1355 {
1356   ip4_full_reass_main.timeout_ms = timeout_ms;
1357   ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1358   ip4_full_reass_main.max_reass_n = max_reassemblies;
1359   ip4_full_reass_main.max_reass_len = max_reassembly_length;
1360   ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1361 }
1362
/**
 * @brief Apply new reassembly parameters, growing the hash if needed.
 *
 * Signals the expire-walk process about the config change. If the new
 * maximum requires more buckets, the bihash is rebuilt and existing
 * entries are copied over; on copy failure the old table is kept and
 * -1 is returned. The table only grows - it is never shrunk.
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
                    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
                             max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
                             ip4_full_reass_main.ip4_full_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy all live entries into the bigger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
          clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
                            sizeof (ip4_full_reass_main.hash));
          clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1400
1401 vnet_api_error_t
1402 ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1403                     u32 * max_reassembly_length,
1404                     u32 * expire_walk_interval_ms)
1405 {
1406   *timeout_ms = ip4_full_reass_main.timeout_ms;
1407   *max_reassemblies = ip4_full_reass_main.max_reass_n;
1408   *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1409   *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1410   return 0;
1411 }
1412
/**
 * @brief Plugin/module init: allocate per-thread state, set defaults,
 * create the hash table and the handoff frame queues.
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one entry per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                             IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                             IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                             IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues for cross-thread buffer handoff */
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);

  return error;
}

VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1455 #endif /* CLIB_MARCH_VARIANT */
1456
/**
 * @brief Process node dropping reassemblies that exceeded the timeout.
 *
 * Wakes up every expire_walk_interval_ms (or earlier on a config-change
 * event), walks every per-thread pool under its lock and frees every
 * context whose last_heard timestamp is older than the timeout.
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
                             vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_full_reass_per_thread_t *rt =
            &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          vec_reset_length (pool_indexes_to_free);
          /* collect indexes first - freeing while iterating the pool
           * would invalidate the walk */
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, rm, reass);
            ip4_full_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  /* not reached - process nodes loop forever */
  return 0;
}
1529
/* *INDENT-OFF* */
/* Registration for the periodic expiry process node. */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1541
1542 static u8 *
1543 format_ip4_full_reass_key (u8 * s, va_list * args)
1544 {
1545   ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1546   s =
1547     format (s,
1548             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1549             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1550             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1551   return s;
1552 }
1553
1554 static u8 *
1555 format_ip4_reass (u8 * s, va_list * args)
1556 {
1557   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1558   ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1559
1560   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1561               "last_packet_octet: %u, trace_op_counter: %u\n",
1562               reass->id, format_ip4_full_reass_key, &reass->key,
1563               reass->first_bi, reass->data_len,
1564               reass->last_packet_octet, reass->trace_op_counter);
1565
1566   u32 bi = reass->first_bi;
1567   u32 counter = 0;
1568   while (~0 != bi)
1569     {
1570       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1571       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1572       s =
1573         format (s,
1574                 "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1575                 "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1576                 vnb->ip.reass.range_last, bi,
1577                 ip4_full_reass_buffer_get_data_offset (b),
1578                 ip4_full_reass_buffer_get_data_len (b),
1579                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1580       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1581         {
1582           bi = b->next_buffer;
1583         }
1584       else
1585         {
1586           bi = ~0;
1587         }
1588     }
1589   return s;
1590 }
1591
1592 static clib_error_t *
1593 show_ip4_reass (vlib_main_t * vm,
1594                 unformat_input_t * input,
1595                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1596 {
1597   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1598
1599   vlib_cli_output (vm, "---------------------");
1600   vlib_cli_output (vm, "IP4 reassembly status");
1601   vlib_cli_output (vm, "---------------------");
1602   bool details = false;
1603   if (unformat (input, "details"))
1604     {
1605       details = true;
1606     }
1607
1608   u32 sum_reass_n = 0;
1609   ip4_full_reass_t *reass;
1610   uword thread_index;
1611   const uword nthreads = vlib_num_workers () + 1;
1612   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1613     {
1614       ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1615       clib_spinlock_lock (&rt->lock);
1616       if (details)
1617         {
1618           /* *INDENT-OFF* */
1619           pool_foreach (reass, rt->pool, {
1620             vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
1621           });
1622           /* *INDENT-ON* */
1623         }
1624       sum_reass_n += rt->reass_n;
1625       clib_spinlock_unlock (&rt->lock);
1626     }
1627   vlib_cli_output (vm, "---------------------");
1628   vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
1629                    (long unsigned) sum_reass_n);
1630   vlib_cli_output (vm,
1631                    "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
1632                    (long unsigned) rm->max_reass_n);
1633   return 0;
1634 }
1635
/* *INDENT-OFF* */
/* CLI registration for the status command above. */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1643
1644 #ifndef CLIB_MARCH_VARIANT
/**
 * @brief Enable or disable the reassembly feature on an interface.
 *
 * Toggles the ip4-full-reassembly-feature node on the ip4-unicast arc
 * for the given sw_if_index.
 */
vnet_api_error_t
ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-full-reassembly-feature",
                                      sw_if_index, enable_disable, 0, 0);
}
1652 #endif /* CLIB_MARCH_VARIANT */
1653
1654
/* Errors counted by the handoff nodes. */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


/* Error enum generated from the list above. */
typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;

/* Matching human-readable strings, indexed by the enum. */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1672
/* Per-packet trace record for the handoff nodes. */
typedef struct
{
  u32 next_worker_index;        /* thread the buffer was handed off to */
} ip4_full_reass_handoff_trace_t;
1677
1678 static u8 *
1679 format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1680 {
1681   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1682   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1683   ip4_full_reass_handoff_trace_t *t =
1684     va_arg (*args, ip4_full_reass_handoff_trace_t *);
1685
1686   s =
1687     format (s, "ip4-full-reassembly-handoff: next-worker %d",
1688             t->next_worker_index);
1689
1690   return s;
1691 }
1692
/**
 * @brief Common loop for both handoff nodes.
 *
 * Reads the owner thread index stamped into each buffer's opaque data
 * and enqueues the whole frame to the matching per-thread frame queue,
 * counting any buffers dropped due to queue congestion.
 */
always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* collect the destination thread for every buffer */
  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_full_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything not enqueued was dropped due to congestion */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1741
/* Handoff entry point for the standalone reassembly path. */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1749
1750
/* *INDENT-OFF* */
/* Registration for the standalone-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1766
1767
/* *INDENT-OFF* */
/* Handoff entry point for the feature-arc reassembly path. */
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1778
1779
/* *INDENT-OFF* */
/* Registration for the feature-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1795
1796 /*
1797  * fd.io coding-style-patch-verification: ON
1798  *
1799  * Local Variables:
1800  * eval: (c-set-style "gnu")
1801  * End:
1802  */