map: use ip6-full-reassembly instead of own code
[vpp.git] / src / vnet / ip / reass / ip4_full_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Full Reassembly.
19  *
20  * This file contains the source code for IPv4 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/fifo.h>
27 #include <vppinfra/bihash_16_8.h>
28 #include <vnet/ip/reass/ip4_full_reass.h>
29 #include <stddef.h>
30
/* milliseconds per second - used to convert configured ms timeouts */
#define MSEC_PER_SEC 1000
/* default reassembly timeout before an incomplete context is expired */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* default cap on concurrently tracked reassembly contexts (per thread pool) */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* default cap on fragments accepted into a single reassembly */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* load factor used when sizing the lookup bihash */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)
37
/* Compile-time switch for noisy per-buffer debug tracing on stdout.
 * When enabled, IP4_REASS_DEBUG_BUFFER(bi, what) prints the buffer index
 * and every index in its chain; when disabled it expands to nothing.
 * Requires a vlib_main_t *vm in the calling scope. */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
/* Internal return codes for the reassembly helper functions. */
typedef enum
{
  IP4_REASS_RC_OK,		    // operation succeeded
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,  // fragment count exceeded max_reass_len
  IP4_REASS_RC_INTERNAL_ERROR,	    // invariant violated - drop everything
  IP4_REASS_RC_NO_BUF,		    // buffer allocation failure
  IP4_REASS_RC_HANDOFF,		    // fragment must be handed to another thread
} ip4_full_reass_rc_t;
68
/* Hash key identifying one reassembly: the classic IPv4 5-ish tuple
 * (src, dst, fragment id, protocol) plus a context id. Union with
 * as_u64[2] so it maps directly onto a clib_bihash_kv_16_8_t key. */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;	   // context id (e.g. fib/interface discriminator)
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;	   // IPv4 header fragment identification field
      u8 proto;		   // IPv4 protocol field
      u8 unused;	   // padding - keeps key at exactly 16 bytes
    };
    u64 as_u64[2];	   // raw view used for bihash operations
  };
} ip4_full_reass_key_t;
85
/* Hash value: locates the reassembly context - pool index plus the
 * thread whose pool owns it. Union with as_u64 to map onto the 8-byte
 * bihash value. */
typedef union
{
  struct
  {
    u32 reass_index;		     // index into owning thread's pool
    u32 memory_owner_thread_index;   // thread whose pool holds the context
  };
  u64 as_u64;
} ip4_full_reass_val_t;
95
/* Combined key/value pair, overlaid on clib_bihash_kv_16_8_t so a single
 * object can be passed straight to the bihash search/add APIs. */
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106 always_inline u32
107 ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108 {
109   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111 }
112
113 always_inline u16
114 ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115 {
116   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118     (vnb->ip.reass.fragment_first +
119      ip4_full_reass_buffer_get_data_offset (b)) + 1;
120 }
121
/* One in-progress reassembly context. Lives in a per-thread pool; the
 * fragment ranges themselves are kept as a linked list of buffer chains
 * rooted at first_bi (linked via vnet_buffer opaque next_range_bi). */
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
/* Per-worker-thread reassembly state. */
typedef struct
{
  ip4_full_reass_t *pool;   // pool of reassembly contexts owned by this thread
  u32 reass_n;		    // number of contexts currently in use
  u32 id_counter;	    // monotonic counter used to mint context ids
  clib_spinlock_t lock;	    // protects this thread's data from other workers
} ip4_full_reass_per_thread_t;
160
/* Global state for the IPv4 full reassembly feature. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;		 // configured timeout in milliseconds
  f64 timeout;			 // same timeout pre-converted to seconds
  u32 expire_walk_interval_ms;	 // period of the expiry walk process
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;	 // key -> (thread, pool index) lookup
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;		     // frame queue for the non-feature node
  u32 fq_feature_index;	     // frame queue for the feature node

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;
} ip4_full_reass_main_t;
191
extern ip4_full_reass_main_t ip4_full_reass_main;

/* Single definition of the global; march-variant translation units only
 * see the extern declaration above. */
#ifndef CLIB_MARCH_VARIANT
ip4_full_reass_main_t ip4_full_reass_main;
#endif /* CLIB_MARCH_VARIANT */
197
/* Next-node indices for the reassembly graph nodes. */
typedef enum
{
  IP4_FULL_REASS_NEXT_INPUT,	// hand completed packet back to ip4 input
  IP4_FULL_REASS_NEXT_DROP,	// drop the buffer
  IP4_FULL_REASS_NEXT_HANDOFF,	// hand off to the owning worker thread
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
205
/* Operations recorded in packet traces. */
typedef enum
{
  RANGE_NEW,	    // fragment created a new range in the chain
  RANGE_SHRINK,	    // existing range trimmed due to partial overlap
  RANGE_DISCARD,    // existing range dropped entirely
  RANGE_OVERLAP,    // fragment fully covered by existing data - ignored
  FINALIZE,	    // reassembly completed and packet rebuilt
  HANDOFF,	    // fragment handed to the context-owning thread
} ip4_full_reass_trace_operation_e;
215
/* Snapshot of a single fragment range, embedded in a packet trace. */
typedef struct
{
  u16 range_first;   // first payload octet covered by the range
  u16 range_last;    // last payload octet covered by the range
  u32 range_bi;	     // buffer index heading the range's chain
  i32 data_offset;   // offset of usable data within the fragment
  u32 data_len;	     // usable payload bytes in the range
  u32 first_bi;	     // first buffer of the whole reassembly
} ip4_full_reass_range_trace_t;
225
/* Packet trace record for a reassembly operation. */
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  u32 reass_id;		 // context id, ~0 when no context applies
  ip4_full_reass_range_trace_t trace_range;
  u32 size_diff;	 // bytes trimmed on RANGE_SHRINK
  u32 op_id;		 // per-context sequence number of this operation
  u32 thread_id;	 // thread recording the trace
  u32 thread_id_to;	 // destination thread for HANDOFF
  u32 fragment_first;	 // first payload octet of the traced fragment
  u32 fragment_last;	 // last payload octet of the traced fragment
  u32 total_data_len;	 // bytes collected so far in the context
} ip4_full_reass_trace_t;
239
/* Graph node registrations defined later in this file. */
extern vlib_node_registration_t ip4_full_reass_node;
extern vlib_node_registration_t ip4_full_reass_node_feature;
242
243 static void
244 ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
245                               ip4_full_reass_range_trace_t * trace)
246 {
247   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
248   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
249   trace->range_first = vnb->ip.reass.range_first;
250   trace->range_last = vnb->ip.reass.range_last;
251   trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
252   trace->data_len = ip4_full_reass_buffer_get_data_len (b);
253   trace->range_bi = bi;
254 }
255
256 static u8 *
257 format_ip4_full_reass_range_trace (u8 * s, va_list * args)
258 {
259   ip4_full_reass_range_trace_t *trace =
260     va_arg (*args, ip4_full_reass_range_trace_t *);
261   s =
262     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
263             trace->range_last, trace->data_offset, trace->data_len,
264             trace->range_bi);
265   return s;
266 }
267
/* format() callback printing one ip4_full_reass_trace_t packet trace.
 * Prints the context summary (when a context was involved) followed by
 * an action-specific line. */
static u8 *
format_ip4_full_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
  u32 indent = 0;
  /* reass_id == ~0 means no context (e.g. handoff before lookup) */
  if (~0 != t->reass_id)
    {
      s = format (s, "reass id: %u, op id: %u, ", t->reass_id, t->op_id);
      indent = format_get_indent (s);
      s =
        format (s,
                "first bi: %u, data len: %u, ip/fragment[%u, %u]",
                t->trace_range.first_bi, t->total_data_len, t->fragment_first,
                t->fragment_last);
    }
  /* no default case: all enum values are covered above */
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range,
                  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
                  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
        format (s, "handoff from thread #%u to thread #%u", t->thread_id,
                t->thread_id_to);
      break;
    }
  return s;
}
315
/* Record a reassembly operation in the packet trace of buffer bi.
 * reass may be NULL (e.g. for handoffs that happen before a context
 * lookup); in that case the context fields are zero/~0 filled.
 * size_diff is only meaningful for RANGE_SHRINK; thread_id_to only for
 * HANDOFF (~0 otherwise).
 * NOTE(review): the rm parameter is not used here - presumably kept for
 * signature symmetry with the other helpers. */
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_full_reass_main_t * rm,
                          ip4_full_reass_t * reass, u32 bi,
                          ip4_full_reass_trace_operation_e action,
                          u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      /* per-context sequence number so operations can be ordered */
      ++reass->trace_op_counter;
    }
  else
    {
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  /* dead debug code - dumps each trace to stdout when enabled */
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
356
357 always_inline void
358 ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
359                          ip4_full_reass_t * reass)
360 {
361   pool_put (rt->pool, reass);
362   --rt->reass_n;
363 }
364
365 always_inline void
366 ip4_full_reass_free (ip4_full_reass_main_t * rm,
367                      ip4_full_reass_per_thread_t * rt,
368                      ip4_full_reass_t * reass)
369 {
370   clib_bihash_kv_16_8_t kv;
371   kv.key[0] = reass->key.as_u64[0];
372   kv.key[1] = reass->key.as_u64[1];
373   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
374   return ip4_full_reass_free_ctx (rt, reass);
375 }
376
/* Release every buffer held by a reassembly context. Buffers are either
 * freed outright or, when a custom app registered an error_next_index,
 * enqueued to that node instead. The context itself is not freed here. */
always_inline void
ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  /* outer loop: walk the list of ranges; inner loop: walk each range's
   * buffer chain, collecting every buffer index */
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              /* unlink so each buffer is freed/enqueued exactly once */
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      /* standard vlib enqueue pattern - may need several frames when
       * more buffers are pending than fit in one frame */
      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (vec_len (to_free) > 0 && n_left_to_next > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  to_next[0] = bi;
                  to_next += 1;
                  n_left_to_next -= 1;
                }
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
  /* NOTE(review): to_free vector does not appear to be vec_free'd here -
   * presumably intentional or leaked; confirm against upstream */
}
437
438 always_inline void
439 ip4_full_reass_init (ip4_full_reass_t * reass)
440 {
441   reass->first_bi = ~0;
442   reass->last_packet_octet = ~0;
443   reass->data_len = 0;
444   reass->next_index = ~0;
445   reass->error_next_index = ~0;
446 }
447
/* Look up the reassembly context for kv's key, creating one if absent.
 * Returns NULL when the per-thread context limit is hit or creation
 * fails. Sets *do_handoff when the context is owned by another thread -
 * the caller must then hand the fragment off rather than process it.
 * A context idle longer than rm->timeout is torn down and recreated. */
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                               ip4_full_reass_main_t * rm,
                               ip4_full_reass_per_thread_t * rt,
                               ip4_full_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  /* search overwrites kv's value part on hit */
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);
      /* context lives on another thread's pool - caller must hand off */
      if (vm->thread_index != reass->memory_owner_thread_index)
        {
          *do_handoff = 1;
          return reass;
        }

      /* stale context - drop its buffers and start over */
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_full_reass_drop_all (vm, node, rm, reass);
          ip4_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  /* enforce per-thread cap on concurrent reassemblies */
  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      /* id unique across threads: thread index scaled out of range of
       * any realistic per-thread counter value */
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  /* is_add == 2: add only if not already present (no overwrite) */
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
523
/* Stitch a completed reassembly into a single packet. Walks the chain of
 * ranges, trims each fragment down to its usable payload (dropping the
 * per-fragment IPv4 headers - only the first buffer keeps its header),
 * links the surviving buffers into one chain, rewrites the IPv4 header
 * (length, cleared fragmentation fields, checksum) and linearizes the
 * chain. On success stores the result in *bi0/*next0/*error0 and frees
 * the context; on internal-error returns the partially-modified chain is
 * left to the caller to drop. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  /* outer loop: one iteration per range (sub-chain) */
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* sanity check on range metadata.
       * NOTE(review): with '&&' this only rejects when BOTH invariants
       * fail; '||' may have been intended - confirm against upstream */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      /* bytes to cut from the front (ip header + overlap-trimmed start)
       * and from the back of this sub-chain */
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      /* inner loop: walk the sub-chain's buffers, trimming/keeping/freeing */
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              /* append this buffer to the merged chain */
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  /* last useful buffer in this sub-chain - truncate it */
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              /* past the kept data - free the remaining tail buffers */
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  /* total_length excludes the first buffer, per vlib convention */
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  /* rewrite the header of the now-complete packet */
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
                                FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      /* custom apps supplied their own next node */
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
727
728 always_inline ip4_full_reass_rc_t
729 ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
730                                       ip4_full_reass_main_t * rm,
731                                       ip4_full_reass_per_thread_t * rt,
732                                       ip4_full_reass_t * reass,
733                                       u32 prev_range_bi, u32 new_next_bi)
734 {
735   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
736   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
737   if (~0 != prev_range_bi)
738     {
739       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
740       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
741       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
742       prev_vnb->ip.reass.next_range_bi = new_next_bi;
743     }
744   else
745     {
746       if (~0 != reass->first_bi)
747         {
748           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
749         }
750       reass->first_bi = new_next_bi;
751     }
752   vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
753   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
754       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
755     {
756       return IP4_REASS_RC_INTERNAL_ERROR;
757     }
758   reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
759   return IP4_REASS_RC_OK;
760 }
761
/* Unlink the range headed by discard_bi from the reassembly's range
 * list (prev_range_bi is its predecessor, ~0 when it is the head),
 * subtract its payload from reass->data_len, and free every buffer in
 * the range's chain, tracing each discard when the buffer is traced. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
                                        ip4_full_reass_main_t * rm,
                                        ip4_full_reass_t * reass,
                                        u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      /* caller must pass the true predecessor */
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      /* discarding the head range */
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  /* sanity check on range metadata.
   * NOTE(review): with '&&' this only rejects when BOTH invariants fail;
   * '||' may have been intended - confirm against upstream */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  /* free the range's buffer chain one buffer at a time */
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
                                    RANGE_DISCARD, 0, ~0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
817
818 always_inline ip4_full_reass_rc_t
819 ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
820                        ip4_full_reass_main_t * rm,
821                        ip4_full_reass_per_thread_t * rt,
822                        ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
823                        u32 * error0, bool is_custom_app,
824                        u32 * handoff_thread_idx)
825 {
826   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
827   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
828   if (is_custom_app)
829     {
830       // store (error_)next_index before it's overwritten
831       reass->next_index = fvnb->ip.reass.next_index;
832       reass->error_next_index = fvnb->ip.reass.error_next_index;
833     }
834   ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
835   int consumed = 0;
836   ip4_header_t *fip = vlib_buffer_get_current (fb);
837   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
838   const u32 fragment_length =
839     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
840   const u32 fragment_last = fragment_first + fragment_length - 1;
841   fvnb->ip.reass.fragment_first = fragment_first;
842   fvnb->ip.reass.fragment_last = fragment_last;
843   int more_fragments = ip4_get_fragment_more (fip);
844   u32 candidate_range_bi = reass->first_bi;
845   u32 prev_range_bi = ~0;
846   fvnb->ip.reass.range_first = fragment_first;
847   fvnb->ip.reass.range_last = fragment_last;
848   fvnb->ip.reass.next_range_bi = ~0;
849   if (!more_fragments)
850     {
851       reass->last_packet_octet = fragment_last;
852     }
853   if (~0 == reass->first_bi)
854     {
855       // starting a new reassembly
856       rc =
857         ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
858                                               prev_range_bi, *bi0);
859       if (IP4_REASS_RC_OK != rc)
860         {
861           return rc;
862         }
863       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
864         {
865           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
866                                     ~0);
867         }
868       *bi0 = ~0;
869       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
870       reass->fragments_n = 1;
871       return IP4_REASS_RC_OK;
872     }
873   reass->min_fragment_length =
874     clib_min (clib_net_to_host_u16 (fip->length),
875               fvnb->ip.reass.estimated_mtu);
876   while (~0 != candidate_range_bi)
877     {
878       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
879       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
880       if (fragment_first > candidate_vnb->ip.reass.range_last)
881         {
882           // this fragments starts after candidate range
883           prev_range_bi = candidate_range_bi;
884           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
885           if (candidate_vnb->ip.reass.range_last < fragment_last &&
886               ~0 == candidate_range_bi)
887             {
888               // special case - this fragment falls beyond all known ranges
889               rc =
890                 ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
891                                                       prev_range_bi, *bi0);
892               if (IP4_REASS_RC_OK != rc)
893                 {
894                   return rc;
895                 }
896               consumed = 1;
897               break;
898             }
899           continue;
900         }
901       if (fragment_last < candidate_vnb->ip.reass.range_first)
902         {
903           // this fragment ends before candidate range without any overlap
904           rc =
905             ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
906                                                   prev_range_bi, *bi0);
907           if (IP4_REASS_RC_OK != rc)
908             {
909               return rc;
910             }
911           consumed = 1;
912         }
913       else
914         {
915           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
916               fragment_last <= candidate_vnb->ip.reass.range_last)
917             {
918               // this fragment is a (sub)part of existing range, ignore it
919               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
920                 {
921                   ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
922                                             RANGE_OVERLAP, 0, ~0);
923                 }
924               break;
925             }
926           int discard_candidate = 0;
927           if (fragment_first < candidate_vnb->ip.reass.range_first)
928             {
929               u32 overlap =
930                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
931               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
932                 {
933                   candidate_vnb->ip.reass.range_first += overlap;
934                   if (reass->data_len < overlap)
935                     {
936                       return IP4_REASS_RC_INTERNAL_ERROR;
937                     }
938                   reass->data_len -= overlap;
939                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
940                     {
941                       ip4_full_reass_add_trace (vm, node, rm, reass,
942                                                 candidate_range_bi,
943                                                 RANGE_SHRINK, 0, ~0);
944                     }
945                   rc =
946                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
947                                                           prev_range_bi,
948                                                           *bi0);
949                   if (IP4_REASS_RC_OK != rc)
950                     {
951                       return rc;
952                     }
953                   consumed = 1;
954                 }
955               else
956                 {
957                   discard_candidate = 1;
958                 }
959             }
960           else if (fragment_last > candidate_vnb->ip.reass.range_last)
961             {
962               u32 overlap =
963                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
964               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
965                 {
966                   fvnb->ip.reass.range_first += overlap;
967                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
968                     {
969                       prev_range_bi = candidate_range_bi;
970                       candidate_range_bi =
971                         candidate_vnb->ip.reass.next_range_bi;
972                       continue;
973                     }
974                   else
975                     {
976                       // special case - last range discarded
977                       rc =
978                         ip4_full_reass_insert_range_in_chain (vm, rm, rt,
979                                                               reass,
980                                                               candidate_range_bi,
981                                                               *bi0);
982                       if (IP4_REASS_RC_OK != rc)
983                         {
984                           return rc;
985                         }
986                       consumed = 1;
987                     }
988                 }
989               else
990                 {
991                   discard_candidate = 1;
992                 }
993             }
994           else
995             {
996               discard_candidate = 1;
997             }
998           if (discard_candidate)
999             {
1000               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
1001               // discard candidate range, probe next range
1002               rc =
1003                 ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
1004                                                         prev_range_bi,
1005                                                         candidate_range_bi);
1006               if (IP4_REASS_RC_OK != rc)
1007                 {
1008                   return rc;
1009                 }
1010               if (~0 != next_range_bi)
1011                 {
1012                   candidate_range_bi = next_range_bi;
1013                   continue;
1014                 }
1015               else
1016                 {
1017                   // special case - last range discarded
1018                   rc =
1019                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
1020                                                           prev_range_bi,
1021                                                           *bi0);
1022                   if (IP4_REASS_RC_OK != rc)
1023                     {
1024                       return rc;
1025                     }
1026                   consumed = 1;
1027                 }
1028             }
1029         }
1030       break;
1031     }
1032   ++reass->fragments_n;
1033   if (consumed)
1034     {
1035       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1036         {
1037           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
1038                                     ~0);
1039         }
1040     }
1041   if (~0 != reass->last_packet_octet &&
1042       reass->data_len == reass->last_packet_octet + 1)
1043     {
1044       *handoff_thread_idx = reass->sendout_thread_index;
1045       int handoff =
1046         reass->memory_owner_thread_index != reass->sendout_thread_index;
1047       rc =
1048         ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1049                                  is_custom_app);
1050       if (IP4_REASS_RC_OK == rc && handoff)
1051         {
1052           rc = IP4_REASS_RC_HANDOFF;
1053         }
1054     }
1055   else
1056     {
1057       if (consumed)
1058         {
1059           *bi0 = ~0;
1060           if (reass->fragments_n > rm->max_reass_len)
1061             {
1062               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1063             }
1064         }
1065       else
1066         {
1067           *next0 = IP4_FULL_REASS_NEXT_DROP;
1068           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1069         }
1070     }
1071   return rc;
1072 }
1073
/**
 * @brief Common worker for the ip4 full-reassembly graph nodes.
 *
 * Walks the frame one buffer at a time under the per-thread reassembly
 * lock.  Unfragmented packets pass straight through; fragments are keyed
 * by (fib_index, src, dst, frag_id, proto) and folded into a reassembly
 * context, with handoff to the owning thread when required.
 *
 * @param vm            vlib main
 * @param node          this node's runtime
 * @param frame         frame of buffer indices to process
 * @param is_feature    true when running as an ip4-unicast feature node
 * @param is_custom_app true when the caller supplies its own next_index
 * @return number of vectors processed (always frame->n_vectors)
 */
always_inline uword
ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature,
                       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serializes access to this thread's reassembly pool for the whole frame */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (!is_custom_app)
                {
                  next0 = IP4_FULL_REASS_NEXT_INPUT;
                }
              else
                {
                  /* custom apps pre-load the desired next node in the opaque */
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              goto packet_enqueue;
            }
          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
          /* sanity-check fragment geometry before touching any state */
          if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
            {
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              goto packet_enqueue;
            }
          ip4_full_reass_kv_t kv;
          u8 do_handoff = 0;

          /* 16-byte lookup key: fib_index | src in word 0,
           * dst | frag_id | proto in word 1 */
          kv.k.as_u64[0] =
            (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                           vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
            (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] =
            (u64) ip0->dst_address.
            as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

          ip4_full_reass_t *reass =
            ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
                                           &do_handoff);

          if (reass)
            {
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              /* the thread that saw the first fragment sends the result out */
              if (0 == fragment_first)
                {
                  reass->sendout_thread_index = vm->thread_index;
                }
            }

          if (PREDICT_FALSE (do_handoff))
            {
              /* reassembly owned by another thread - hand the buffer over */
              next0 = IP4_FULL_REASS_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.memory_owner_thread_index;
            }
          else if (reass)
            {
              u32 handoff_thread_idx;
              switch (ip4_full_reass_update
                      (vm, node, rm, rt, reass, &bi0, &next0,
                       &error0, is_custom_app, &handoff_thread_idx))
                {
                case IP4_REASS_RC_OK:
                  /* nothing to do here */
                  /* NOTE(review): on the finalize path bi0 may have been
                   * replaced by the reassembled chain head while b0 still
                   * points at the original fragment when b0->error is set
                   * below - confirm this is intended */
                  break;
                case IP4_REASS_RC_HANDOFF:
                  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_buffer (b0)->ip.reass.owner_thread_index =
                    handoff_thread_idx;
                  break;
                case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                               1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                case IP4_REASS_RC_NO_BUF:
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_NO_BUF, 1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                case IP4_REASS_RC_INTERNAL_ERROR:
                  /* drop everything and start with a clean slate */
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_INTERNAL_ERROR,
                                               1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                }
            }
          else
            {
              /* find_or_create failed - reassembly context limit reached */
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
            }


        packet_enqueue:
          b0->error = node->errors[error0];

          /* bi0 == ~0 means the fragment was consumed into the reassembly */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
                {
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (vm, node, rm, NULL, bi0,
                                                HANDOFF, 0,
                                                vnet_buffer (b0)->ip.
                                                reass.owner_thread_index);
                    }
                }
              else if (is_feature && IP4_ERROR_NONE == error0)
                {
                  /* continue along the ip4-unicast feature arc */
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1244
/* per-counter error strings, expanded from the master IPv4 error list */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1250
/**
 * @brief Node function for standalone (non-feature) ip4 full reassembly.
 */
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
                                false /* is_custom_app */ );
}
1258
/* *INDENT-OFF* */
/* node registration: standalone ip4 full reassembly */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1276
/**
 * @brief Node function for ip4 full reassembly running as a feature.
 */
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
                                false /* is_custom_app */ );
}
1284
/* *INDENT-OFF* */
/* node registration: feature-arc variant of ip4 full reassembly */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1301
/* *INDENT-OFF* */
/* register the feature on the ip4-unicast arc, before lookup and ipsec */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1311
1312 #ifndef CLIB_MARCH_VARIANT
1313 always_inline u32
1314 ip4_full_reass_get_nbuckets ()
1315 {
1316   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1317   u32 nbuckets;
1318   u8 i;
1319
1320   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1321
1322   for (i = 0; i < 31; i++)
1323     if ((1 << i) >= nbuckets)
1324       break;
1325   nbuckets = 1 << i;
1326
1327   return nbuckets;
1328 }
1329 #endif /* CLIB_MARCH_VARIANT */
1330
/* events delivered to the expire-walk process node */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;
1335
/* context for the bihash rehash walk (see ip4_rehash_cb) */
typedef struct
{
  int failure;			// set to 1 when any insert into new_hash fails
  clib_bihash_16_8_t *new_hash;	// destination hash table
} ip4_rehash_cb_ctx;
1341
1342 #ifndef CLIB_MARCH_VARIANT
1343 static void
1344 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1345 {
1346   ip4_rehash_cb_ctx *ctx = _ctx;
1347   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1348     {
1349       ctx->failure = 1;
1350     }
1351 }
1352
1353 static void
1354 ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1355                            u32 max_reassembly_length,
1356                            u32 expire_walk_interval_ms)
1357 {
1358   ip4_full_reass_main.timeout_ms = timeout_ms;
1359   ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1360   ip4_full_reass_main.max_reass_n = max_reassemblies;
1361   ip4_full_reass_main.max_reass_len = max_reassembly_length;
1362   ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1363 }
1364
/**
 * @brief Apply new reassembly parameters and grow the hash if needed.
 *
 * Stores the parameters, wakes the expire-walk process, and - when the
 * new bucket count exceeds the old one - builds a larger bihash and
 * migrates all entries into it.
 *
 * @return 0 on success, -1 if migrating entries to the new hash failed
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
                    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  /* capture the bucket count before the parameters change */
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
                             max_reassembly_length, expire_walk_interval_ms);
  /* nudge the expire-walk process so it picks up the new interval */
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
                             ip4_full_reass_main.ip4_full_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  /* only grow - shrinking would risk overloading existing entries */
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy every existing entry into the larger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          /* swap the new table in place of the old one */
          clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
          clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
                            sizeof (ip4_full_reass_main.hash));
          clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1402
1403 vnet_api_error_t
1404 ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1405                     u32 * max_reassembly_length,
1406                     u32 * expire_walk_interval_ms)
1407 {
1408   *timeout_ms = ip4_full_reass_main.timeout_ms;
1409   *max_reassemblies = ip4_full_reass_main.max_reass_n;
1410   *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1411   *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1412   return 0;
1413 }
1414
/**
 * @brief Plugin/module init: set up per-thread state, defaults and hash.
 *
 * Allocates per-thread reassembly pools and locks, records the node
 * indices used at runtime, applies the default parameters and creates
 * the lookup hash and handoff frame queues.
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one entry per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    /* NOTE(review): this runs before ip4_full_reass_set_params below, so
     * max_reass_n still holds its static initial value here - confirm the
     * pre-allocation size is intended */
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                             IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                             IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                             IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used to hand buffers off to other worker threads */
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  return error;
}

VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1458 #endif /* CLIB_MARCH_VARIANT */
1459
/**
 * @brief Process node: periodically drop timed-out reassemblies.
 *
 * Sleeps for expire_walk_interval_ms (or until a config-changed event),
 * then scans every thread's reassembly pool under its lock and frees
 * contexts whose last_heard timestamp is older than the timeout.
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
                             vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* interval re-read at the top of the loop; nothing else to do */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_full_reass_per_thread_t *rt =
            &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect expired indices first - freeing while iterating the
           * pool would invalidate the iteration */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, rm, reass);
            ip4_full_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          /* reuse the event vector on the next iteration */
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1532
/* *INDENT-OFF* */
/* process node registration for the expire walk */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    /* NOTE(review): format_trace on a process node looks unused - confirm */
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1544
1545 static u8 *
1546 format_ip4_full_reass_key (u8 * s, va_list * args)
1547 {
1548   ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1549   s =
1550     format (s,
1551             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1552             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1553             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1554   return s;
1555 }
1556
1557 static u8 *
1558 format_ip4_reass (u8 * s, va_list * args)
1559 {
1560   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1561   ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1562
1563   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1564               "last_packet_octet: %u, trace_op_counter: %u\n",
1565               reass->id, format_ip4_full_reass_key, &reass->key,
1566               reass->first_bi, reass->data_len,
1567               reass->last_packet_octet, reass->trace_op_counter);
1568
1569   u32 bi = reass->first_bi;
1570   u32 counter = 0;
1571   while (~0 != bi)
1572     {
1573       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1574       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1575       s =
1576         format (s,
1577                 "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1578                 "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1579                 vnb->ip.reass.range_last, bi,
1580                 ip4_full_reass_buffer_get_data_offset (b),
1581                 ip4_full_reass_buffer_get_data_len (b),
1582                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1583       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1584         {
1585           bi = b->next_buffer;
1586         }
1587       else
1588         {
1589           bi = ~0;
1590         }
1591     }
1592   return s;
1593 }
1594
/**
 * @brief CLI handler for "show ip4-full-reassembly [details]".
 *
 * Prints the total number of in-progress reassemblies across all
 * threads; with "details", also dumps every reassembly context.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_full_reass_t *reass;
  uword thread_index;
  /* workers plus the main thread */
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      /* hold the per-thread lock while reading its pool */
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1638
/* *INDENT-OFF* */
/* CLI command registration for the status dump above */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1646
1647 #ifndef CLIB_MARCH_VARIANT
/**
 * @brief Enable or disable the reassembly feature on an interface.
 *
 * @param sw_if_index    interface to configure
 * @param enable_disable non-zero to enable, zero to disable
 * @return result of vnet_feature_enable_disable
 */
vnet_api_error_t
ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-full-reassembly-feature",
                                      sw_if_index, enable_disable, 0, 0);
}
1655 #endif /* CLIB_MARCH_VARIANT */
1656
1657
/* error counters for the handoff nodes */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;
1669
/* human-readable strings for the handoff error counters above */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1675
/* trace record captured when a buffer is handed to another worker */
typedef struct
{
  u32 next_worker_index;	// thread the buffer was enqueued to
} ip4_full_reass_handoff_trace_t;
1680
1681 static u8 *
1682 format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1683 {
1684   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1685   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1686   ip4_full_reass_handoff_trace_t *t =
1687     va_arg (*args, ip4_full_reass_handoff_trace_t *);
1688
1689   s =
1690     format (s, "ip4-full-reassembly-handoff: next-worker %d",
1691             t->next_worker_index);
1692
1693   return s;
1694 }
1695
/**
 * @brief Common worker for the handoff nodes.
 *
 * Reads the destination worker from each buffer's reassembly opaque and
 * enqueues the whole frame to the per-thread frame queues, counting any
 * buffers dropped due to congestion.
 *
 * @param is_feature selects the feature-arc frame queue over the default
 * @return number of vectors in the frame
 */
always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* first pass: collect destination threads (and trace) per buffer */
  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_full_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* second pass: bulk-enqueue to the destination threads */
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything not enqueued was dropped by the frame queue */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1744
/**
 * @brief Node function for the standalone handoff node.
 */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1752
1753
/* Node registration for the non-feature handoff path.  Error strings and
 * trace formatter are shared with the feature-arc variant below; the only
 * next node is error-drop for buffers that fail handoff. */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1769
1770
1771 /* *INDENT-OFF* */
1772 VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
1773                                                     vlib_node_runtime_t *
1774                                                     node,
1775                                                     vlib_frame_t * frame)
1776 {
1777   return ip4_full_reass_handoff_node_inline (vm, node, frame,
1778                                              true /* is_feature */ );
1779 }
1780 /* *INDENT-ON* */
1781
1782
/* Node registration for the feature-arc handoff path.  Mirrors the
 * non-feature registration above (same errors, trace formatter, and
 * error-drop next node) under a distinct node name. */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1798
1799 #ifndef CLIB_MARCH_VARIANT
1800 int
1801 ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1802 {
1803   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1804   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1805   if (is_enable)
1806     {
1807       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1808         {
1809           ++rm->feature_use_refcount_per_intf[sw_if_index];
1810           return vnet_feature_enable_disable ("ip4-unicast",
1811                                               "ip4-full-reassembly-feature",
1812                                               sw_if_index, 1, 0, 0);
1813         }
1814       ++rm->feature_use_refcount_per_intf[sw_if_index];
1815     }
1816   else
1817     {
1818       --rm->feature_use_refcount_per_intf[sw_if_index];
1819       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1820         return vnet_feature_enable_disable ("ip4-unicast",
1821                                             "ip4-full-reassembly-feature",
1822                                             sw_if_index, 0, 0, 0);
1823     }
1824   return -1;
1825 }
1826 #endif
1827
1828 /*
1829  * fd.io coding-style-patch-verification: ON
1830  *
1831  * Local Variables:
1832  * eval: (c-set-style "gnu")
1833  * End:
1834  */