vppinfra: allocate bihash virtual space on demand
[vpp.git] / src / vnet / ip / ip4_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28
/* Milliseconds per second, for converting ms-based config into seconds. */
#define MSEC_PER_SEC 1000
/* Default ms after which an incomplete reassembly is considered timed out
 * (compared against reass->last_heard in ip4_reass_find_or_create). */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* Default cap on concurrent reassembly contexts (per thread; enforced
 * against rt->reass_n in ip4_reass_find_or_create). */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* Default cap on the number of fragments accepted into one reassembly. */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* Hash-table load factor; presumably used to size rm->hash from
 * max_reass_n by code outside this view - TODO confirm. */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

/* Set to 1 to enable printf-based buffer-chain dumps for debugging. */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* Print buffer index `bi` and every chained buffer index to stdout.
 * Relies on a `vm` variable being in scope at the expansion site. */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
57
/* Return codes for the internal reassembly helpers; anything other than
 * IP4_REASS_RC_OK causes the caller to abandon the reassembly. */
typedef enum
{
  IP4_REASS_RC_OK,
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  IP4_REASS_RC_INTERNAL_ERROR,
  IP4_REASS_RC_NO_BUF,
} ip4_reass_rc_t;

/* 16-byte bihash key identifying one reassembly: (xx_id, src, dst,
 * frag_id, proto) plus one pad byte, overlaid on two u64 words so it can
 * be copied into a clib_bihash_kv_16_8_t key directly. */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_reass_key_t;

/* 8-byte bihash value: which thread owns the reassembly and its index in
 * that thread's pool. */
typedef union
{
  struct
  {
    u32 reass_index;
    u32 thread_index;
  };
  u64 as_u64;
} ip4_reass_val_t;

/* Convenience overlay so one object can be used both as (key, value) pair
 * and as the raw clib_bihash_kv_16_8_t passed to the bihash API. */
typedef union
{
  struct
  {
    ip4_reass_key_t k;
    ip4_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_reass_kv_t;
102
103 always_inline u32
104 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
105 {
106   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
107   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
108 }
109
110 always_inline u16
111 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
112 {
113   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
114   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
115     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
116 }
117
/* One in-progress reassembly: identity, liveness timestamp and the head of
 * the buffer-range chain linked through vnet_buffer()->ip.reass.next_range_bi. */
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
} ip4_reass_t;

/* Per-worker-thread reassembly state; the lock presumably guards pool
 * access from other threads (lock usage is outside this view - confirm). */
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_reass_t *pool;
  // number of active reassemblies (checked against max_reass_n)
  u32 reass_n;
  // source for ip4_reass_t.id (combined with thread index)
  u32 id_counter;
  clib_spinlock_t lock;
} ip4_reass_per_thread_t;

/* Global reassembly state: configuration, the key->(thread,index) bihash,
 * per-thread pools and cached node/frame-queue indices. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout in seconds; presumably derived from timeout_ms - confirm in config code
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

} ip4_reass_main_t;

extern ip4_reass_main_t ip4_reass_main;

/* Single definition of the global; march-variant objects only declare it. */
#ifndef CLIB_MARCH_VARIANT
ip4_reass_main_t ip4_reass_main;
#endif /* CLIB_MARCH_VARIANT */
187
/* Next-node dispositions for the reassembly nodes. */
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,
  IP4_REASSEMBLY_NEXT_DROP,
  IP4_REASSEMBLY_NEXT_HANDOFF,
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;

/* Operations recorded in the packet trace for one reassembly step. */
typedef enum
{
  RANGE_NEW,
  RANGE_SHRINK,
  RANGE_DISCARD,
  RANGE_OVERLAP,
  FINALIZE,
} ip4_reass_trace_operation_e;

/* Snapshot of one buffer range, as captured by ip4_reass_trace_details. */
typedef struct
{
  u16 range_first;
  u16 range_last;
  u32 range_bi;
  i32 data_offset;
  u32 data_len;
  u32 first_bi;
} ip4_reass_range_trace_t;

/* One packet-trace record; rendered by format_ip4_reass_trace. */
typedef struct
{
  ip4_reass_trace_operation_e action;
  u32 reass_id;
  ip4_reass_range_trace_t trace_range;
  // bytes trimmed off a range (RANGE_SHRINK only)
  u32 size_diff;
  // per-reassembly monotonically increasing operation id
  u32 op_id;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
} ip4_reass_trace_t;
226
227 extern vlib_node_registration_t ip4_reass_node;
228 extern vlib_node_registration_t ip4_reass_node_feature;
229
230 static void
231 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
232                          ip4_reass_range_trace_t * trace)
233 {
234   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
235   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
236   trace->range_first = vnb->ip.reass.range_first;
237   trace->range_last = vnb->ip.reass.range_last;
238   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
239   trace->data_len = ip4_reass_buffer_get_data_len (b);
240   trace->range_bi = bi;
241 }
242
243 static u8 *
244 format_ip4_reass_range_trace (u8 * s, va_list * args)
245 {
246   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
247   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
248               trace->range_last, trace->data_offset, trace->data_len,
249               trace->range_bi);
250   return s;
251 }
252
253 static u8 *
254 format_ip4_reass_trace (u8 * s, va_list * args)
255 {
256   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
257   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
258   ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
259   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
260   u32 indent = format_get_indent (s);
261   s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
262               t->trace_range.first_bi, t->total_data_len, t->fragment_first,
263               t->fragment_last);
264   switch (t->action)
265     {
266     case RANGE_SHRINK:
267       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
268                   format_ip4_reass_range_trace, &t->trace_range,
269                   t->size_diff);
270       break;
271     case RANGE_DISCARD:
272       s = format (s, "\n%Udiscard %U", format_white_space, indent,
273                   format_ip4_reass_range_trace, &t->trace_range);
274       break;
275     case RANGE_NEW:
276       s = format (s, "\n%Unew %U", format_white_space, indent,
277                   format_ip4_reass_range_trace, &t->trace_range);
278       break;
279     case RANGE_OVERLAP:
280       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
281                   format_ip4_reass_range_trace, &t->trace_range);
282       break;
283     case FINALIZE:
284       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
285       break;
286     }
287   return s;
288 }
289
/* Append a packet-trace record describing `action` performed on buffer `bi`
 * within `reass`; stamps it with a per-reassembly operation counter. */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, ip4_reass_t * reass, u32 bi,
                     ip4_reass_trace_operation_e action, u32 size_diff)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  /* sizeof (t[0]) in t's own initializer is fine - sizeof is unevaluated */
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass->id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->op_id = reass->trace_op_counter;
  ++reass->trace_op_counter;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  /* first_bi is deliberately filled here, not by ip4_reass_trace_details */
  t->trace_range.first_bi = reass->first_bi;
  t->total_data_len = reass->data_len;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
316
317
318 always_inline void
319 ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
320                 ip4_reass_t * reass)
321 {
322   clib_bihash_kv_16_8_t kv;
323   kv.key[0] = reass->key.as_u64[0];
324   kv.key[1] = reass->key.as_u64[1];
325   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
326   pool_put (rt->pool, reass);
327   --rt->reass_n;
328 }
329
/* Release every buffer held by `reass`: walk the range chain (via
 * next_range_bi) and each range's buffer chain (via next_buffer), then
 * either enqueue the buffers to the app-provided error_next_index or free
 * them outright. Does not free the reassembly context itself. */
always_inline void
ip4_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      /* collect the whole buffer chain of this range, clearing the
       * NEXT_PRESENT flags so each buffer is freed individually */
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      /* hand buffers to the custom error next node, frame by frame */
      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (vec_len (to_free) > 0 && n_left_to_next > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  to_next[0] = bi;
                  to_next += 1;
                  n_left_to_next -= 1;
                  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                                   to_next, n_left_to_next,
                                                   bi, next_index);
                }
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
}
393
/* Look up the reassembly for `kv`'s key, or create a new one.
 * Returns NULL and sets *do_handoff when the reassembly is owned by a
 * different thread; returns NULL (without handoff) when the per-thread
 * reassembly limit is hit or the bihash insert fails. A found-but-stale
 * reassembly (past rm->timeout) is dropped and recreated. */
static ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                          ip4_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_reass_t *reass = NULL;
  f64 now = vlib_time_now (rm->vlib_main);

  /* search overwrites kv's value part in place on hit */
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      /* stale entry - drop its buffers and start over below */
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_reass_drop_all (vm, node, rm, reass);
          ip4_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      /* globally unique id: thread index in the high decimal digits */
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      reass->next_index = ~0;
      reass->error_next_index = ~0;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  /* insert failure: roll back the freshly allocated context */
  if (clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 1))
    {
      ip4_reass_free (rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
459
/* Stitch all collected ranges into a single packet: for each range, trim
 * the per-fragment IP header and any overlap-trimmed bytes, link the kept
 * buffers into one chain, then patch the first buffer's IP header
 * (length, cleared fragment bits, checksum) and linearize. On success
 * outputs *bi0/*next0/*error0 for the caller to enqueue and frees the
 * reassembly context. Any IP4_REASS_RC_* error leaves partially-modified
 * buffers for the caller to clean up. */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  /* outer loop: one iteration per range in the range chain */
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* NOTE(review): with &&, this sanity check only fires when BOTH
       * invariants fail; these read like two former ASSERTs, so ||
       * may have been intended - confirm against git history */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      /* bytes to cut from the front: this fragment's IP header plus any
       * payload superseded by overlap trimming */
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      /* inner loop: walk this range's buffer chain, trimming and linking */
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              /* append this buffer to the merged chain */
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  /* final buffer of this range - clip tail */
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              /* no data left to keep - free trailing buffers of the range */
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  /* rewrite the (kept) first IP header as an unfragmented datagram */
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
661
/* Insert buffer `new_next_bi` into the reassembly's range chain, after
 * `prev_range_bi` (or at the head when prev_range_bi is ~0), and account
 * its payload bytes in reass->data_len. */
always_inline ip4_reass_rc_t
ip4_reass_insert_range_in_chain (vlib_main_t * vm,
                                 ip4_reass_main_t * rm,
                                 ip4_reass_per_thread_t * rt,
                                 ip4_reass_t * reass,
                                 u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      /* splice after the given predecessor */
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      /* new head of the chain */
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  /* NOTE(review): with &&, this check fires only when BOTH invariants
   * fail; it looks like two ASSERTs folded together, where || would
   * match the original intent - confirm before changing, since || would
   * also reject a degenerate single-byte range (range_last ==
   * fragment_first) */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
695
/* Unlink range `discard_bi` from the reassembly's range chain (its
 * predecessor is `prev_range_bi`, or ~0 when it is the head), subtract its
 * payload from reass->data_len, and free every buffer of the range. */
always_inline ip4_reass_rc_t
ip4_reass_remove_range_from_chain (vlib_main_t * vm,
                                   vlib_node_runtime_t * node,
                                   ip4_reass_main_t * rm,
                                   ip4_reass_t * reass, u32 prev_range_bi,
                                   u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      /* caller must pass the true predecessor of discard_bi */
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  /* NOTE(review): same suspicious &&-combined invariant check as in
   * ip4_reass_insert_range_in_chain - fires only when BOTH halves fail;
   * confirm whether || was intended */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
  /* free the discarded range's entire buffer chain, one buffer at a time */
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
                               0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
751
752 always_inline ip4_reass_rc_t
753 ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
754                   ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
755                   ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
756                   bool is_custom_app)
757 {
758   ip4_reass_rc_t rc = IP4_REASS_RC_OK;
759   int consumed = 0;
760   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
761   ip4_header_t *fip = vlib_buffer_get_current (fb);
762   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
763   if (is_custom_app)
764     {
765       // store (error_)next_index before it's overwritten
766       reass->next_index = fvnb->ip.reass.next_index;
767       reass->error_next_index = fvnb->ip.reass.error_next_index;
768     }
769   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
770   const u32 fragment_length =
771     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
772   const u32 fragment_last = fragment_first + fragment_length - 1;
773   fvnb->ip.reass.fragment_first = fragment_first;
774   fvnb->ip.reass.fragment_last = fragment_last;
775   int more_fragments = ip4_get_fragment_more (fip);
776   u32 candidate_range_bi = reass->first_bi;
777   u32 prev_range_bi = ~0;
778   fvnb->ip.reass.range_first = fragment_first;
779   fvnb->ip.reass.range_last = fragment_last;
780   fvnb->ip.reass.next_range_bi = ~0;
781   if (!more_fragments)
782     {
783       reass->last_packet_octet = fragment_last;
784     }
785   if (~0 == reass->first_bi)
786     {
787       // starting a new reassembly
788       rc =
789         ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
790                                          *bi0);
791       if (IP4_REASS_RC_OK != rc)
792         {
793           return rc;
794         }
795       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
796         {
797           ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
798         }
799       *bi0 = ~0;
800       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
801       reass->fragments_n = 1;
802       return IP4_REASS_RC_OK;
803     }
804   reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
805                                          fvnb->ip.reass.estimated_mtu);
806   while (~0 != candidate_range_bi)
807     {
808       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
809       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
810       if (fragment_first > candidate_vnb->ip.reass.range_last)
811         {
812           // this fragments starts after candidate range
813           prev_range_bi = candidate_range_bi;
814           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
815           if (candidate_vnb->ip.reass.range_last < fragment_last &&
816               ~0 == candidate_range_bi)
817             {
818               // special case - this fragment falls beyond all known ranges
819               rc =
820                 ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
821                                                  prev_range_bi, *bi0);
822               if (IP4_REASS_RC_OK != rc)
823                 {
824                   return rc;
825                 }
826               consumed = 1;
827               break;
828             }
829           continue;
830         }
831       if (fragment_last < candidate_vnb->ip.reass.range_first)
832         {
833           // this fragment ends before candidate range without any overlap
834           rc =
835             ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
836                                              *bi0);
837           if (IP4_REASS_RC_OK != rc)
838             {
839               return rc;
840             }
841           consumed = 1;
842         }
843       else
844         {
845           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
846               fragment_last <= candidate_vnb->ip.reass.range_last)
847             {
848               // this fragment is a (sub)part of existing range, ignore it
849               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
850                 {
851                   ip4_reass_add_trace (vm, node, rm, reass, *bi0,
852                                        RANGE_OVERLAP, 0);
853                 }
854               break;
855             }
856           int discard_candidate = 0;
857           if (fragment_first < candidate_vnb->ip.reass.range_first)
858             {
859               u32 overlap =
860                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
861               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
862                 {
863                   candidate_vnb->ip.reass.range_first += overlap;
864                   if (reass->data_len < overlap)
865                     {
866                       return IP4_REASS_RC_INTERNAL_ERROR;
867                     }
868                   reass->data_len -= overlap;
869                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
870                     {
871                       ip4_reass_add_trace (vm, node, rm, reass,
872                                            candidate_range_bi, RANGE_SHRINK,
873                                            overlap);
874                     }
875                   rc =
876                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
877                                                      prev_range_bi, *bi0);
878                   if (IP4_REASS_RC_OK != rc)
879                     {
880                       return rc;
881                     }
882                   consumed = 1;
883                 }
884               else
885                 {
886                   discard_candidate = 1;
887                 }
888             }
889           else if (fragment_last > candidate_vnb->ip.reass.range_last)
890             {
891               u32 overlap =
892                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
893               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
894                 {
895                   fvnb->ip.reass.range_first += overlap;
896                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
897                     {
898                       prev_range_bi = candidate_range_bi;
899                       candidate_range_bi =
900                         candidate_vnb->ip.reass.next_range_bi;
901                       continue;
902                     }
903                   else
904                     {
905                       // special case - last range discarded
906                       rc =
907                         ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
908                                                          candidate_range_bi,
909                                                          *bi0);
910                       if (IP4_REASS_RC_OK != rc)
911                         {
912                           return rc;
913                         }
914                       consumed = 1;
915                     }
916                 }
917               else
918                 {
919                   discard_candidate = 1;
920                 }
921             }
922           else
923             {
924               discard_candidate = 1;
925             }
926           if (discard_candidate)
927             {
928               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
929               // discard candidate range, probe next range
930               rc =
931                 ip4_reass_remove_range_from_chain (vm, node, rm, reass,
932                                                    prev_range_bi,
933                                                    candidate_range_bi);
934               if (IP4_REASS_RC_OK != rc)
935                 {
936                   return rc;
937                 }
938               if (~0 != next_range_bi)
939                 {
940                   candidate_range_bi = next_range_bi;
941                   continue;
942                 }
943               else
944                 {
945                   // special case - last range discarded
946                   rc =
947                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
948                                                      prev_range_bi, *bi0);
949                   if (IP4_REASS_RC_OK != rc)
950                     {
951                       return rc;
952                     }
953                   consumed = 1;
954                 }
955             }
956         }
957       break;
958     }
959   ++reass->fragments_n;
960   if (consumed)
961     {
962       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
963         {
964           ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
965         }
966     }
967   if (~0 != reass->last_packet_octet &&
968       reass->data_len == reass->last_packet_octet + 1)
969     {
970       return ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
971                                  is_custom_app);
972     }
973   else
974     {
975       if (consumed)
976         {
977           *bi0 = ~0;
978           if (reass->fragments_n > rm->max_reass_len)
979             {
980               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
981             }
982         }
983       else
984         {
985           *next0 = IP4_REASSEMBLY_NEXT_DROP;
986           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
987         }
988     }
989   return rc;
990 }
991
/**
 * @brief Common worker for all ip4 reassembly graph nodes.
 *
 * Walks the frame under the per-thread reassembly spinlock.  Non-fragments
 * pass straight through; fragments are validated, matched to (or create) a
 * reassembly context keyed on (fib index, src, dst, frag id, proto) and fed
 * to ip4_reass_update().  Buffers may also be handed off to the owning
 * worker thread via the handoff next node.
 *
 * @param vm            vlib main
 * @param node          this node's runtime
 * @param frame         frame of buffer indices to process
 * @param is_feature    true when running as an ip4-unicast feature node
 * @param is_custom_app true when the caller supplies its own next index
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature,
                       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* held for the whole frame; protects this thread's reassembly pool */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (!is_custom_app)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
              else
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
            }
          else
            {
              /* sanity-check the fragment geometry before touching state */
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              const u32 fragment_length =
                clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
              const u32 fragment_last = fragment_first + fragment_length - 1;
              if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0))) // 8 is minimum frag length per RFC 791
                {
                  next0 = IP4_REASSEMBLY_NEXT_DROP;
                  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
                }
              else
                {
                  ip4_reass_kv_t kv;
                  u8 do_handoff = 0;

                  /* 16-byte lookup key: fib index + src in word 0,
                   * dst + fragment id + protocol in word 1 */
                  kv.k.as_u64[0] =
                    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                                   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
                    (u64) ip0->src_address.as_u32 << 32;
                  kv.k.as_u64[1] =
                    (u64) ip0->dst_address.as_u32 |
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

                  ip4_reass_t *reass =
                    ip4_reass_find_or_create (vm, node, rm, rt, &kv,
                                              &do_handoff);

                  if (PREDICT_FALSE (do_handoff))
                    {
                      /* reassembly owned by another worker - hand the
                       * buffer off to the owner's thread */
                      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                      if (is_feature)
                        vnet_buffer (b0)->ip.
                          reass.owner_feature_thread_index =
                          kv.v.thread_index;
                      else
                        vnet_buffer (b0)->ip.reass.owner_thread_index =
                          kv.v.thread_index;
                    }
                  else if (reass)
                    {
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
                               &error0, is_custom_app))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
                        case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_NO_BUF:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_NO_BUF,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          /* drop everything and start with a clean slate */
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_INTERNAL_ERROR,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        }
                    }
                  else
                    {
                      /* find_or_create failed - per-thread limit reached */
                      next0 = IP4_REASSEMBLY_NEXT_DROP;
                      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
                    }
                }

              b0->error = node->errors[error0];
            }

          /* bi0 == ~0 means the fragment was consumed by a reassembly */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (is_feature && IP4_ERROR_NONE == error0)
                {
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1144
/* Counter strings for this node, generated from the shared ip4 error list. */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1150
/* Standalone (non-feature) reassembly node entry point. */
VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ ,
                                false /* is_custom_app */ );
}
1157
/* *INDENT-OFF* */
/* Graph node registration for standalone ip4 reassembly. */
VLIB_REGISTER_NODE (ip4_reass_node) = {
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1175
/* Feature-arc variant of the reassembly node entry point. */
VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
                                       vlib_node_runtime_t * node,
                                       vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ ,
                                false /* is_custom_app */ );
}
1183
/* *INDENT-OFF* */
/* Graph node registration for the feature-arc reassembly node; note the
 * handoff next node differs from the standalone variant. */
VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1200
/* *INDENT-OFF* */
/* Register the feature node on the ip4-unicast arc, before ip4-lookup. */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1209
1210 #ifndef CLIB_MARCH_VARIANT
1211 always_inline u32
1212 ip4_reass_get_nbuckets ()
1213 {
1214   ip4_reass_main_t *rm = &ip4_reass_main;
1215   u32 nbuckets;
1216   u8 i;
1217
1218   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1219
1220   for (i = 0; i < 31; i++)
1221     if ((1 << i) >= nbuckets)
1222       break;
1223   nbuckets = 1 << i;
1224
1225   return nbuckets;
1226 }
1227 #endif /* CLIB_MARCH_VARIANT */
1228
/* Process events understood by the expire-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_reass_event_t;

/* Context for migrating hash entries into a freshly-sized bihash;
 * failure is set when any insert into new_hash fails. */
typedef struct
{
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;
1239
1240 #ifndef CLIB_MARCH_VARIANT
1241 static void
1242 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1243 {
1244   ip4_rehash_cb_ctx *ctx = _ctx;
1245   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1246     {
1247       ctx->failure = 1;
1248     }
1249 }
1250
1251 static void
1252 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1253                       u32 max_reassembly_length, u32 expire_walk_interval_ms)
1254 {
1255   ip4_reass_main.timeout_ms = timeout_ms;
1256   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1257   ip4_reass_main.max_reass_n = max_reassemblies;
1258   ip4_reass_main.max_reass_len = max_reassembly_length;
1259   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1260 }
1261
/**
 * @brief API handler: apply new reassembly parameters.
 *
 * Stores the parameters, wakes the expire-walk process so it picks up a
 * possibly changed interval, and - if the required bucket count grew -
 * migrates the lookup hash into a larger bihash.
 *
 * @return 0 on success, -1 if rehashing the lookup table failed.
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
               u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  /* snapshot the bucket count before the params change */
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
                        expire_walk_interval_ms);
  vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  /* only grow the table - shrinking is never done */
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy every existing entry into the new table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* keep the old table on failure */
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
          clib_bihash_copied (&ip4_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1299
1300 vnet_api_error_t
1301 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1302                u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1303 {
1304   *timeout_ms = ip4_reass_main.timeout_ms;
1305   *max_reassemblies = ip4_reass_main.max_reass_n;
1306   *max_reassembly_length = ip4_reass_main.max_reass_len;
1307   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1308   return 0;
1309 }
1310
1311 static clib_error_t *
1312 ip4_reass_init_function (vlib_main_t * vm)
1313 {
1314   ip4_reass_main_t *rm = &ip4_reass_main;
1315   clib_error_t *error = 0;
1316   u32 nbuckets;
1317   vlib_node_t *node;
1318
1319   rm->vlib_main = vm;
1320   rm->vnet_main = vnet_get_main ();
1321
1322   vec_validate (rm->per_thread_data, vlib_num_workers ());
1323   ip4_reass_per_thread_t *rt;
1324   vec_foreach (rt, rm->per_thread_data)
1325   {
1326     clib_spinlock_init (&rt->lock);
1327     pool_alloc (rt->pool, rm->max_reass_n);
1328   }
1329
1330   node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
1331   ASSERT (node);
1332   rm->ip4_reass_expire_node_idx = node->index;
1333
1334   ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
1335                         IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
1336                         IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
1337                         IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1338
1339   nbuckets = ip4_reass_get_nbuckets ();
1340   clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);
1341
1342   node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
1343   ASSERT (node);
1344   rm->ip4_drop_idx = node->index;
1345
1346   rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
1347   rm->fq_feature_index =
1348     vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);
1349
1350
1351   return error;
1352 }
1353
1354 VLIB_INIT_FUNCTION (ip4_reass_init_function);
1355 #endif /* CLIB_MARCH_VARIANT */
1356
/**
 * @brief Process node: periodically sweep all per-thread pools and drop
 * reassemblies that have not heard a fragment within the timeout.
 *
 * Woken either by the walk-interval clock or by an
 * IP4_EVENT_CONFIG_CHANGED signal (which simply re-reads the interval on
 * the next iteration).  Never returns.
 */
static uword
ip4_reass_walk_expired (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect first, free second - pool_foreach_index must not see
           * concurrent deletions from the pool it iterates */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_reass_drop_all (vm, node, rm, reass);
            ip4_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  /* not reached */
  return 0;
}
1428
/* *INDENT-OFF* */
/* Registration of the expire-walk process node. */
VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1440
/* Pretty-print a reassembly lookup key (fib/"xx" id, addresses, frag id,
 * protocol) for CLI output. */
static u8 *
format_ip4_reass_key (u8 * s, va_list * args)
{
  ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
  s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
              key->xx_id, format_ip4_address, &key->src, format_ip4_address,
              &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}
1450
1451 static u8 *
1452 format_ip4_reass (u8 * s, va_list * args)
1453 {
1454   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1455   ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
1456
1457   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1458               "last_packet_octet: %u, trace_op_counter: %u\n",
1459               reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
1460               reass->data_len, reass->last_packet_octet,
1461               reass->trace_op_counter);
1462   u32 bi = reass->first_bi;
1463   u32 counter = 0;
1464   while (~0 != bi)
1465     {
1466       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1467       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1468       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1469                   "fragment[%u, %u]\n",
1470                   counter, vnb->ip.reass.range_first,
1471                   vnb->ip.reass.range_last, bi,
1472                   ip4_reass_buffer_get_data_offset (b),
1473                   ip4_reass_buffer_get_data_len (b),
1474                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1475       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1476         {
1477           bi = b->next_buffer;
1478         }
1479       else
1480         {
1481           bi = ~0;
1482         }
1483     }
1484   return s;
1485 }
1486
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the total count of in-progress reassemblies across all threads
 * and, with "details", every individual reassembly context.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  /* workers plus the main thread */
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1530
/* *INDENT-OFF* */
/* CLI command registration for the show handler above. */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1538
#ifndef CLIB_MARCH_VARIANT
/* Enable or disable the reassembly feature on an interface by toggling
 * the ip4-reassembly-feature node on the ip4-unicast arc. */
vnet_api_error_t
ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-reassembly-feature", sw_if_index,
                                      enable_disable, 0, 0);
}
#endif /* CLIB_MARCH_VARIANT */
1548
1549
/* Error counters for the handoff nodes - currently only congestion drop
 * (the destination thread's frame queue was full). */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;

static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1567
/* Packet trace record for the handoff nodes. */
typedef struct
{
  u32 next_worker_index;        // thread the buffer was handed off to
} ip4_reassembly_handoff_trace_t;
1572
1573 static u8 *
1574 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1575 {
1576   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1577   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1578   ip4_reassembly_handoff_trace_t *t =
1579     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1580
1581   s =
1582     format (s, "ip4-reassembly-handoff: next-worker %d",
1583             t->next_worker_index);
1584
1585   return s;
1586 }
1587
/**
 * @brief Common worker for both handoff nodes.
 *
 * Reads the owning thread index stashed in each buffer's opaque by the
 * reassembly node, then enqueues the whole frame to the matching frame
 * queue.  Buffers that cannot be enqueued (queue congestion) are counted
 * as drops.
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* feature and non-feature variants use separate frame queues */
  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      /* destination thread was stored by ip4_reassembly_inline */
      ti[0] =
        (is_feature) ? vnet_buffer (b[0])->ip.
        reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
        reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything not enqueued was dropped by enqueue_to_thread */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1639
/* Handoff entry point for the standalone reassembly path. */
VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1647
1648
/* *INDENT-OFF* */
/* Registration of the standalone handoff node. */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1664
1665
/* *INDENT-OFF* */
/* Handoff entry point for the feature-arc reassembly path. */
VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1676
1677
/* *INDENT-OFF* */
/* Registration of the feature-arc handoff node. */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1693
1694 /*
1695  * fd.io coding-style-patch-verification: ON
1696  *
1697  * Local Variables:
1698  * eval: (c-set-style "gnu")
1699  * End:
1700  */