reassembly: support more custom options for apps
[vpp.git] / src / vnet / ip / ip4_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28
29 #define MSEC_PER_SEC 1000
30 #define IP4_REASS_TIMEOUT_DEFAULT_MS 100
31 #define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
32 #define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
33 #define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
34 #define IP4_REASS_HT_LOAD_FACTOR (0.75)
35
36 #define IP4_REASS_DEBUG_BUFFERS 0
37 #if IP4_REASS_DEBUG_BUFFERS
38 #define IP4_REASS_DEBUG_BUFFER(bi, what)             \
39   do                                                 \
40     {                                                \
41       u32 _bi = bi;                                  \
42       printf (#what "buffer %u", _bi);               \
43       vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
44       while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
45         {                                            \
46           _bi = _b->next_buffer;                     \
47           printf ("[%u]", _bi);                      \
48           _b = vlib_get_buffer (vm, _bi);            \
49         }                                            \
50       printf ("\n");                                 \
51       fflush (stdout);                               \
52     }                                                \
53   while (0)
54 #else
55 #define IP4_REASS_DEBUG_BUFFER(...)
56 #endif
57
/* Internal return codes for reassembly operations. */
typedef enum
{
  IP4_REASS_RC_OK,		   // operation succeeded
  IP4_REASS_RC_TOO_MANY_FRAGMENTS, // per-reassembly fragment limit exceeded
  IP4_REASS_RC_INTERNAL_ERROR,	   // internal invariant violated
  IP4_REASS_RC_NO_BUF,		   // ran out of buffers (e.g. during linearize)
} ip4_reass_rc_t;
65
/**
 * @brief 16-byte key identifying one reassembly; overlays the
 * clib_bihash_16_8 key via as_u64[2].
 */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;	   // opaque context id - presumably fib/interface; disambiguates flows - TODO confirm
      ip4_address_t src;   // fragment source address
      ip4_address_t dst;   // fragment destination address
      u16 frag_id;	   // IP header fragment identification field
      u8 proto;		   // IP protocol
      u8 unused;	   // pad to a full 16 bytes
    };
    u64 as_u64[2];	   // raw view used as the bihash key
  };
} ip4_reass_key_t;
82
/**
 * @brief Hash table value: locates the owning thread and the reassembly's
 * index in that thread's pool; as_u64 overlays the bihash value.
 */
typedef union
{
  struct
  {
    u32 reass_index;	// index into the owning thread's reassembly pool
    u32 thread_index;	// thread that owns this reassembly
  };
  u64 as_u64;		// raw view used as the bihash value
} ip4_reass_val_t;
92
/**
 * @brief Convenience overlay of (key, value) onto a clib_bihash_kv_16_8_t
 * so lookups can be done in place without copying.
 */
typedef union
{
  struct
  {
    ip4_reass_key_t k;	// lookup key
    ip4_reass_val_t v;	// lookup result
  };
  clib_bihash_kv_16_8_t kv;	// raw bihash record view
} ip4_reass_kv_t;
102
103 always_inline u32
104 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
105 {
106   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
107   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
108 }
109
110 always_inline u16
111 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
112 {
113   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
114   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
115     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
116 }
117
/**
 * @brief Per-reassembly context - one instance per in-progress datagram.
 */
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context (~0 when empty)
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter - sequence number for trace records
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by non-feature node
  u32 error_next_index;
  // is_feature flag stored for non-inline code use
  bool is_feature;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
} ip4_reass_t;
145
/**
 * @brief Per-thread reassembly state.
 */
typedef struct
{
  ip4_reass_t *pool;	// pool of reassembly contexts owned by this thread
  u32 reass_n;		// number of active reassemblies in the pool
  u32 id_counter;	// monotonically increasing id source for new reassemblies
  clib_spinlock_t lock;	// serializes access to this thread's state - presumably for cross-thread walkers; confirm against callers
} ip4_reass_per_thread_t;
153
/**
 * @brief Global reassembly state: configuration, hash table mapping
 * fragment keys to contexts, and per-thread pools.
 */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;		// reassembly timeout in milliseconds
  f64 timeout;			// same timeout pre-converted to seconds
  u32 expire_walk_interval_ms;	// how often the expiry process walks contexts
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;	// key -> (thread, pool index) lookup table
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;		// frame queue for the non-feature node
  u32 fq_feature_index;	// frame queue for the feature node

} ip4_reass_main_t;
183
184 extern ip4_reass_main_t ip4_reass_main;
185
186 #ifndef CLIB_MARCH_VARIANT
187 ip4_reass_main_t ip4_reass_main;
188 #endif /* CLIB_MARCH_VARIANT */
189
/* Next-node indices used by the reassembly graph nodes. */
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,	// hand reassembled packet back to ip4-input
  IP4_REASSEMBLY_NEXT_DROP,	// drop the buffer
  IP4_REASSEMBLY_NEXT_HANDOFF,	// hand off to the owning thread
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;
197
/* Operations recorded in packet traces. */
typedef enum
{
  RANGE_NEW,		// new fragment range inserted into the chain
  RANGE_SHRINK,		// existing range shrunk to resolve an overlap
  RANGE_DISCARD,	// existing range dropped in favor of new data
  RANGE_OVERLAP,	// fragment fully covered by existing data, ignored
  FINALIZE,		// reassembly completed
} ip4_reass_trace_operation_e;
206
/* Snapshot of one fragment range, captured for tracing. */
typedef struct
{
  u16 range_first;	// first octet covered by the range
  u16 range_last;	// last octet covered by the range
  u32 range_bi;		// buffer index heading the range
  i32 data_offset;	// offset of kept data within the fragment
  u32 data_len;		// bytes this range contributes
  u32 first_bi;		// first buffer of the whole reassembly
} ip4_reass_range_trace_t;
216
/* One packet-trace record describing a reassembly operation. */
typedef struct
{
  ip4_reass_trace_operation_e action;	// what happened
  u32 reass_id;				// id of the reassembly context
  ip4_reass_range_trace_t trace_range;	// range the action applied to
  u32 size_diff;			// bytes trimmed (RANGE_SHRINK)
  u32 op_id;				// per-reassembly sequence number
  u32 fragment_first;			// first octet of the triggering fragment
  u32 fragment_last;			// last octet of the triggering fragment
  u32 total_data_len;			// total bytes collected so far
} ip4_reass_trace_t;
228
229 extern vlib_node_registration_t ip4_reass_node;
230 extern vlib_node_registration_t ip4_reass_node_feature;
231
232 static void
233 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
234                          ip4_reass_range_trace_t * trace)
235 {
236   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
237   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
238   trace->range_first = vnb->ip.reass.range_first;
239   trace->range_last = vnb->ip.reass.range_last;
240   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
241   trace->data_len = ip4_reass_buffer_get_data_len (b);
242   trace->range_bi = bi;
243 }
244
245 static u8 *
246 format_ip4_reass_range_trace (u8 * s, va_list * args)
247 {
248   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
249   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
250               trace->range_last, trace->data_offset, trace->data_len,
251               trace->range_bi);
252   return s;
253 }
254
255 static u8 *
256 format_ip4_reass_trace (u8 * s, va_list * args)
257 {
258   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
259   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
260   ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
261   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
262   u32 indent = format_get_indent (s);
263   s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
264               t->trace_range.first_bi, t->total_data_len, t->fragment_first,
265               t->fragment_last);
266   switch (t->action)
267     {
268     case RANGE_SHRINK:
269       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
270                   format_ip4_reass_range_trace, &t->trace_range,
271                   t->size_diff);
272       break;
273     case RANGE_DISCARD:
274       s = format (s, "\n%Udiscard %U", format_white_space, indent,
275                   format_ip4_reass_range_trace, &t->trace_range);
276       break;
277     case RANGE_NEW:
278       s = format (s, "\n%Unew %U", format_white_space, indent,
279                   format_ip4_reass_range_trace, &t->trace_range);
280       break;
281     case RANGE_OVERLAP:
282       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
283                   format_ip4_reass_range_trace, &t->trace_range);
284       break;
285     case FINALIZE:
286       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
287       break;
288     }
289   return s;
290 }
291
/**
 * @brief Record a packet-trace entry for one reassembly operation.
 *
 * If the buffer's trace record has already been recycled, the traced
 * flag is cleared and nothing is recorded.
 */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, ip4_reass_t * reass, u32 bi,
                     ip4_reass_trace_operation_e action, u32 size_diff)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass->id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  // per-reassembly sequence number so trace entries can be ordered
  t->op_id = reass->trace_op_counter;
  ++reass->trace_op_counter;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  t->trace_range.first_bi = reass->first_bi;
  t->total_data_len = reass->data_len;
#if 0
  // debug-only: dump the formatted trace to stdout
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
324
325
326 always_inline void
327 ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
328                 ip4_reass_t * reass)
329 {
330   clib_bihash_kv_16_8_t kv;
331   kv.key[0] = reass->key.as_u64[0];
332   kv.key[1] = reass->key.as_u64[1];
333   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
334   pool_put (rt->pool, reass);
335   --rt->reass_n;
336 }
337
338 always_inline void
339 ip4_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
340                     ip4_reass_main_t * rm, ip4_reass_t * reass,
341                     bool is_feature)
342 {
343   u32 range_bi = reass->first_bi;
344   vlib_buffer_t *range_b;
345   vnet_buffer_opaque_t *range_vnb;
346   u32 *to_free = NULL;
347   while (~0 != range_bi)
348     {
349       range_b = vlib_get_buffer (vm, range_bi);
350       range_vnb = vnet_buffer (range_b);
351       u32 bi = range_bi;
352       while (~0 != bi)
353         {
354           vec_add1 (to_free, bi);
355           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
356           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
357             {
358               bi = b->next_buffer;
359               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
360             }
361           else
362             {
363               bi = ~0;
364             }
365         }
366       range_bi = range_vnb->ip.reass.next_range_bi;
367     }
368   /* send to next_error_index */
369   if (!(is_feature))
370     {
371       u32 n_left_to_next, *to_next, next_index;
372
373       next_index = reass->error_next_index;
374       u32 bi = ~0;
375
376       while (vec_len (to_free) > 0)
377         {
378           vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
379
380           while (vec_len (to_free) > 0 && n_left_to_next > 0)
381             {
382               bi = vec_pop (to_free);
383
384               if (~0 != bi)
385                 {
386                   to_next[0] = bi;
387                   to_next += 1;
388                   n_left_to_next -= 1;
389                   vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
390                                                    to_next, n_left_to_next,
391                                                    bi, next_index);
392                 }
393             }
394           vlib_put_next_frame (vm, node, next_index, n_left_to_next);
395         }
396     }
397   else
398     {
399       vlib_buffer_free (vm, to_free, vec_len (to_free));
400     }
401 }
402
/**
 * @brief Look up the reassembly context for @a kv, creating one if needed.
 *
 * If the context is owned by a different thread, *do_handoff is set and
 * NULL is returned so the caller can hand the packet off. A context that
 * has timed out is dropped and replaced. Returns NULL (without handoff)
 * when the per-main maximum number of reassemblies is reached or the
 * hash insert fails.
 */
static ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                          ip4_reass_kv_t * kv, u8 * do_handoff,
                          bool is_feature)
{
  ip4_reass_t *reass = NULL;
  f64 now = vlib_time_now (rm->vlib_main);

  // in-place lookup: kv doubles as search key and result holder
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          // context lives on another thread - caller must hand off
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      if (now > reass->last_heard + rm->timeout)
        {
          // stale context - drop its buffers and fall through to re-create
          ip4_reass_drop_all (vm, node, rm, reass, is_feature);
          ip4_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      // at capacity - refuse to start a new reassembly
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      // ids are unique across threads: thread index scaled out of the
      // per-thread counter range
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      reass->is_feature = is_feature;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  if (clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 1))
    {
      // hash insert failed - roll back the pool allocation
      ip4_reass_free (rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
468
/**
 * @brief Glue the collected fragment ranges into one reassembled packet.
 *
 * Walks the range chain starting at reass->first_bi; for each range it
 * trims the per-fragment IP header and any leading overlap from the
 * front, trims trailing excess, and links the surviving buffers into a
 * single buffer chain headed by the first buffer (which keeps its IP
 * header). The first buffer's IP header is then fixed up (fragment bits
 * cleared, total length and checksum recomputed), the chain is
 * linearized, and the reassembly context is freed.
 *
 * On success *bi0 is the reassembled buffer, *next0 the next node index
 * and *error0 IP4_ERROR_NONE.
 *
 * @return IP4_REASS_RC_OK on success, IP4_REASS_RC_INTERNAL_ERROR on a
 * violated invariant, IP4_REASS_RC_NO_BUF if linearization fails.
 */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_feature)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* NOTE(review): this only errors when BOTH range invariants fail;
       * if the intent was to require both to hold, the condition should
       * be !(A && B) - confirm against upstream history. */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      // skip this fragment's IP header plus any already-covered prefix
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // link this buffer onto the reassembled chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // trailing trim happens implicitly by shortening the buffer
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              // all useful data consumed - free the remaining tail buffers
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      // advance to the next range in the chain
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  // total_length becomes "not including first buffer"
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the IP header of the reassembled packet
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (is_feature)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      // non-feature node forwards to the next stored from the first fragment
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
670
/**
 * @brief Insert buffer @a new_next_bi into the range chain after
 * @a prev_range_bi (or at the head when prev_range_bi is ~0) and account
 * for its data length.
 *
 * @return IP4_REASS_RC_OK, or IP4_REASS_RC_INTERNAL_ERROR on a violated
 * range invariant.
 */
always_inline ip4_reass_rc_t
ip4_reass_insert_range_in_chain (vlib_main_t * vm,
                                 ip4_reass_main_t * rm,
                                 ip4_reass_per_thread_t * rt,
                                 ip4_reass_t * reass,
                                 u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      // splice between prev and its successor
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      // insert at chain head
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  /* NOTE(review): only errors when BOTH invariants fail - likely intended
   * as !(A && B); confirm against upstream. */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
704
/**
 * @brief Unlink range @a discard_bi from the chain (successor of
 * @a prev_range_bi, or the head when prev_range_bi is ~0), subtract its
 * data length, and free all buffers of the discarded range.
 *
 * @return IP4_REASS_RC_OK, or IP4_REASS_RC_INTERNAL_ERROR when the chain
 * linkage or a range invariant is inconsistent.
 */
always_inline ip4_reass_rc_t
ip4_reass_remove_range_from_chain (vlib_main_t * vm,
                                   vlib_node_runtime_t * node,
                                   ip4_reass_main_t * rm,
                                   ip4_reass_t * reass, u32 prev_range_bi,
                                   u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      // prev must actually point at the range we are removing
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  /* NOTE(review): only errors when BOTH invariants fail - likely intended
   * as !(A && B); confirm against upstream. */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
  // free every buffer in the discarded range's chain
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
                               0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
760
761 always_inline ip4_reass_rc_t
762 ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
763                   ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
764                   ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
765                   bool is_feature)
766 {
767   ip4_reass_rc_t rc = IP4_REASS_RC_OK;
768   int consumed = 0;
769   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
770   ip4_header_t *fip = vlib_buffer_get_current (fb);
771   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
772   reass->next_index = fvnb->ip.reass.next_index;        // store next_index before it's overwritten
773   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
774   const u32 fragment_length =
775     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
776   const u32 fragment_last = fragment_first + fragment_length - 1;
777   fvnb->ip.reass.fragment_first = fragment_first;
778   fvnb->ip.reass.fragment_last = fragment_last;
779   int more_fragments = ip4_get_fragment_more (fip);
780   u32 candidate_range_bi = reass->first_bi;
781   u32 prev_range_bi = ~0;
782   fvnb->ip.reass.range_first = fragment_first;
783   fvnb->ip.reass.range_last = fragment_last;
784   fvnb->ip.reass.next_range_bi = ~0;
785   if (!more_fragments)
786     {
787       reass->last_packet_octet = fragment_last;
788     }
789   if (~0 == reass->first_bi)
790     {
791       // starting a new reassembly
792       rc =
793         ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
794                                          *bi0);
795       if (IP4_REASS_RC_OK != rc)
796         {
797           return rc;
798         }
799       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
800         {
801           ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
802         }
803       *bi0 = ~0;
804       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
805       reass->fragments_n = 1;
806       return IP4_REASS_RC_OK;
807     }
808   reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
809                                          fvnb->ip.reass.estimated_mtu);
810   while (~0 != candidate_range_bi)
811     {
812       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
813       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
814       if (fragment_first > candidate_vnb->ip.reass.range_last)
815         {
816           // this fragments starts after candidate range
817           prev_range_bi = candidate_range_bi;
818           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
819           if (candidate_vnb->ip.reass.range_last < fragment_last &&
820               ~0 == candidate_range_bi)
821             {
822               // special case - this fragment falls beyond all known ranges
823               rc =
824                 ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
825                                                  prev_range_bi, *bi0);
826               if (IP4_REASS_RC_OK != rc)
827                 {
828                   return rc;
829                 }
830               consumed = 1;
831               break;
832             }
833           continue;
834         }
835       if (fragment_last < candidate_vnb->ip.reass.range_first)
836         {
837           // this fragment ends before candidate range without any overlap
838           rc =
839             ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
840                                              *bi0);
841           if (IP4_REASS_RC_OK != rc)
842             {
843               return rc;
844             }
845           consumed = 1;
846         }
847       else
848         {
849           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
850               fragment_last <= candidate_vnb->ip.reass.range_last)
851             {
852               // this fragment is a (sub)part of existing range, ignore it
853               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
854                 {
855                   ip4_reass_add_trace (vm, node, rm, reass, *bi0,
856                                        RANGE_OVERLAP, 0);
857                 }
858               break;
859             }
860           int discard_candidate = 0;
861           if (fragment_first < candidate_vnb->ip.reass.range_first)
862             {
863               u32 overlap =
864                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
865               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
866                 {
867                   candidate_vnb->ip.reass.range_first += overlap;
868                   if (reass->data_len < overlap)
869                     {
870                       return IP4_REASS_RC_INTERNAL_ERROR;
871                     }
872                   reass->data_len -= overlap;
873                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
874                     {
875                       ip4_reass_add_trace (vm, node, rm, reass,
876                                            candidate_range_bi, RANGE_SHRINK,
877                                            overlap);
878                     }
879                   rc =
880                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
881                                                      prev_range_bi, *bi0);
882                   if (IP4_REASS_RC_OK != rc)
883                     {
884                       return rc;
885                     }
886                   consumed = 1;
887                 }
888               else
889                 {
890                   discard_candidate = 1;
891                 }
892             }
893           else if (fragment_last > candidate_vnb->ip.reass.range_last)
894             {
895               u32 overlap =
896                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
897               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
898                 {
899                   fvnb->ip.reass.range_first += overlap;
900                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
901                     {
902                       prev_range_bi = candidate_range_bi;
903                       candidate_range_bi =
904                         candidate_vnb->ip.reass.next_range_bi;
905                       continue;
906                     }
907                   else
908                     {
909                       // special case - last range discarded
910                       rc =
911                         ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
912                                                          candidate_range_bi,
913                                                          *bi0);
914                       if (IP4_REASS_RC_OK != rc)
915                         {
916                           return rc;
917                         }
918                       consumed = 1;
919                     }
920                 }
921               else
922                 {
923                   discard_candidate = 1;
924                 }
925             }
926           else
927             {
928               discard_candidate = 1;
929             }
930           if (discard_candidate)
931             {
932               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
933               // discard candidate range, probe next range
934               rc =
935                 ip4_reass_remove_range_from_chain (vm, node, rm, reass,
936                                                    prev_range_bi,
937                                                    candidate_range_bi);
938               if (IP4_REASS_RC_OK != rc)
939                 {
940                   return rc;
941                 }
942               if (~0 != next_range_bi)
943                 {
944                   candidate_range_bi = next_range_bi;
945                   continue;
946                 }
947               else
948                 {
949                   // special case - last range discarded
950                   rc =
951                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
952                                                      prev_range_bi, *bi0);
953                   if (IP4_REASS_RC_OK != rc)
954                     {
955                       return rc;
956                     }
957                   consumed = 1;
958                 }
959             }
960         }
961       break;
962     }
963   ++reass->fragments_n;
964   if (consumed)
965     {
966       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
967         {
968           ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
969         }
970     }
971   if (~0 != reass->last_packet_octet &&
972       reass->data_len == reass->last_packet_octet + 1)
973     {
974       return ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
975                                  is_feature);
976     }
977   else
978     {
979       if (consumed)
980         {
981           *bi0 = ~0;
982           if (reass->fragments_n > rm->max_reass_len)
983             {
984               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
985             }
986         }
987       else
988         {
989           *next0 = IP4_REASSEMBLY_NEXT_DROP;
990           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
991         }
992     }
993   return rc;
994 }
995
/**
 * @brief Shared per-frame worker for both reassembly graph nodes.
 *
 * Walks every buffer in the frame under the per-thread reassembly lock.
 * Whole (unfragmented) packets pass straight through; fragments are looked
 * up / inserted into the reassembly hash and either consumed, handed off to
 * the owning worker thread, or finalized into a complete packet.
 *
 * @param vm         vlib main for this thread
 * @param node       current node runtime
 * @param frame      frame of buffer indices to process
 * @param is_feature true when running as an ip4-unicast feature-arc node
 *                   (next node chosen via vnet_feature_next), false for the
 *                   standalone node (next node taken from buffer metadata)
 * @return number of vectors processed (always frame->n_vectors)
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm,
                       vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* the whole frame is processed under this thread's reassembly lock */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (is_feature)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
              else
                {
                  /* non-feature path: caller pre-set the next node */
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
            }
          else
            {
              /* fragment - validate offsets/length before touching state */
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              const u32 fragment_length =
                clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
              const u32 fragment_last = fragment_first + fragment_length - 1;
              if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0))) // 8 is minimum frag length per RFC 791
                {
                  next0 = IP4_REASSEMBLY_NEXT_DROP;
                  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
                }
              else
                {
                  ip4_reass_kv_t kv;
                  u8 do_handoff = 0;

                  /* reassembly key: fib index + src addr / dst addr +
                   * fragment id + protocol */
                  kv.k.as_u64[0] =
                    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                                   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
                    (u64) ip0->src_address.as_u32 << 32;
                  kv.k.as_u64[1] =
                    (u64) ip0->dst_address.as_u32 |
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

                  ip4_reass_t *reass =
                    ip4_reass_find_or_create (vm, node, rm, rt, &kv,
                                              &do_handoff, is_feature);

                  if (PREDICT_FALSE (do_handoff))
                    {
                      /* reassembly owned by another thread - hand the
                       * fragment off to its owner */
                      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                      if (is_feature)
                        vnet_buffer (b0)->ip.
                          reass.owner_feature_thread_index =
                          kv.v.thread_index;
                      else
                        vnet_buffer (b0)->ip.reass.owner_thread_index =
                          kv.v.thread_index;
                    }
                  else if (reass)
                    {
                      /* on any failure, drop the whole reassembly and start
                       * with a clean slate for this key */
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
                               &error0, is_feature))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
                        case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass,
                                              is_feature);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_NO_BUF:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_NO_BUF,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass,
                                              is_feature);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          /* drop everything and start with a clean slate */
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_INTERNAL_ERROR,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass,
                                              is_feature);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        }
                    }
                  else
                    {
                      /* no reassembly context could be allocated */
                      next0 = IP4_REASSEMBLY_NEXT_DROP;
                      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
                    }
                }

              b0->error = node->errors[error0];
            }

          /* bi0 == ~0 means the fragment was consumed into a reassembly
           * and nothing is forwarded for it this time around */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (is_feature && IP4_ERROR_NONE == error0)
                {
                  /* bi0 may point to a freshly finalized buffer - reload */
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1151
/* human-readable counterparts of the ip4 error counters, generated from
 * the foreach_ip4_error list; indexed by ip4_error_t */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1157
1158 VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
1159                                vlib_frame_t * frame)
1160 {
1161   return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ );
1162 }
1163
/* *INDENT-OFF* */
/* registration of the standalone reassembly node; fragments are steered
 * here explicitly, reassembled packets continue per buffer metadata */
VLIB_REGISTER_NODE (ip4_reass_node) = {
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1181
1182 VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
1183                                        vlib_node_runtime_t * node,
1184                                        vlib_frame_t * frame)
1185 {
1186   return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ );
1187 }
1188
/* *INDENT-OFF* */
/* registration of the feature-arc reassembly node; on success packets
 * continue along the ip4-unicast feature arc */
VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1205
/* *INDENT-OFF* */
/* hook the feature node into the ip4-unicast arc, before ip4-lookup, so
 * fragments are reassembled before the forwarding decision */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1214
1215 #ifndef CLIB_MARCH_VARIANT
1216 always_inline u32
1217 ip4_reass_get_nbuckets ()
1218 {
1219   ip4_reass_main_t *rm = &ip4_reass_main;
1220   u32 nbuckets;
1221   u8 i;
1222
1223   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1224
1225   for (i = 0; i < 31; i++)
1226     if ((1 << i) >= nbuckets)
1227       break;
1228   nbuckets = 1 << i;
1229
1230   return nbuckets;
1231 }
1232 #endif /* CLIB_MARCH_VARIANT */
1233
/* events delivered to the ip4-reassembly-expire-walk process node */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_reass_event_t;
1238
/* context threaded through ip4_rehash_cb while copying hash entries */
typedef struct
{
  int failure;                  /* set to 1 if any insert into new_hash fails */
  clib_bihash_16_8_t *new_hash; /* destination table being populated */
} ip4_rehash_cb_ctx;
1244
1245 #ifndef CLIB_MARCH_VARIANT
1246 static void
1247 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1248 {
1249   ip4_rehash_cb_ctx *ctx = _ctx;
1250   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1251     {
1252       ctx->failure = 1;
1253     }
1254 }
1255
1256 static void
1257 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1258                       u32 max_reassembly_length, u32 expire_walk_interval_ms)
1259 {
1260   ip4_reass_main.timeout_ms = timeout_ms;
1261   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1262   ip4_reass_main.max_reass_n = max_reassemblies;
1263   ip4_reass_main.max_reass_len = max_reassembly_length;
1264   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1265 }
1266
/**
 * @brief API handler: update reassembly parameters.
 *
 * Stores the new tunables, wakes the expire-walk process so it picks up the
 * new interval, and - if the configured maximum grew enough to need more
 * hash buckets - rebuilds the bihash by copying every entry into a bigger
 * table.
 *
 * @return 0 on success, -1 if rebuilding the hash failed (old table is
 *         kept intact in that case)
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
               u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
                        expire_walk_interval_ms);
  /* nudge the expire-walk process so it re-reads the interval */
  vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      /* grow only - a shrink would risk overloading existing entries */
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy all existing entries into the new, larger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          /* swap the tables; the old one is freed, the new one copied
           * into place by value */
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
        }
    }
  return 0;
}
1303
1304 vnet_api_error_t
1305 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1306                u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1307 {
1308   *timeout_ms = ip4_reass_main.timeout_ms;
1309   *max_reassemblies = ip4_reass_main.max_reass_n;
1310   *max_reassembly_length = ip4_reass_main.max_reass_len;
1311   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1312   return 0;
1313 }
1314
/**
 * @brief Plugin init: set up per-thread state, defaults, hash and queues.
 *
 * Allocates one reassembly pool + spinlock per worker (plus main thread),
 * applies the default tunables, sizes and creates the bihash, caches node
 * indices used at runtime and creates the handoff frame queues.
 */
static clib_error_t *
ip4_reass_init_function (vlib_main_t * vm)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  /* one entry per worker thread plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  /* cache the expire-walk process index for config-changed signaling */
  node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_reass_expire_node_idx = node->index;

  /* defaults must be set before sizing the hash below */
  ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                        IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                        IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                        IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes to reach owner threads */
  rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);


  return error;
}
1357
/* run ip4_reass_init_function at vlib startup */
VLIB_INIT_FUNCTION (ip4_reass_init_function);
1359 #endif /* CLIB_MARCH_VARIANT */
1360
/**
 * @brief Process node: periodically expire stale reassemblies.
 *
 * Sleeps for expire_walk_interval_ms (or until a config-changed event),
 * then walks every thread's reassembly pool under its lock and drops any
 * reassembly not heard from within the configured timeout.
 */
static uword
ip4_reass_walk_expired (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* wake-up only; the new interval is used on the next wait */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* two passes: collect indices first, free afterwards, because
           * freeing while iterating the pool is not safe */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_reass_drop_all (vm, node, rm, reass, reass->is_feature);
            ip4_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  /* never reached - the process loops forever */
  return 0;
}
1432
/* *INDENT-OFF* */
/* registration of the background expire-walk process node */
VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1444
1445 static u8 *
1446 format_ip4_reass_key (u8 * s, va_list * args)
1447 {
1448   ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
1449   s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1450               key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1451               &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1452   return s;
1453 }
1454
1455 static u8 *
1456 format_ip4_reass (u8 * s, va_list * args)
1457 {
1458   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1459   ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
1460
1461   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1462               "last_packet_octet: %u, trace_op_counter: %u\n",
1463               reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
1464               reass->data_len, reass->last_packet_octet,
1465               reass->trace_op_counter);
1466   u32 bi = reass->first_bi;
1467   u32 counter = 0;
1468   while (~0 != bi)
1469     {
1470       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1471       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1472       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1473                   "fragment[%u, %u]\n",
1474                   counter, vnb->ip.reass.range_first,
1475                   vnb->ip.reass.range_last, bi,
1476                   ip4_reass_buffer_get_data_offset (b),
1477                   ip4_reass_buffer_get_data_len (b),
1478                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1479       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1480         {
1481           bi = b->next_buffer;
1482         }
1483       else
1484         {
1485           bi = ~0;
1486         }
1487     }
1488   return s;
1489 }
1490
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the total number of in-flight reassemblies across all threads;
 * with "details", also dumps every reassembly via format_ip4_reass.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      /* take each thread's lock while reading its pool */
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1534
/* *INDENT-OFF* */
/* CLI registration for "show ip4-reassembly" */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1542
1543 #ifndef CLIB_MARCH_VARIANT
1544 vnet_api_error_t
1545 ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1546 {
1547   return vnet_feature_enable_disable ("ip4-unicast",
1548                                       "ip4-reassembly-feature", sw_if_index,
1549                                       enable_disable, 0, 0);
1550 }
1551 #endif /* CLIB_MARCH_VARIANT */
1552
1553
/* error counters specific to the reassembly handoff nodes */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")
1556
1557
/* enum of handoff-node error counter indices */
typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;
1565
/* human-readable strings for the handoff error counters */
static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1571
/* packet trace record for the handoff nodes */
typedef struct
{
  u32 next_worker_index;        /* thread the buffer was handed off to */
} ip4_reassembly_handoff_trace_t;
1576
1577 static u8 *
1578 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1579 {
1580   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1581   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1582   ip4_reassembly_handoff_trace_t *t =
1583     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1584
1585   s =
1586     format (s, "ip4-reassembly-handoff: next-worker %d",
1587             t->next_worker_index);
1588
1589   return s;
1590 }
1591
/**
 * @brief Shared worker for both handoff nodes.
 *
 * For every buffer in the frame, reads the owner thread index previously
 * recorded in the buffer metadata and enqueues the buffer to that thread's
 * frame queue. Buffers that cannot be enqueued (queue congestion) are
 * counted as congestion drops.
 *
 * @param is_feature selects which metadata field and frame queue to use
 * @return number of vectors processed (always frame->n_vectors)
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* pick the frame queue matching the node variant */
  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      /* destination thread was recorded by the reassembly node */
      ti[0] =
        (is_feature) ? vnet_buffer (b[0])->ip.
        reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
        reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything not enqueued was dropped due to queue congestion */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1643
1644 VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
1645                                             vlib_node_runtime_t * node,
1646                                             vlib_frame_t * frame)
1647 {
1648   return ip4_reassembly_handoff_node_inline (vm, node, frame,
1649                                              false /* is_feature */ );
1650 }
1651
1652
/* *INDENT-OFF* */
/* registration of the non-feature handoff node */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1668
1669
1670 /* *INDENT-OFF* */
1671 VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
1672                                                     vlib_node_runtime_t *
1673                                                     node,
1674                                                     vlib_frame_t * frame)
1675 {
1676   return ip4_reassembly_handoff_node_inline (vm, node, frame,
1677                                              true /* is_feature */ );
1678 }
1679 /* *INDENT-ON* */
1680
1681
/* *INDENT-OFF* */
/* registration of the feature-arc handoff node */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1697
1698 /*
1699  * fd.io coding-style-patch-verification: ON
1700  *
1701  * Local Variables:
1702  * eval: (c-set-style "gnu")
1703  * End:
1704  */