bonding: clean up redundant code
[vpp.git] / src / vnet / ip / ip4_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28
/* milliseconds per second - used for ms <-> seconds conversions */
#define MSEC_PER_SEC 1000
/* default time (ms) after which an incomplete reassembly is expired */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* default cap on concurrently tracked reassemblies (per thread) */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* target load factor - presumably used when sizing the bihash; the init
 * code is not visible in this chunk - TODO confirm */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

/* compile-time switch: when non-zero, dump buffer chain indices to stdout */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* print buffer index bi followed by every buffer index chained to it */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
56
/* return codes of the internal reassembly helpers */
typedef enum
{
  IP4_REASS_RC_OK,
  // a consistency check on the range chain failed - indicates a bug
  IP4_REASS_RC_INTERNAL_ERROR,
  // ran out of buffers while linearizing the reassembled chain
  IP4_REASS_RC_NO_BUF,
} ip4_reass_rc_t;

/*
 * Hash key identifying one in-progress reassembly. Laid out to alias
 * exactly two u64s so it can be used directly as a clib_bihash_16_8_t key.
 */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;		// opaque context id - presumably fib/interface scope, TODO confirm
      ip4_address_t src;	// fragment source address
      ip4_address_t dst;	// fragment destination address
      u16 frag_id;		// IP header fragment identification field
      u8 proto;			// IP protocol
      u8 unused;		// padding to fill the second u64
    };
    u64 as_u64[2];
  };
} ip4_reass_key_t;

/* hash value: which thread owns the reassembly and where it lives */
typedef union
{
  struct
  {
    u32 reass_index;		// index into the owning thread's pool
    u32 thread_index;		// owning thread
  };
  u64 as_u64;
} ip4_reass_val_t;

/* convenience overlay of key + value over one bihash kv pair */
typedef union
{
  struct
  {
    ip4_reass_key_t k;
    ip4_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_reass_kv_t;
100
101 always_inline u32
102 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
103 {
104   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
105   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
106 }
107
108 always_inline u16
109 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
110 {
111   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
112   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
113     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
114 }
115
/* one in-progress reassembly context */
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u8 next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;

} ip4_reass_t;

/* per-worker-thread reassembly state */
typedef struct
{
  ip4_reass_t *pool;		// pool of reassembly contexts
  u32 reass_n;			// number of contexts currently allocated
  u32 id_counter;		// source of per-thread reassembly ids
  clib_spinlock_t lock;		// serializes access to this thread's data
} ip4_reass_per_thread_t;

/* global reassembly state singleton */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;		// reassembly timeout in milliseconds
  f64 timeout;			// same timeout, in seconds
  u32 expire_walk_interval_ms;	// period of the expiry walk
  u32 max_reass_n;		// cap on concurrent reassemblies per thread

  // IPv4 runtime
  clib_bihash_16_8_t hash;	// key -> (thread index, pool index)
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

} ip4_reass_main_t;

extern ip4_reass_main_t ip4_reass_main;

#ifndef CLIB_MARCH_VARIANT
ip4_reass_main_t ip4_reass_main;
#endif /* CLIB_MARCH_VARIANT */
179
/* next nodes of the reassembly graph nodes */
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,	// reassembled packet continues on
  IP4_REASSEMBLY_NEXT_DROP,
  IP4_REASSEMBLY_NEXT_HANDOFF,	// fragment belongs to another thread
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;

/* operations recorded in packet traces */
typedef enum
{
  RANGE_NEW,			// a new range was added to the chain
  RANGE_SHRINK,			// existing range trimmed due to partial overlap
  RANGE_DISCARD,		// existing range dropped and freed
  RANGE_OVERLAP,		// incoming fragment ignored - fully covered
  FINALIZE,			// reassembly completed
} ip4_reass_trace_operation_e;

/* snapshot of one fragment range, captured for tracing */
typedef struct
{
  u16 range_first;		// first octet covered by the range
  u16 range_last;		// last octet covered by the range
  u32 range_bi;			// head buffer index of the range
  i32 data_offset;		// offset of usable data within the fragment
  u32 data_len;			// usable data length of the range
  u32 first_bi;			// head buffer index of the whole reassembly
} ip4_reass_range_trace_t;

/* per-packet trace record emitted by the reassembly nodes */
typedef struct
{
  ip4_reass_trace_operation_e action;
  u32 reass_id;			// id of the reassembly context
  ip4_reass_range_trace_t trace_range;
  u32 size_diff;		// overlap bytes trimmed (RANGE_SHRINK only)
  u32 op_id;			// sequence number within the reassembly
  u32 fragment_first;		// first octet of the incoming fragment
  u32 fragment_last;		// last octet of the incoming fragment
  u32 total_data_len;		// bytes collected so far
} ip4_reass_trace_t;

extern vlib_node_registration_t ip4_reass_node;
extern vlib_node_registration_t ip4_reass_node_feature;
221
222 static void
223 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
224                          ip4_reass_range_trace_t * trace)
225 {
226   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
227   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
228   trace->range_first = vnb->ip.reass.range_first;
229   trace->range_last = vnb->ip.reass.range_last;
230   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
231   trace->data_len = ip4_reass_buffer_get_data_len (b);
232   trace->range_bi = bi;
233 }
234
235 static u8 *
236 format_ip4_reass_range_trace (u8 * s, va_list * args)
237 {
238   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
239   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
240               trace->range_last, trace->data_offset, trace->data_len,
241               trace->range_bi);
242   return s;
243 }
244
/* format callback: render one ip4_reass_trace_t record */
static u8 *
format_ip4_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
  s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
  u32 indent = format_get_indent (s);
  s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
              t->trace_range.first_bi, t->total_data_len, t->fragment_first,
              t->fragment_last);
  // second line describes the specific operation recorded
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range,
                  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    }
  return s;
}
281
/*
 * Append one reassembly trace record for buffer bi to the packet trace,
 * stamping it with a per-reassembly sequence number (trace_op_counter).
 */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, ip4_reass_t * reass, u32 bi,
                     ip4_reass_trace_operation_e action, u32 size_diff)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass->id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->op_id = reass->trace_op_counter;
  ++reass->trace_op_counter;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  t->trace_range.first_bi = reass->first_bi;
  t->total_data_len = reass->data_len;
#if 0
  // disabled debug aid - dumps the formatted trace record to stdout
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
314
315
316 always_inline void
317 ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
318                 ip4_reass_t * reass)
319 {
320   clib_bihash_kv_16_8_t kv;
321   kv.key[0] = reass->key.as_u64[0];
322   kv.key[1] = reass->key.as_u64[1];
323   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
324   pool_put (rt->pool, reass);
325   --rt->reass_n;
326 }
327
/*
 * Free every buffer held by a timed-out reassembly. Walks the list of
 * ranges (linked via next_range_bi) and, within each range, the buffer
 * chain (next_buffer), batching the indices for one vlib_buffer_free call.
 */
always_inline void
ip4_reass_on_timeout (vlib_main_t * vm, ip4_reass_main_t * rm,
                      ip4_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      // collect every buffer chained to this range's head buffer
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              // clear the flag so the buffers are freed individually
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  vlib_buffer_free (vm, to_free, vec_len (to_free));
  vec_free (to_free);
}
360
/*
 * Look up the reassembly context for *kv, creating a fresh one if needed.
 *
 * Returns NULL and sets *do_handoff when the context is owned by another
 * thread. A context that has exceeded the timeout is freed and replaced.
 * Returns NULL (without handoff) when the per-thread limit is reached or
 * the hash insert fails. On success, kv->v is filled in and last_heard is
 * refreshed.
 */
static ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
                          ip4_reass_per_thread_t * rt, ip4_reass_kv_t * kv,
                          u8 * do_handoff)
{
  ip4_reass_t *reass = NULL;
  f64 now = vlib_time_now (rm->vlib_main);

  // search overwrites kv in place with the stored value on a hit
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          // context lives on another thread - caller must hand off
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      if (now > reass->last_heard + rm->timeout)
        {
          // stale context - drop its buffers and fall through to re-create
          ip4_reass_on_timeout (vm, rm, reass);
          ip4_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      // per-thread limit reached - caller drops the fragment
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      // make ids globally unique by spacing threads 1e9 apart
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  if (clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 1))
    {
      // hash insert failed - give the context back
      ip4_reass_free (rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
424
/*
 * Build the final reassembled packet once all fragments have arrived.
 *
 * Walks the chain of ranges (linked via next_range_bi); for each range it
 * trims the per-fragment IP header and any previously-overlapped bytes off
 * the front, trims padding off the end, and splices the surviving buffers
 * into one chain. Finally it patches the IP header of the first buffer
 * (length, fragment bits, checksum), linearizes the chain and frees the
 * reassembly context.
 *
 * @param bi0    in/out - set to the reassembled packet's buffer index
 * @param next0  out - next node (feature arc input vs stored next_index)
 * @param error0 out - IP4_ERROR_NONE on success
 * @param is_feature true when running as the feature-arc node
 * @return IP4_REASS_RC_OK on success; IP4_REASS_RC_INTERNAL_ERROR when a
 *         consistency check fails; IP4_REASS_RC_NO_BUF when linearization
 *         cannot allocate buffers
 */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_feature)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      // NOTE(review): '&&' between the two negations only fires when BOTH
      // invariants are violated - this looks like a botched De Morgan of
      // paired checks (range_first >= fragment_first AND range_last >
      // fragment_first); confirm against upstream history
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      // trim the fragment's own IP header plus any overlapped front bytes
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // walk the buffer chain of this range, trimming and splicing
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // splice this buffer onto the packet being assembled
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // last useful buffer of this range - truncate tail padding
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              // no more useful data in this range - free trailing buffers
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  // total_length_not_including_first_buffer excludes the head buffer
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the head IP header to describe the whole reassembled packet
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }

  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (is_feature)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  // smallest fragment seen approximates the path MTU for downstream use
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
620
621 always_inline ip4_reass_rc_t
622 ip4_reass_insert_range_in_chain (vlib_main_t * vm,
623                                  ip4_reass_main_t * rm,
624                                  ip4_reass_per_thread_t * rt,
625                                  ip4_reass_t * reass,
626                                  u32 prev_range_bi, u32 new_next_bi)
627 {
628   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
629   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
630   if (~0 != prev_range_bi)
631     {
632       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
633       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
634       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
635       prev_vnb->ip.reass.next_range_bi = new_next_bi;
636     }
637   else
638     {
639       if (~0 != reass->first_bi)
640         {
641           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
642         }
643       reass->first_bi = new_next_bi;
644     }
645   vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
646   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
647       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
648     {
649       return IP4_REASS_RC_INTERNAL_ERROR;
650     }
651   reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
652   return IP4_REASS_RC_OK;
653 }
654
655 always_inline ip4_reass_rc_t
656 ip4_reass_remove_range_from_chain (vlib_main_t * vm,
657                                    vlib_node_runtime_t * node,
658                                    ip4_reass_main_t * rm,
659                                    ip4_reass_t * reass, u32 prev_range_bi,
660                                    u32 discard_bi)
661 {
662   vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
663   vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
664   if (~0 != prev_range_bi)
665     {
666       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
667       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
668       if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
669         {
670           return IP4_REASS_RC_INTERNAL_ERROR;
671         }
672       prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
673     }
674   else
675     {
676       reass->first_bi = discard_vnb->ip.reass.next_range_bi;
677     }
678   vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
679   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
680       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
681     {
682       return IP4_REASS_RC_INTERNAL_ERROR;
683     }
684   reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
685   while (1)
686     {
687       u32 to_be_freed_bi = discard_bi;
688       if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
689         {
690           ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
691                                0);
692         }
693       if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
694         {
695           discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
696           discard_bi = discard_b->next_buffer;
697           discard_b = vlib_get_buffer (vm, discard_bi);
698           vlib_buffer_free_one (vm, to_be_freed_bi);
699         }
700       else
701         {
702           vlib_buffer_free_one (vm, to_be_freed_bi);
703           break;
704         }
705     }
706   return IP4_REASS_RC_OK;
707 }
708
/*
 * Merge one incoming fragment (buffer *bi0) into the reassembly.
 *
 * Walks the sorted range chain looking for the fragment's place, handling
 * full overlap (ignore), partial overlap (shrink one side) and full
 * takeover (discard the candidate range). When the fragment completes the
 * packet, finalizes the reassembly.
 *
 * On return, *bi0 is ~0 when the fragment was consumed into the chain; on
 * completion it holds the reassembled packet and *next0/*error0 are set;
 * for a fully-duplicate fragment *next0 is DROP with
 * IP4_ERROR_REASS_DUPLICATE_FRAGMENT.
 */
always_inline ip4_reass_rc_t
ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                  ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                  ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                  bool is_feature)
{
  ip4_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  reass->next_index = fvnb->ip.reass.next_index;        // store next_index before it's overwritten
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  // a fresh fragment initially covers exactly its own payload
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      // last fragment tells us the total payload size
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
        ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                         *bi0);
      if (IP4_REASS_RC_OK != rc)
        {
          return rc;
        }
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      return IP4_REASS_RC_OK;
    }
  // track the smallest fragment length as an MTU estimate
  reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
                                         fvnb->ip.reass.estimated_mtu);
  // scan the sorted range chain for where this fragment fits
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
        {
          // this fragments starts after candidate range
          prev_range_bi = candidate_range_bi;
          candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
          if (candidate_vnb->ip.reass.range_last < fragment_last &&
              ~0 == candidate_range_bi)
            {
              // special case - this fragment falls beyond all known ranges
              rc =
                ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                 prev_range_bi, *bi0);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              consumed = 1;
              break;
            }
          continue;
        }
      if (fragment_last < candidate_vnb->ip.reass.range_first)
        {
          // this fragment ends before candidate range without any overlap
          rc =
            ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                             *bi0);
          if (IP4_REASS_RC_OK != rc)
            {
              return rc;
            }
          consumed = 1;
        }
      else
        {
          if (fragment_first >= candidate_vnb->ip.reass.range_first &&
              fragment_last <= candidate_vnb->ip.reass.range_last)
            {
              // this fragment is a (sub)part of existing range, ignore it
              if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_reass_add_trace (vm, node, rm, reass, *bi0,
                                       RANGE_OVERLAP, 0);
                }
              break;
            }
          int discard_candidate = 0;
          if (fragment_first < candidate_vnb->ip.reass.range_first)
            {
              // fragment overlaps the front of the candidate range
              u32 overlap =
                fragment_last - candidate_vnb->ip.reass.range_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  // shrink the candidate's front, insert fragment before it
                  candidate_vnb->ip.reass.range_first += overlap;
                  if (reass->data_len < overlap)
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  reass->data_len -= overlap;
                  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_reass_add_trace (vm, node, rm, reass,
                                           candidate_range_bi, RANGE_SHRINK,
                                           overlap);
                    }
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
              else
                {
                  // fragment fully covers the candidate - discard it
                  discard_candidate = 1;
                }
            }
          else if (fragment_last > candidate_vnb->ip.reass.range_last)
            {
              // fragment overlaps the tail of the candidate range
              u32 overlap =
                candidate_vnb->ip.reass.range_last - fragment_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  // shrink the fragment's front, keep probing later ranges
                  fvnb->ip.reass.range_first += overlap;
                  if (~0 != candidate_vnb->ip.reass.next_range_bi)
                    {
                      prev_range_bi = candidate_range_bi;
                      candidate_range_bi =
                        candidate_vnb->ip.reass.next_range_bi;
                      continue;
                    }
                  else
                    {
                      // special case - last range discarded
                      rc =
                        ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                         candidate_range_bi,
                                                         *bi0);
                      if (IP4_REASS_RC_OK != rc)
                        {
                          return rc;
                        }
                      consumed = 1;
                    }
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else
            {
              discard_candidate = 1;
            }
          if (discard_candidate)
            {
              u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
              // discard candidate range, probe next range
              rc =
                ip4_reass_remove_range_from_chain (vm, node, rm, reass,
                                                   prev_range_bi,
                                                   candidate_range_bi);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              if (~0 != next_range_bi)
                {
                  candidate_range_bi = next_range_bi;
                  continue;
                }
              else
                {
                  // special case - last range discarded
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
            }
        }
      break;
    }
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
    }
  // complete when collected bytes equal the total payload size
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      return ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
                                 is_feature);
    }
  else
    {
      if (consumed)
        {
          *bi0 = ~0;
        }
      else
        {
          // fragment was a full duplicate - drop it
          *next0 = IP4_REASSEMBLY_NEXT_DROP;
          *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
        }
    }
  return rc;
}
937
/**
 * @brief Worker shared by the ip4-reassembly and ip4-reassembly-feature nodes.
 *
 * Walks the frame one buffer at a time.  Unfragmented packets pass straight
 * through.  Fragments are keyed by (fib index, src, dst, fragment id,
 * protocol) and fed to their reassembly context via ip4_reass_update(),
 * which may consume the buffer (bi0 becomes ~0), finalize the reassembly,
 * or flag an error.  Fragments owned by another thread are redirected to
 * the handoff node.  The per-thread lock is held for the whole frame.
 *
 * @param is_feature true when running as a feature on the ip4-unicast arc
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm,
                       vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serialize against the expire-walk process touching this thread's pool */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (is_feature)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
              else
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
            }
          else
            {
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              const u32 fragment_length =
                clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
              const u32 fragment_last = fragment_first + fragment_length - 1;
              if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0))) // 8 is minimum frag length per RFC 791
                {
                  next0 = IP4_REASSEMBLY_NEXT_DROP;
                  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
                }
              else
                {
                  ip4_reass_kv_t kv;
                  u8 do_handoff = 0;

                  /* 16-byte flow key: fib index + src address in word 0,
                   * dst address + fragment id + protocol in word 1 */
                  kv.k.as_u64[0] =
                    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                                   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
                    (u64) ip0->src_address.as_u32 << 32;
                  kv.k.as_u64[1] =
                    (u64) ip0->dst_address.as_u32 |
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

                  ip4_reass_t *reass =
                    ip4_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

                  if (PREDICT_FALSE (do_handoff))
                    {
                      /* reassembly owned by another thread - hand over */
                      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                      if (is_feature)
                        vnet_buffer (b0)->ip.
                          reass.owner_feature_thread_index =
                          kv.v.thread_index;
                      else
                        vnet_buffer (b0)->ip.reass.owner_thread_index =
                          kv.v.thread_index;
                    }
                  else if (reass)
                    {
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
                               &error0, is_feature))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
                        case IP4_REASS_RC_NO_BUF:
                          /* fallthrough */
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          /* drop everything and start with a clean slate */
                          ip4_reass_on_timeout (vm, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        }
                    }
                  else
                    {
                      /* find_or_create returned NULL - table full */
                      next0 = IP4_REASSEMBLY_NEXT_DROP;
                      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
                    }
                }

              b0->error = node->errors[error0];
            }

          /* bi0 == ~0 means the fragment was consumed by the reassembly */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (is_feature && IP4_ERROR_NONE == error0)
                {
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1072
/* Error counter strings, expanded from the foreach_ip4_error list */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1078
/* Standalone (non-feature) reassembly graph node entry point */
VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ );
}
1084
/* *INDENT-OFF* */
/* Registration of the standalone ip4-reassembly node */
VLIB_REGISTER_NODE (ip4_reass_node) = {
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1102
/* Feature-arc reassembly graph node entry point */
VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
                                       vlib_node_runtime_t * node,
                                       vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ );
}
1109
/* *INDENT-OFF* */
/* Registration of the feature-arc ip4-reassembly-feature node */
VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1126
/* *INDENT-OFF* */
/* Hook the feature node into the ip4-unicast arc, before ip4-lookup */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1135
1136 #ifndef CLIB_MARCH_VARIANT
1137 always_inline u32
1138 ip4_reass_get_nbuckets ()
1139 {
1140   ip4_reass_main_t *rm = &ip4_reass_main;
1141   u32 nbuckets;
1142   u8 i;
1143
1144   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1145
1146   for (i = 0; i < 31; i++)
1147     if ((1 << i) >= nbuckets)
1148       break;
1149   nbuckets = 1 << i;
1150
1151   return nbuckets;
1152 }
1153 #endif /* CLIB_MARCH_VARIANT */
1154
/* Events delivered to the ip4-reassembly-expire-walk process node */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_reass_event_t;
1159
/* Context passed to ip4_rehash_cb while copying the hash to a bigger table */
typedef struct
{
  int failure;                  /* set when any add into new_hash fails */
  clib_bihash_16_8_t *new_hash; /* destination table */
} ip4_rehash_cb_ctx;
1165
1166 #ifndef CLIB_MARCH_VARIANT
1167 static void
1168 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1169 {
1170   ip4_rehash_cb_ctx *ctx = _ctx;
1171   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1172     {
1173       ctx->failure = 1;
1174     }
1175 }
1176
1177 static void
1178 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1179                       u32 expire_walk_interval_ms)
1180 {
1181   ip4_reass_main.timeout_ms = timeout_ms;
1182   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1183   ip4_reass_main.max_reass_n = max_reassemblies;
1184   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1185 }
1186
/**
 * @brief Apply new reassembly parameters (API/CLI entry point).
 *
 * Signals the expire-walk process so it re-reads the walk interval.  If the
 * new maximum implies more hash buckets than currently allocated, all
 * entries are copied into a freshly sized bihash; on copy failure the old
 * table is kept and -1 is returned.  The table is only ever grown.
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
               u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies,
                        expire_walk_interval_ms);
  vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      /* grow path: rebuild the hash at the new size */
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* keep the old (still intact) table on failure */
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
        }
    }
  return 0;
}
1223
1224 vnet_api_error_t
1225 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1226                u32 * expire_walk_interval_ms)
1227 {
1228   *timeout_ms = ip4_reass_main.timeout_ms;
1229   *max_reassemblies = ip4_reass_main.max_reass_n;
1230   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1231   return 0;
1232 }
1233
1234 static clib_error_t *
1235 ip4_reass_init_function (vlib_main_t * vm)
1236 {
1237   ip4_reass_main_t *rm = &ip4_reass_main;
1238   clib_error_t *error = 0;
1239   u32 nbuckets;
1240   vlib_node_t *node;
1241
1242   rm->vlib_main = vm;
1243   rm->vnet_main = vnet_get_main ();
1244
1245   vec_validate (rm->per_thread_data, vlib_num_workers ());
1246   ip4_reass_per_thread_t *rt;
1247   vec_foreach (rt, rm->per_thread_data)
1248   {
1249     clib_spinlock_init (&rt->lock);
1250     pool_alloc (rt->pool, rm->max_reass_n);
1251   }
1252
1253   node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
1254   ASSERT (node);
1255   rm->ip4_reass_expire_node_idx = node->index;
1256
1257   ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
1258                         IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
1259                         IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1260
1261   nbuckets = ip4_reass_get_nbuckets ();
1262   clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);
1263
1264   node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
1265   ASSERT (node);
1266   rm->ip4_drop_idx = node->index;
1267
1268   rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
1269   rm->fq_feature_index =
1270     vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);
1271
1272
1273   return error;
1274 }
1275
1276 VLIB_INIT_FUNCTION (ip4_reass_init_function);
1277 #endif /* CLIB_MARCH_VARIANT */
1278
/**
 * @brief Process node: periodically scan every thread's reassembly pool and
 * tear down contexts whose last_heard timestamp exceeds the timeout.
 * Sleeps expire_walk_interval_ms between walks; a CONFIG_CHANGED event only
 * wakes it early so the new interval takes effect.
 */
static uword
ip4_reass_walk_expired (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* woken only to re-read the walk interval above */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect first, free afterwards - cannot free elements while
           * iterating the pool */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_reass_on_timeout (vm, rm, reass);
            ip4_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          /* keep the allocation, just reset for the next iteration */
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1350
/* *INDENT-OFF* */
/* Registration of the expire-walk process node */
VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1362
1363 static u8 *
1364 format_ip4_reass_key (u8 * s, va_list * args)
1365 {
1366   ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
1367   s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1368               key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1369               &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1370   return s;
1371 }
1372
1373 static u8 *
1374 format_ip4_reass (u8 * s, va_list * args)
1375 {
1376   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1377   ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
1378
1379   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1380               "last_packet_octet: %u, trace_op_counter: %u\n",
1381               reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
1382               reass->data_len, reass->last_packet_octet,
1383               reass->trace_op_counter);
1384   u32 bi = reass->first_bi;
1385   u32 counter = 0;
1386   while (~0 != bi)
1387     {
1388       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1389       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1390       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1391                   "fragment[%u, %u]\n",
1392                   counter, vnb->ip.reass.range_first,
1393                   vnb->ip.reass.range_last, bi,
1394                   ip4_reass_buffer_get_data_offset (b),
1395                   ip4_reass_buffer_get_data_len (b),
1396                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1397       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1398         {
1399           bi = b->next_buffer;
1400         }
1401       else
1402         {
1403           bi = ~0;
1404         }
1405     }
1406   return s;
1407 }
1408
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the per-worker reassembly count summed over all threads; with
 * "details", also dumps every active reassembly context.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      /* lock out the datapath and expire-walk while reading the pool */
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1452
/* *INDENT-OFF* */
/* CLI command registration for "show ip4-reassembly" */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1460
1461 #ifndef CLIB_MARCH_VARIANT
1462 vnet_api_error_t
1463 ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1464 {
1465   return vnet_feature_enable_disable ("ip4-unicast",
1466                                       "ip4-reassembly-feature", sw_if_index,
1467                                       enable_disable, 0, 0);
1468 }
1469 #endif /* CLIB_MARCH_VARIANT */
1470
1471
/* Error counters for the handoff nodes */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;
1483
/* Handoff error counter strings, expanded from the list above */
static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1489
/* Trace record for the handoff nodes */
typedef struct
{
  u32 next_worker_index;        /* thread the buffer was handed off to */
} ip4_reassembly_handoff_trace_t;
1494
1495 static u8 *
1496 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1497 {
1498   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1499   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1500   ip4_reassembly_handoff_trace_t *t =
1501     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1502
1503   s =
1504     format (s, "ip4-reassembly-handoff: next-worker %d",
1505             t->next_worker_index);
1506
1507   return s;
1508 }
1509
/**
 * @brief Worker shared by both handoff nodes: enqueue every buffer in the
 * frame to the thread recorded in its opaque (owner_thread_index or
 * owner_feature_thread_index, depending on the arc).  Buffers that cannot
 * be enqueued are dropped and counted as congestion drops.
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* each arc has its own frame queue */
  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* first pass: collect the destination thread for every buffer */
  while (n_left_from > 0)
    {
      ti[0] =
        (is_feature) ? vnet_buffer (b[0])->ip.
        reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
        reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* second pass: hand the whole frame over in one call */
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1561
/* Non-feature handoff graph node entry point */
VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1569
1570
/* *INDENT-OFF* */
/* Registration of the ip4-reassembly-handoff node */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1586
1587
/* *INDENT-OFF* */
/* Feature-arc handoff graph node entry point */
VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1598
1599
/* *INDENT-OFF* */
/* Registration of the ip4-reass-feature-hoff node */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1615
1616 /*
1617  * fd.io coding-style-patch-verification: ON
1618  *
1619  * Local Variables:
1620  * eval: (c-set-style "gnu")
1621  * End:
1622  */