/* ip: migrate old MULTIARCH macros to VLIB_NODE_FN
 * (vpp.git) src/vnet/ip/ip4_reassembly.c */
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28
29 #define MSEC_PER_SEC 1000
30 #define IP4_REASS_TIMEOUT_DEFAULT_MS 100
31 #define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
32 #define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
33 #define IP4_REASS_HT_LOAD_FACTOR (0.75)
34
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* Debug helper: print the tag, the buffer index and every chained buffer
 * index to stdout.  Compiled in only when IP4_REASS_DEBUG_BUFFERS is set.
 * Note: the stringified tag is followed by a space so the output reads
 * e.g. "enqueue buffer 5" instead of "enqueuebuffer 5". */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what " buffer %u", _bi);              \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
56
/* Internal return codes of the reassembly helper functions. */
typedef enum
{
  IP4_REASS_RC_OK,		// operation completed successfully
  IP4_REASS_RC_INTERNAL_ERROR,	// inconsistent internal state detected
  IP4_REASS_RC_NO_BUF,		// out of buffers (chain linearize failed)
} ip4_reass_rc_t;
63
/* 16-byte key identifying one in-progress reassembly; the flat as_u64[2]
 * view is used directly as a bihash_16_8 key. */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;		// context id - presumably fib/interface scope; TODO confirm against callers
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;		// IPv4 identification field
      u8 proto;
      u8 unused;		// padding to 16 bytes
    };
    u64 as_u64[2];		// flat view for hashing
  };
} ip4_reass_key_t;
80
/* Bihash value: locates the reassembly context (pool index) and the
 * worker thread that owns it (used to decide handoff). */
typedef union
{
  struct
  {
    u32 reass_index;		// index into the owning thread's pool
    u32 thread_index;		// owning worker thread
  };
  u64 as_u64;			// flat view stored in the bihash
} ip4_reass_val_t;
90
/* Convenience overlay: key + value laid out exactly as a
 * clib_bihash_kv_16_8_t so a single object can be passed to the bihash API. */
typedef union
{
  struct
  {
    ip4_reass_key_t k;
    ip4_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_reass_kv_t;
100
101 always_inline u32
102 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
103 {
104   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
105   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
106 }
107
108 always_inline u16
109 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
110 {
111   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
112   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
113     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
114 }
115
/* One in-progress reassembly context, pool-allocated per worker thread. */
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u8 next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;

} ip4_reass_t;
138
/* Per-worker-thread reassembly state. */
typedef struct
{
  ip4_reass_t *pool;		// pool of reassembly contexts owned by this thread
  u32 reass_n;			// number of active reassemblies in the pool
  u32 id_counter;		// source of per-thread-unique reassembly ids
  clib_spinlock_t lock;		// serializes access to this thread's pool
} ip4_reass_per_thread_t;
146
/* Global state of the IPv4 reassembly feature. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;		// reassembly timeout (configured, milliseconds)
  f64 timeout;			// same timeout, precomputed in seconds
  u32 expire_walk_interval_ms;	// period of the expiry walk process
  u32 max_reass_n;		// cap on concurrent reassemblies per thread

  // IPv4 runtime
  clib_bihash_16_8_t hash;	// key -> (thread, reass index) lookup
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;			// frame queue for the non-feature node
  u32 fq_feature_index;		// frame queue for the feature node

} ip4_reass_main_t;
173
174 extern ip4_reass_main_t ip4_reass_main;
175
176 #ifndef CLIB_MARCH_VARIANT
177 ip4_reass_main_t ip4_reass_main;
178 #endif /* CLIB_MARCH_VARIANT */
179
/* Next-node indices of the reassembly graph nodes. */
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,	// packet complete - continue ip4 input (feature mode)
  IP4_REASSEMBLY_NEXT_DROP,
  IP4_REASSEMBLY_NEXT_HANDOFF,	// fragment belongs to another worker thread
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;
187
/* Operations recorded in packet traces. */
typedef enum
{
  RANGE_NEW,			// fragment inserted as a new range
  RANGE_SHRINK,			// existing range shrunk due to partial overlap
  RANGE_DISCARD,		// existing range discarded (fully overlapped)
  RANGE_OVERLAP,		// fragment ignored - subset of an existing range
  FINALIZE,			// reassembly completed
} ip4_reass_trace_operation_e;
196
/* Snapshot of a single fragment range, for tracing. */
typedef struct
{
  u16 range_first;		// first byte offset covered by the range
  u16 range_last;		// last byte offset covered by the range
  u32 range_bi;			// buffer index holding the range
  i32 data_offset;		// offset of contributed data within the fragment
  u32 data_len;			// bytes contributed by the range
  u32 first_bi;			// head buffer of the reassembly chain
} ip4_reass_range_trace_t;
206
/* Per-packet trace record for the reassembly nodes. */
typedef struct
{
  ip4_reass_trace_operation_e action;	// what happened to this fragment
  u32 reass_id;			// id of the reassembly context
  ip4_reass_range_trace_t trace_range;	// the affected range
  u32 size_diff;		// bytes trimmed on RANGE_SHRINK
  u32 op_id;			// sequence number within the reassembly
  u32 fragment_first;		// first byte offset of this fragment
  u32 fragment_last;		// last byte offset of this fragment
  u32 total_data_len;		// total bytes collected so far
} ip4_reass_trace_t;
218
219 extern vlib_node_registration_t ip4_reass_node;
220 extern vlib_node_registration_t ip4_reass_node_feature;
221
222 static void
223 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
224                          ip4_reass_range_trace_t * trace)
225 {
226   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
227   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
228   trace->range_first = vnb->ip.reass.range_first;
229   trace->range_last = vnb->ip.reass.range_last;
230   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
231   trace->data_len = ip4_reass_buffer_get_data_len (b);
232   trace->range_bi = bi;
233 }
234
235 static u8 *
236 format_ip4_reass_range_trace (u8 * s, va_list * args)
237 {
238   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
239   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
240               trace->range_last, trace->data_offset, trace->data_len,
241               trace->range_bi);
242   return s;
243 }
244
245 static u8 *
246 format_ip4_reass_trace (u8 * s, va_list * args)
247 {
248   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
249   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
250   ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
251   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
252   u32 indent = format_get_indent (s);
253   s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
254               t->trace_range.first_bi, t->total_data_len, t->fragment_first,
255               t->fragment_last);
256   switch (t->action)
257     {
258     case RANGE_SHRINK:
259       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
260                   format_ip4_reass_range_trace, &t->trace_range,
261                   t->size_diff);
262       break;
263     case RANGE_DISCARD:
264       s = format (s, "\n%Udiscard %U", format_white_space, indent,
265                   format_ip4_reass_range_trace, &t->trace_range);
266       break;
267     case RANGE_NEW:
268       s = format (s, "\n%Unew %U", format_white_space, indent,
269                   format_ip4_reass_range_trace, &t->trace_range);
270       break;
271     case RANGE_OVERLAP:
272       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
273                   format_ip4_reass_range_trace, &t->trace_range);
274       break;
275     case FINALIZE:
276       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
277       break;
278     }
279   return s;
280 }
281
/* Append a reassembly trace record for buffer bi to the vlib packet trace.
 * No-op (and clears the traced flag) if the buffer's trace entry has
 * already been recycled. */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, ip4_reass_t * reass, u32 bi,
                     ip4_reass_trace_operation_e action, u32 size_diff)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass->id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  // per-reassembly monotonically increasing operation id
  t->op_id = reass->trace_op_counter;
  ++reass->trace_op_counter;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  t->trace_range.first_bi = reass->first_bi;
  t->total_data_len = reass->data_len;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
314
315
316 always_inline void
317 ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
318                 ip4_reass_t * reass)
319 {
320   clib_bihash_kv_16_8_t kv;
321   kv.key[0] = reass->key.as_u64[0];
322   kv.key[1] = reass->key.as_u64[1];
323   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
324   pool_put (rt->pool, reass);
325   --rt->reass_n;
326 }
327
328 always_inline void
329 ip4_reass_on_timeout (vlib_main_t * vm, ip4_reass_main_t * rm,
330                       ip4_reass_t * reass)
331 {
332   u32 range_bi = reass->first_bi;
333   vlib_buffer_t *range_b;
334   vnet_buffer_opaque_t *range_vnb;
335   u32 *to_free = NULL;
336   while (~0 != range_bi)
337     {
338       range_b = vlib_get_buffer (vm, range_bi);
339       range_vnb = vnet_buffer (range_b);
340       u32 bi = range_bi;
341       while (~0 != bi)
342         {
343           vec_add1 (to_free, bi);
344           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
345           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
346             {
347               bi = b->next_buffer;
348               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
349             }
350           else
351             {
352               bi = ~0;
353             }
354         }
355       range_bi = range_vnb->ip.reass.next_range_bi;
356     }
357   vlib_buffer_free (vm, to_free, vec_len (to_free));
358   vec_free (to_free);
359 }
360
/* Look up the reassembly context for kv's key, creating one if needed.
 *
 * Returns NULL and sets *do_handoff when the context is owned by another
 * worker thread.  Returns NULL (without handoff) when a timed-out context
 * was purged and no new one could be made, when the per-thread limit is
 * reached, or when the bihash insert fails.  On success kv->v is updated
 * to point at the returned context. */
static ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
                          ip4_reass_per_thread_t * rt, ip4_reass_kv_t * kv,
                          u8 * do_handoff)
{
  ip4_reass_t *reass = NULL;
  f64 now = vlib_time_now (rm->vlib_main);

  // search overwrites kv->v with the stored value on hit
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      // stale context - drop its buffers and recycle it
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_reass_on_timeout (vm, rm, reass);
          ip4_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      // ids are unique across threads: thread index scales the id space
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  if (clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 1))
    {
      ip4_reass_free (rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
424
/* All fragments have arrived: splice the per-range buffer chains into one
 * contiguous packet, trimming per-fragment IP headers and overlap bytes,
 * rewrite the first IP header (length, fragmentation bits, checksum),
 * linearize the chain and free the reassembly context.
 *
 * On success *bi0/*next0/*error0 describe the finished packet.  Returns
 * IP4_REASS_RC_NO_BUF if linearization runs out of buffers, or
 * IP4_REASS_RC_INTERNAL_ERROR on any internal inconsistency. */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_feature)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  // outer loop: one iteration per range in the sorted range list
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      // drop this fragment's IP header plus any leading overlap bytes
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // inner loop: walk the chained buffers within this range
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  vlib_buffer_free_one (vm, tmp_bi);
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // append this buffer to the reassembled chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // last useful buffer of the range - clip trailing bytes
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
            }
          else
            {
              // fully-trimmed trailing buffer - free it
              vlib_buffer_free_one (vm, tmp_bi);
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
            }
          if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              tmp_bi = tmp->next_buffer;
              tmp = vlib_get_buffer (vm, tmp->next_buffer);
            }
          else
            {
              break;
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the surviving IP header for the reassembled packet
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }

  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (is_feature)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      // non-feature mode: resume at the next index captured from the buffer
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
608
609 always_inline ip4_reass_rc_t
610 ip4_reass_insert_range_in_chain (vlib_main_t * vm,
611                                  ip4_reass_main_t * rm,
612                                  ip4_reass_per_thread_t * rt,
613                                  ip4_reass_t * reass,
614                                  u32 prev_range_bi, u32 new_next_bi)
615 {
616   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
617   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
618   if (~0 != prev_range_bi)
619     {
620       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
621       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
622       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
623       prev_vnb->ip.reass.next_range_bi = new_next_bi;
624     }
625   else
626     {
627       if (~0 != reass->first_bi)
628         {
629           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
630         }
631       reass->first_bi = new_next_bi;
632     }
633   vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
634   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
635       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
636     {
637       return IP4_REASS_RC_INTERNAL_ERROR;
638     }
639   reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
640   return IP4_REASS_RC_OK;
641 }
642
/* Unlink range discard_bi from the sorted range list (prev_range_bi is its
 * predecessor, ~0 when it is the head), subtract its payload from the
 * data_len accounting and free all buffers chained under it. */
always_inline ip4_reass_rc_t
ip4_reass_remove_range_from_chain (vlib_main_t * vm,
                                   vlib_node_runtime_t * node,
                                   ip4_reass_main_t * rm,
                                   ip4_reass_t * reass, u32 prev_range_bi,
                                   u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      // discarding the head range
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
  // free every buffer chained under the discarded range
  while (1)
    {
      vlib_buffer_free_one (vm, discard_bi);
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
                               0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b = vlib_get_buffer (vm, discard_bi);
        }
      else
        {
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
694
/* Process one fragment against an existing reassembly context: insert it
 * into the sorted range list, resolving overlaps by shrinking or
 * discarding ranges, then finalize the packet if it is now complete.
 *
 * On return, *bi0 == ~0 means the fragment was consumed (held for later);
 * otherwise *bi0/*next0/*error0 describe what to do with the buffer
 * (finished packet, or drop as duplicate). */
always_inline ip4_reass_rc_t
ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                  ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                  ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                  bool is_feature)
{
  ip4_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  reass->next_index = fvnb->ip.reass.next_index;        // store next_index before it's overwritten
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  // initially the fragment's range covers its full extent
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      // last fragment tells us the total payload length
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
        ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                         *bi0);
      if (IP4_REASS_RC_OK != rc)
        {
          return rc;
        }
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      return IP4_REASS_RC_OK;
    }
  // track the smallest fragment seen - used as the MTU estimate
  reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
                                         fvnb->ip.reass.estimated_mtu);
  // walk the sorted range list looking for where this fragment fits
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
        {
          // this fragments starts after candidate range
          prev_range_bi = candidate_range_bi;
          candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
          if (candidate_vnb->ip.reass.range_last < fragment_last &&
              ~0 == candidate_range_bi)
            {
              // special case - this fragment falls beyond all known ranges
              rc =
                ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                 prev_range_bi, *bi0);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              consumed = 1;
              break;
            }
          continue;
        }
      if (fragment_last < candidate_vnb->ip.reass.range_first)
        {
          // this fragment ends before candidate range without any overlap
          rc =
            ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                             *bi0);
          if (IP4_REASS_RC_OK != rc)
            {
              return rc;
            }
          consumed = 1;
        }
      else
        {
          // some overlap with the candidate range
          if (fragment_first >= candidate_vnb->ip.reass.range_first &&
              fragment_last <= candidate_vnb->ip.reass.range_last)
            {
              // this fragment is a (sub)part of existing range, ignore it
              if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_reass_add_trace (vm, node, rm, reass, *bi0,
                                       RANGE_OVERLAP, 0);
                }
              break;
            }
          int discard_candidate = 0;
          if (fragment_first < candidate_vnb->ip.reass.range_first)
            {
              // fragment overlaps the candidate's front
              u32 overlap =
                fragment_last - candidate_vnb->ip.reass.range_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  // shrink the candidate's front and insert before it
                  candidate_vnb->ip.reass.range_first += overlap;
                  if (reass->data_len < overlap)
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  reass->data_len -= overlap;
                  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_reass_add_trace (vm, node, rm, reass,
                                           candidate_range_bi, RANGE_SHRINK,
                                           overlap);
                    }
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else if (fragment_last > candidate_vnb->ip.reass.range_last)
            {
              // fragment overlaps the candidate's tail
              u32 overlap =
                candidate_vnb->ip.reass.range_last - fragment_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  // clip the fragment's own front, then keep probing
                  fvnb->ip.reass.range_first += overlap;
                  if (~0 != candidate_vnb->ip.reass.next_range_bi)
                    {
                      prev_range_bi = candidate_range_bi;
                      candidate_range_bi =
                        candidate_vnb->ip.reass.next_range_bi;
                      continue;
                    }
                  else
                    {
                      // special case - last range discarded
                      rc =
                        ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                         candidate_range_bi,
                                                         *bi0);
                      if (IP4_REASS_RC_OK != rc)
                        {
                          return rc;
                        }
                      consumed = 1;
                    }
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else
            {
              // fragment fully covers the candidate range
              discard_candidate = 1;
            }
          if (discard_candidate)
            {
              u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
              // discard candidate range, probe next range
              rc =
                ip4_reass_remove_range_from_chain (vm, node, rm, reass,
                                                   prev_range_bi,
                                                   candidate_range_bi);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              if (~0 != next_range_bi)
                {
                  candidate_range_bi = next_range_bi;
                  continue;
                }
              else
                {
                  // special case - last range discarded
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
            }
        }
      break;
    }
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
    }
  // complete when a last-fragment arrived and all bytes are accounted for
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      return ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
                                 is_feature);
    }
  else
    {
      if (consumed)
        {
          *bi0 = ~0;
        }
      else
        {
          // not inserted anywhere - treat as duplicate and drop
          *next0 = IP4_REASSEMBLY_NEXT_DROP;
          *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
        }
    }
  return rc;
}
923
/**
 * @brief Common worker for the ip4-reassembly and ip4-reassembly-feature
 * graph nodes.
 *
 * Walks the frame one buffer at a time.  Unfragmented packets pass straight
 * through; fragments are validated, matched to (or create) a per-thread
 * reassembly context, and either consumed into the context, finalized, or
 * handed off to the owning thread.
 *
 * @param is_feature true when running as an ip4-unicast feature,
 *        false when running as the standalone node (compile-time constant
 *        at each instantiation).
 * @return number of vectors processed (frame->n_vectors).
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm,
                       vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serialize access to this thread's reassembly pool for the whole frame */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (is_feature)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
              else
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
            }
          else
            {
              /* sanity-check the fragment before touching any state */
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              const u32 fragment_length =
                clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
              const u32 fragment_last = fragment_first + fragment_length - 1;
              if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0))) // 8 is minimum frag length per RFC 791
                {
                  next0 = IP4_REASSEMBLY_NEXT_DROP;
                  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
                }
              else
                {
                  /* reassembly key: fib index, src, dst, fragment id,
                   * protocol - packed into two u64 words */
                  ip4_reass_kv_t kv;
                  u8 do_handoff = 0;

                  kv.k.as_u64[0] =
                    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                                   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
                    (u64) ip0->src_address.as_u32 << 32;
                  kv.k.as_u64[1] =
                    (u64) ip0->dst_address.as_u32 |
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

                  ip4_reass_t *reass =
                    ip4_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

                  if (PREDICT_FALSE (do_handoff))
                    {
                      /* reassembly is owned by another thread - hand the
                       * buffer off to it */
                      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                      if (is_feature)
                        vnet_buffer (b0)->ip.
                          reass.owner_feature_thread_index =
                          kv.v.thread_index;
                      else
                        vnet_buffer (b0)->ip.reass.owner_thread_index =
                          kv.v.thread_index;
                    }
                  else if (reass)
                    {
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
                               &error0, is_feature))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
                        case IP4_REASS_RC_NO_BUF:
                          /* fallthrough */
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          /* drop everything and start with a clean slate */
                          ip4_reass_on_timeout (vm, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        }
                    }
                  else
                    {
                      /* find_or_create returned NULL without handoff =>
                       * per-thread reassembly limit hit */
                      next0 = IP4_REASSEMBLY_NEXT_DROP;
                      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
                    }
                }

              b0->error = node->errors[error0];
            }

          /* bi0 == ~0 means the fragment was consumed into a pending
           * reassembly; otherwise enqueue it to the selected next node */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (is_feature && IP4_ERROR_NONE == error0)
                {
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1058
/* human-readable strings for all IP4 error counters, indexed by ip4_error_t */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1064
/* standalone ip4-reassembly node entry point */
VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ );
}
1070
/* graph node registration for the standalone reassembly path */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_reass_node) = {
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1088
/* ip4-unicast feature-arc variant of the reassembly node */
VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
                                       vlib_node_runtime_t * node,
                                       vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ );
}
1095
/* graph node registration for the feature-arc reassembly path */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1112
/* register the feature node on the ip4-unicast arc, before ip4-lookup */
/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1121
1122 #ifndef CLIB_MARCH_VARIANT
1123 always_inline u32
1124 ip4_reass_get_nbuckets ()
1125 {
1126   ip4_reass_main_t *rm = &ip4_reass_main;
1127   u32 nbuckets;
1128   u8 i;
1129
1130   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1131
1132   for (i = 0; i < 31; i++)
1133     if ((1 << i) >= nbuckets)
1134       break;
1135   nbuckets = 1 << i;
1136
1137   return nbuckets;
1138 }
1139 #endif /* CLIB_MARCH_VARIANT */
1140
/* events delivered to the ip4-reassembly-expire-walk process node */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_reass_event_t;
1145
/* context passed to ip4_rehash_cb while copying entries into a bigger hash */
typedef struct
{
  int failure;                  /* set to 1 if any insert into new_hash fails */
  clib_bihash_16_8_t *new_hash; /* destination hash being populated */
} ip4_rehash_cb_ctx;
1151
1152 #ifndef CLIB_MARCH_VARIANT
1153 static void
1154 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1155 {
1156   ip4_rehash_cb_ctx *ctx = _ctx;
1157   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1158     {
1159       ctx->failure = 1;
1160     }
1161 }
1162
1163 static void
1164 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1165                       u32 expire_walk_interval_ms)
1166 {
1167   ip4_reass_main.timeout_ms = timeout_ms;
1168   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1169   ip4_reass_main.max_reass_n = max_reassemblies;
1170   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1171 }
1172
/**
 * @brief Apply new reassembly parameters (API/CLI entry point).
 *
 * Stores the new values, signals the expiry walker process so it can pick
 * up a changed interval, and grows the bihash when the new maximum number
 * of reassemblies requires more buckets.  The hash is only ever grown,
 * never shrunk.
 *
 * @return 0 on success, -1 if rehashing into the larger table failed.
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
               u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies,
                        expire_walk_interval_ms);
  vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      /* bucket count grew - build a bigger hash, copy all entries over,
       * and swap it in only if every copy succeeded */
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
        }
    }
  return 0;
}
1209
1210 vnet_api_error_t
1211 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1212                u32 * expire_walk_interval_ms)
1213 {
1214   *timeout_ms = ip4_reass_main.timeout_ms;
1215   *max_reassemblies = ip4_reass_main.max_reass_n;
1216   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1217   return 0;
1218 }
1219
/**
 * @brief Plugin init: set defaults, create per-thread state, the bihash,
 * and the handoff frame queues.
 *
 * Looks up the expire-walk and ip4-drop node indices by name, so those
 * nodes must be registered before this init function runs.
 *
 * @return 0 on success (no failure paths set error here).
 */
static clib_error_t *
ip4_reass_init_function (vlib_main_t * vm)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  /* one entry per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_reass_expire_node_idx = node->index;

  ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                        IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                        IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes to move buffers across threads */
  rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);


  return error;
}

VLIB_INIT_FUNCTION (ip4_reass_init_function);
#endif /* CLIB_MARCH_VARIANT */
1264
/**
 * @brief Process node that periodically expires stale reassemblies.
 *
 * Sleeps for expire_walk_interval_ms (or until a config-changed event),
 * then for each thread collects indices of reassemblies whose last_heard
 * time is older than the timeout and frees them under that thread's lock.
 * Never returns.
 */
static uword
ip4_reass_walk_expired (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* wake up to re-read the (possibly changed) walk interval */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect first, free second - freeing inside pool_foreach_index
           * would invalidate the iteration */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_reass_on_timeout (vm, rm, reass);
            ip4_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1336
/* registration of the expiry walker as a process (cooperative) node */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1348
/* format helper: print one reassembly key (fib/xx id, addresses, frag id,
 * protocol) for CLI/trace output */
static u8 *
format_ip4_reass_key (u8 * s, va_list * args)
{
  ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
  s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
              key->xx_id, format_ip4_address, &key->src, format_ip4_address,
              &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}
1358
1359 static u8 *
1360 format_ip4_reass (u8 * s, va_list * args)
1361 {
1362   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1363   ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
1364
1365   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1366               "last_packet_octet: %u, trace_op_counter: %u\n",
1367               reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
1368               reass->data_len, reass->last_packet_octet,
1369               reass->trace_op_counter);
1370   u32 bi = reass->first_bi;
1371   u32 counter = 0;
1372   while (~0 != bi)
1373     {
1374       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1375       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1376       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1377                   "fragment[%u, %u]\n",
1378                   counter, vnb->ip.reass.range_first,
1379                   vnb->ip.reass.range_last, bi,
1380                   ip4_reass_buffer_get_data_offset (b),
1381                   ip4_reass_buffer_get_data_len (b),
1382                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1383       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1384         {
1385           bi = b->next_buffer;
1386         }
1387       else
1388         {
1389           bi = ~0;
1390         }
1391     }
1392   return s;
1393 }
1394
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the per-worker reassembly totals and, with "details", every
 * active reassembly context.  Takes each thread's lock while reading.
 *
 * @return always 0 (no CLI error paths).
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1438
/* CLI command registration for the status display above */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1446
#ifndef CLIB_MARCH_VARIANT
/* enable/disable the reassembly feature on an interface's ip4-unicast arc */
vnet_api_error_t
ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-reassembly-feature", sw_if_index,
                                      enable_disable, 0, 0);
}
#endif /* CLIB_MARCH_VARIANT */
1456
1457
/* errors counted by the reassembly handoff nodes */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")
1460
1461
/* error counter indices generated from the foreach macro above */
typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;
1469
/* human-readable strings matching ip4_reassembly_handoff_error_t */
static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1475
/* per-packet trace record for the handoff nodes */
typedef struct
{
  u32 next_worker_index;        /* thread the buffer was handed off to */
} ip4_reassembly_handoff_trace_t;
1480
1481 static u8 *
1482 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1483 {
1484   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1485   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1486   ip4_reassembly_handoff_trace_t *t =
1487     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1488
1489   s =
1490     format (s, "ip4-reassembly-handoff: next-worker %d",
1491             t->next_worker_index);
1492
1493   return s;
1494 }
1495
/**
 * @brief Common worker for both handoff nodes: enqueue every buffer in the
 * frame to the worker thread that owns its reassembly context.
 *
 * The owning thread index was stored in the buffer opaque by
 * ip4_reassembly_inline; which field is read depends on is_feature.
 * Buffers that cannot be enqueued (congested frame queue) are counted as
 * congestion drops.
 *
 * @return number of vectors processed (frame->n_vectors).
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* each variant has its own frame queue, created at init time */
  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      ti[0] =
        (is_feature) ? vnet_buffer (b[0])->ip.
        reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
        reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything not enqueued was dropped by the frame queue */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1547
/* handoff node entry point for the standalone reassembly path */
VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1555
1556
/* registration for the standalone handoff node */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1572
1573
/* handoff node entry point for the feature-arc reassembly path */
/* *INDENT-OFF* */
VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1584
1585
/* registration for the feature-arc handoff node */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1601
1602 /*
1603  * fd.io coding-style-patch-verification: ON
1604  *
1605  * Local Variables:
1606  * eval: (c-set-style "gnu")
1607  * End:
1608  */