API: Add python2.7 support for enum flags via aenum
[vpp.git] / src / vnet / ip / ip4_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28
/* Conversion factor: milliseconds per second, used when turning
 * timeout_ms into a f64 timeout in seconds. */
#define MSEC_PER_SEC 1000
/* Default reassembly timeout: a context not heard from for this long
 * is expired on next lookup or by the expiry walk. */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000	// 10 seconds default
/* Default cap on concurrent reassembly contexts per thread. */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* Target load factor used when sizing the bihash table. */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

/* Compile-time switch: set to 1 to print buffer chains to stdout for
 * debugging; 0 compiles the macro away entirely. */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* Walk the vlib buffer chain starting at bi and print every buffer
 * index in the chain.  Relies on a local `vm` being in scope. */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
56
/* Return codes used internally by the reassembly helpers. */
typedef enum
{
  IP4_REASS_RC_OK,		/* operation succeeded */
  IP4_REASS_RC_INTERNAL_ERROR,	/* invariant violated - reassembly state inconsistent */
  IP4_REASS_RC_NO_BUF,		/* buffer allocation failed (chain linearize) */
} ip4_reass_rc_t;
63
/* 16-byte hash key identifying one reassembly: fib/interface id plus
 * the (src, dst, frag_id, proto) tuple from the IPv4 header.  The
 * as_u64 view allows it to be used directly as a bihash_16_8 key. */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;		/* context id (e.g. fib/sw_if_index) - disambiguates tuples */
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;		/* IPv4 identification field */
      u8 proto;			/* IPv4 protocol field */
      u8 unused;		/* padding to make the key exactly 16 bytes */
    };
    u64 as_u64[2];		/* raw view for bihash key copy/compare */
  };
} ip4_reass_key_t;
80
/* 8-byte hash value: locates a reassembly context as (owning thread,
 * index into that thread's pool).  as_u64 is the raw bihash value. */
typedef union
{
  struct
  {
    u32 reass_index;		/* index into per-thread reassembly pool */
    u32 thread_index;		/* thread that owns the context */
  };
  u64 as_u64;
} ip4_reass_val_t;
90
/* Convenience overlay: key+value pair viewed either as typed fields
 * or as the clib_bihash_kv_16_8_t expected by the hash API. */
typedef union
{
  struct
  {
    ip4_reass_key_t k;
    ip4_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_reass_kv_t;
100
101 always_inline u32
102 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
103 {
104   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
105   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
106 }
107
108 always_inline u16
109 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
110 {
111   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
112   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
113     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
114 }
115
/* One in-progress reassembly.  Fragments are kept as a linked list of
 * ranges (buffer chains) rooted at first_bi, ordered by offset. */
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u8 next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;

} ip4_reass_t;
138
/* Per-worker reassembly state; lock serializes access from the expiry
 * walk and (if needed) other threads. */
typedef struct
{
  ip4_reass_t *pool;		/* pool of reassembly contexts */
  u32 reass_n;			/* number of contexts currently in use */
  u32 id_counter;		/* source for ip4_reass_t.id values */
  clib_spinlock_t lock;		/* protects pool and hash updates for this thread */
} ip4_reass_per_thread_t;
146
/* Global reassembly state: configuration, the shared lookup hash,
 * per-thread pools and cached node/frame-queue indices. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;		/* reassembly timeout, as configured */
  f64 timeout;			/* same timeout converted to seconds */
  u32 expire_walk_interval_ms;	/* period of the expiry process */
  u32 max_reass_n;		/* per-thread cap on concurrent reassemblies */

  // IPv4 runtime
  clib_bihash_16_8_t hash;	/* key -> (thread, pool index) lookup */
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;			/* frame queue for non-feature handoff */
  u32 fq_feature_index;		/* frame queue for feature-arc handoff */

} ip4_reass_main_t;

/* Single global instance. */
ip4_reass_main_t ip4_reass_main;
175
/* Next-node indices for the reassembly graph nodes. */
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,	/* continue on the feature arc / ip4-input */
  IP4_REASSEMBLY_NEXT_DROP,	/* drop the buffer */
  IP4_REASSEMBLY_NEXT_HANDOFF,	/* hand off to the owning worker thread */
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;
183
/* Actions recorded in packet traces as fragments are merged. */
typedef enum
{
  RANGE_NEW,			/* fragment inserted as a new range */
  RANGE_SHRINK,			/* existing range trimmed due to partial overlap */
  RANGE_DISCARD,		/* existing range fully superseded and freed */
  RANGE_OVERLAP,		/* fragment fully covered by existing data - ignored */
  FINALIZE,			/* reassembly complete, packet emitted */
} ip4_reass_trace_operation_e;
192
/* Snapshot of one range's state, captured for tracing. */
typedef struct
{
  u16 range_first;		/* first byte offset covered by the range */
  u16 range_last;		/* last byte offset covered by the range */
  u32 range_bi;			/* buffer index heading the range's chain */
  i32 data_offset;		/* bytes trimmed off the fragment's front */
  u32 data_len;			/* payload bytes contributed by the range */
  u32 first_bi;			/* head buffer of the whole reassembly */
} ip4_reass_range_trace_t;
202
/* One packet-trace record emitted by ip4_reass_add_trace. */
typedef struct
{
  ip4_reass_trace_operation_e action;
  u32 reass_id;			/* ip4_reass_t.id of the context */
  ip4_reass_range_trace_t trace_range;
  u32 size_diff;		/* bytes trimmed (RANGE_SHRINK only) */
  u32 op_id;			/* monotonically increasing per-context op counter */
  u32 fragment_first;		/* traced fragment's first byte offset */
  u32 fragment_last;		/* traced fragment's last byte offset */
  u32 total_data_len;		/* bytes collected so far in the context */
} ip4_reass_trace_t;
214
215 static void
216 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
217                          ip4_reass_range_trace_t * trace)
218 {
219   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
220   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
221   trace->range_first = vnb->ip.reass.range_first;
222   trace->range_last = vnb->ip.reass.range_last;
223   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
224   trace->data_len = ip4_reass_buffer_get_data_len (b);
225   trace->range_bi = bi;
226 }
227
228 static u8 *
229 format_ip4_reass_range_trace (u8 * s, va_list * args)
230 {
231   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
232   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
233               trace->range_last, trace->data_offset, trace->data_len,
234               trace->range_bi);
235   return s;
236 }
237
238 u8 *
239 format_ip4_reass_trace (u8 * s, va_list * args)
240 {
241   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
242   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
243   ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
244   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
245   u32 indent = format_get_indent (s);
246   s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
247               t->trace_range.first_bi, t->total_data_len, t->fragment_first,
248               t->fragment_last);
249   switch (t->action)
250     {
251     case RANGE_SHRINK:
252       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
253                   format_ip4_reass_range_trace, &t->trace_range,
254                   t->size_diff);
255       break;
256     case RANGE_DISCARD:
257       s = format (s, "\n%Udiscard %U", format_white_space, indent,
258                   format_ip4_reass_range_trace, &t->trace_range);
259       break;
260     case RANGE_NEW:
261       s = format (s, "\n%Unew %U", format_white_space, indent,
262                   format_ip4_reass_range_trace, &t->trace_range);
263       break;
264     case RANGE_OVERLAP:
265       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
266                   format_ip4_reass_range_trace, &t->trace_range);
267       break;
268     case FINALIZE:
269       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
270       break;
271     }
272   return s;
273 }
274
/* Append a reassembly trace record to buffer bi's packet trace.
 * Bumps the context's per-operation counter so trace entries can be
 * ordered.  If the buffer's trace slot has already been recycled, the
 * traced flag is cleared and nothing is recorded. */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, ip4_reass_t * reass, u32 bi,
                     ip4_reass_trace_operation_e action, u32 size_diff)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass->id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->op_id = reass->trace_op_counter;
  ++reass->trace_op_counter;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  t->trace_range.first_bi = reass->first_bi;
  t->total_data_len = reass->data_len;
#if 0
  /* Dead debug code: dumps the formatted trace to stdout. */
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
307
308
309 always_inline void
310 ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
311                 ip4_reass_t * reass)
312 {
313   clib_bihash_kv_16_8_t kv;
314   kv.key[0] = reass->key.as_u64[0];
315   kv.key[1] = reass->key.as_u64[1];
316   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
317   pool_put (rt->pool, reass);
318   --rt->reass_n;
319 }
320
/* Free every buffer held by a timed-out reassembly.  Walks the list of
 * ranges (outer loop) and, for each range, its vlib buffer chain
 * (inner loop), collecting indices into a vector so the free happens
 * in one batched call.  NEXT_PRESENT is cleared as we go so the
 * batched free does not follow the chains a second time. */
always_inline void
ip4_reass_on_timeout (vlib_main_t * vm, ip4_reass_main_t * rm,
                      ip4_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      /* advance to the next range before its head buffer is freed */
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  vlib_buffer_free (vm, to_free, vec_len (to_free));
  vec_free (to_free);
}
353
/* Look up the reassembly context for key kv, creating one if absent.
 *
 * On a hash hit owned by a different thread, sets *do_handoff and
 * returns NULL (caller must hand the packet off).  A hit older than
 * the configured timeout is expired and recreated.  Returns NULL
 * (without setting *do_handoff) when the per-thread context limit is
 * reached or the hash insert fails.  On success kv's value half is
 * updated to point at the returned context. */
ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
                          ip4_reass_per_thread_t * rt, ip4_reass_kv_t * kv,
                          u8 * do_handoff)
{
  ip4_reass_t *reass = NULL;
  f64 now = vlib_time_now (rm->vlib_main);

  /* search overwrites kv in place with the stored value on a hit */
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      if (now > reass->last_heard + rm->timeout)
        {
          /* stale context - drop its buffers and recycle it */
          ip4_reass_on_timeout (vm, rm, reass);
          ip4_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      /* at capacity - refuse to start a new reassembly */
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      /* id is globally unique: thread index scaled out of range of
       * any single thread's counter */
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  if (clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 1))
    {
      /* hash insert failed - roll back the pool allocation */
      ip4_reass_free (rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
417
/* Stitch a completed reassembly into a single packet.
 *
 * Walks the list of ranges; within each range, trims superseded bytes
 * off the front (dropping whole buffers where needed) and off the end
 * (by capping current_length), then links the kept buffers into one
 * chain headed by the first buffer (whose IPv4 header is preserved).
 * Finally rewrites the header (length, cleared fragment fields,
 * checksum), linearizes the chain, frees the context and returns the
 * finished buffer/next/error through the out parameters.
 *
 * Returns IP4_REASS_RC_INTERNAL_ERROR if any consistency check fails
 * and IP4_REASS_RC_NO_BUF if linearization cannot allocate. */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_feature)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* NOTE(review): sanity check fires only when BOTH invariants
       * fail; `||` would enforce each one - confirm intent. */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      /* walk this range's vlib chain, trimming and splicing */
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  vlib_buffer_free_one (vm, tmp_bi);
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              /* splice this buffer onto the output chain */
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  /* last kept buffer of the range - cap it (trim_end) */
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
            }
          else
            {
              /* tail buffer entirely beyond kept data - free it */
              vlib_buffer_free_one (vm, tmp_bi);
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
            }
          if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              tmp_bi = tmp->next_buffer;
              tmp = vlib_get_buffer (vm, tmp->next_buffer);
            }
          else
            {
              break;
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  /* rewrite the IPv4 header of the now-complete packet */
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }

  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (is_feature)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
601
/* Insert buffer new_next_bi as a range after prev_range_bi (or at the
 * head when prev_range_bi is ~0) and account its payload bytes into
 * reass->data_len.  Range list links live in the buffers' vnet
 * opaque data (ip.reass.next_range_bi). */
always_inline ip4_reass_rc_t
ip4_reass_insert_range_in_chain (vlib_main_t * vm,
                                 ip4_reass_main_t * rm,
                                 ip4_reass_per_thread_t * rt,
                                 ip4_reass_t * reass,
                                 u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      /* splice after the given predecessor */
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      /* new head of the range list */
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  /* NOTE(review): check fires only when BOTH invariants fail; `||`
   * would enforce each one - confirm intent. */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
635
/* Unlink range discard_bi from the range list (prev_range_bi is its
 * predecessor, or ~0 when it is the head), subtract its contribution
 * from reass->data_len and free every buffer in its vlib chain,
 * tracing each discard when the buffers are traced. */
always_inline ip4_reass_rc_t
ip4_reass_remove_range_from_chain (vlib_main_t * vm,
                                   vlib_node_runtime_t * node,
                                   ip4_reass_main_t * rm,
                                   ip4_reass_t * reass, u32 prev_range_bi,
                                   u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  /* NOTE(review): check fires only when BOTH invariants fail; `||`
   * would enforce each one - confirm intent. */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
  /* free the range's whole vlib chain */
  while (1)
    {
      vlib_buffer_free_one (vm, discard_bi);
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
                               0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b = vlib_get_buffer (vm, discard_bi);
        }
      else
        {
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
687
/* Merge one fragment (buffer *bi0) into the reassembly context.
 *
 * Inserts the fragment into the ordered range list, resolving
 * overlaps by trimming either the new fragment or existing ranges and
 * discarding fully superseded ranges.  On consuming the fragment,
 * *bi0 is set to ~0; if the fragment is a duplicate/overlap that adds
 * nothing, *next0/*error0 are set to drop it.  When the final byte
 * count matches the known last octet, the packet is finalized via
 * ip4_reass_finalize and returned through the out parameters. */
always_inline ip4_reass_rc_t
ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                  ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                  ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                  bool is_feature)
{
  ip4_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  reass->next_index = fvnb->ip.reass.next_index;	// store next_index before it's overwritten
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  /* the fragment initially contributes its whole span */
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      /* MF clear: this fragment ends the packet */
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
        ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                         *bi0);
      if (IP4_REASS_RC_OK != rc)
        {
          return rc;
        }
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      return IP4_REASS_RC_OK;
    }
  /* track smallest fragment seen as an MTU estimate for re-fragmentation */
  reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
                                         fvnb->ip.reass.estimated_mtu);
  /* walk the ordered range list to find this fragment's slot */
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
        {
          // this fragments starts after candidate range
          prev_range_bi = candidate_range_bi;
          candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
          if (candidate_vnb->ip.reass.range_last < fragment_last &&
              ~0 == candidate_range_bi)
            {
              // special case - this fragment falls beyond all known ranges
              rc =
                ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                 prev_range_bi, *bi0);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              consumed = 1;
              break;
            }
          continue;
        }
      if (fragment_last < candidate_vnb->ip.reass.range_first)
        {
          // this fragment ends before candidate range without any overlap
          rc =
            ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                             *bi0);
          if (IP4_REASS_RC_OK != rc)
            {
              return rc;
            }
          consumed = 1;
        }
      else
        {
          if (fragment_first >= candidate_vnb->ip.reass.range_first &&
              fragment_last <= candidate_vnb->ip.reass.range_last)
            {
              // this fragment is a (sub)part of existing range, ignore it
              if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_reass_add_trace (vm, node, rm, reass, *bi0,
                                       RANGE_OVERLAP, 0);
                }
              break;
            }
          int discard_candidate = 0;
          if (fragment_first < candidate_vnb->ip.reass.range_first)
            {
              /* new fragment starts before candidate: shrink candidate's
               * front, or discard it if fully covered */
              u32 overlap =
                fragment_last - candidate_vnb->ip.reass.range_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  candidate_vnb->ip.reass.range_first += overlap;
                  if (reass->data_len < overlap)
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  reass->data_len -= overlap;
                  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_reass_add_trace (vm, node, rm, reass,
                                           candidate_range_bi, RANGE_SHRINK,
                                           overlap);
                    }
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else if (fragment_last > candidate_vnb->ip.reass.range_last)
            {
              /* new fragment extends past candidate: shrink the new
               * fragment's front and keep probing, or discard candidate */
              u32 overlap =
                candidate_vnb->ip.reass.range_last - fragment_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  fvnb->ip.reass.range_first += overlap;
                  if (~0 != candidate_vnb->ip.reass.next_range_bi)
                    {
                      prev_range_bi = candidate_range_bi;
                      candidate_range_bi =
                        candidate_vnb->ip.reass.next_range_bi;
                      continue;
                    }
                  else
                    {
                      // special case - last range discarded
                      rc =
                        ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                         candidate_range_bi,
                                                         *bi0);
                      if (IP4_REASS_RC_OK != rc)
                        {
                          return rc;
                        }
                      consumed = 1;
                    }
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else
            {
              discard_candidate = 1;
            }
          if (discard_candidate)
            {
              u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
              // discard candidate range, probe next range
              rc =
                ip4_reass_remove_range_from_chain (vm, node, rm, reass,
                                                   prev_range_bi,
                                                   candidate_range_bi);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              if (~0 != next_range_bi)
                {
                  candidate_range_bi = next_range_bi;
                  continue;
                }
              else
                {
                  // special case - last range discarded
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
            }
        }
      break;
    }
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
    }
  /* complete when the collected byte count reaches the known end */
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      return ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
                                 is_feature);
    }
  else
    {
      if (consumed)
        {
          *bi0 = ~0;
        }
      else
        {
          *next0 = IP4_REASSEMBLY_NEXT_DROP;
          *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
        }
    }
  return rc;
}
916
/**
 * @brief Shared dispatch loop for both ip4 reassembly graph nodes.
 *
 * Whole (unfragmented) packets pass straight through. Fragments are keyed
 * by (fib index, src, dst, fragment id, protocol); the flow either belongs
 * to this thread (fed into the reassembly state under the per-thread
 * spinlock) or is handed off to its owning thread.
 *
 * @param vm vlib main
 * @param node this node's runtime
 * @param frame frame of buffer indices to process
 * @param is_feature true for the "ip4-unicast" feature-arc variant,
 *        false for the standalone node (affects next-node selection and
 *        which handoff thread-index field is written)
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm,
                       vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* all reassembly state of this thread is guarded by one spinlock,
     held across the whole frame */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (is_feature)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
              else
                {
                  /* non-feature path: use the next index recorded in the
                     buffer's reass opaque data */
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
            }
          else
            {
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              const u32 fragment_length =
                clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
              const u32 fragment_last = fragment_first + fragment_length - 1;
              if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0))) // 8 is minimum frag length per RFC 791
                {
                  next0 = IP4_REASSEMBLY_NEXT_DROP;
                  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
                }
              else
                {
                  ip4_reass_kv_t kv;
                  u8 do_handoff = 0;

                  /* flow key: fib index + src address in word 0,
                     dst address + fragment id + protocol in word 1 */
                  kv.k.as_u64[0] =
                    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                                   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
                    (u64) ip0->src_address.as_u32 << 32;
                  kv.k.as_u64[1] =
                    (u64) ip0->dst_address.as_u32 |
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

                  ip4_reass_t *reass =
                    ip4_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

                  if (PREDICT_FALSE (do_handoff))
                    {
                      /* flow is owned by another thread - record the owner
                         and send the buffer to the handoff node */
                      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                      if (is_feature)
                        vnet_buffer (b0)->ip.
                          reass.owner_feature_thread_index =
                          kv.v.thread_index;
                      else
                        vnet_buffer (b0)->ip.reass.owner_thread_index =
                          kv.v.thread_index;
                    }
                  else if (reass)
                    {
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
                               &error0, is_feature))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
                        case IP4_REASS_RC_NO_BUF:
                          /* fallthrough */
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          /* drop everything and start with a clean slate */
                          ip4_reass_on_timeout (vm, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        }
                    }
                  else
                    {
                      /* find_or_create returned NULL - per-thread
                         reassembly limit hit */
                      next0 = IP4_REASSEMBLY_NEXT_DROP;
                      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
                    }
                }

              b0->error = node->errors[error0];
            }

          /* bi0 == ~0 means ip4_reass_update consumed the fragment into a
             pending reassembly - nothing to enqueue for this packet */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (is_feature && IP4_ERROR_NONE == error0)
                {
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1051
/* Error strings generated from the global ip4 error list so the array
 * indices line up with ip4_error_t values (e.g. IP4_ERROR_REASS_*). */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1057
1058 static uword
1059 ip4_reassembly (vlib_main_t * vm, vlib_node_runtime_t * node,
1060                 vlib_frame_t * frame)
1061 {
1062   return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ );
1063 }
1064
/* *INDENT-OFF* */
/* Graph node registration for the standalone (non-feature) variant. */
VLIB_REGISTER_NODE (ip4_reass_node, static) = {
    .function = ip4_reassembly,
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
/* *INDENT-ON* */

VLIB_NODE_FUNCTION_MULTIARCH (ip4_reass_node, ip4_reassembly);
1085
1086 static uword
1087 ip4_reassembly_feature (vlib_main_t * vm,
1088                         vlib_node_runtime_t * node, vlib_frame_t * frame)
1089 {
1090   return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ );
1091 }
1092
/* *INDENT-OFF* */
/* Graph node registration for the feature-arc variant; handoff goes to
 * the dedicated "ip4-reass-feature-hoff" node. */
VLIB_REGISTER_NODE (ip4_reass_node_feature, static) = {
    .function = ip4_reassembly_feature,
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
/* *INDENT-ON* */

VLIB_NODE_FUNCTION_MULTIARCH (ip4_reass_node_feature, ip4_reassembly_feature);
1112
/* *INDENT-OFF* */
/* Register the feature on the "ip4-unicast" arc so reassembly runs before
 * ip4-lookup when enabled on an interface. */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1121
1122 always_inline u32
1123 ip4_reass_get_nbuckets ()
1124 {
1125   ip4_reass_main_t *rm = &ip4_reass_main;
1126   u32 nbuckets;
1127   u8 i;
1128
1129   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1130
1131   for (i = 0; i < 31; i++)
1132     if ((1 << i) >= nbuckets)
1133       break;
1134   nbuckets = 1 << i;
1135
1136   return nbuckets;
1137 }
1138
/* Events delivered to the expiry walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_reass_event_t;
1143
/* Context passed to ip4_rehash_cb while copying entries into a resized
 * hash table. */
typedef struct
{
  int failure;                  /* set to 1 if any insertion fails */
  clib_bihash_16_8_t *new_hash; /* destination table being populated */
} ip4_rehash_cb_ctx;
1149
1150 static void
1151 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1152 {
1153   ip4_rehash_cb_ctx *ctx = _ctx;
1154   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1155     {
1156       ctx->failure = 1;
1157     }
1158 }
1159
1160 static void
1161 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1162                       u32 expire_walk_interval_ms)
1163 {
1164   ip4_reass_main.timeout_ms = timeout_ms;
1165   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1166   ip4_reass_main.max_reass_n = max_reassemblies;
1167   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1168 }
1169
/**
 * @brief Apply new reassembly parameters (API/CLI entry point).
 *
 * Stores the new values, signals the expiry walk process so a changed
 * walk interval takes effect, and - when the new maximum implies a larger
 * hash table - rebuilds the bihash and copies all existing entries over.
 *
 * @return 0 on success, -1 if rehashing failed (old hash kept intact)
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
               u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies,
                        expire_walk_interval_ms);
  /* wake the expiry process so it re-reads the configuration */
  vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  /* grow-only rehash: build a new table, copy every entry, then swap */
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* keep the old hash; discard the partially filled new one */
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
        }
    }
  return 0;
}
1206
1207 vnet_api_error_t
1208 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1209                u32 * expire_walk_interval_ms)
1210 {
1211   *timeout_ms = ip4_reass_main.timeout_ms;
1212   *max_reassemblies = ip4_reass_main.max_reass_n;
1213   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1214   return 0;
1215 }
1216
1217 static clib_error_t *
1218 ip4_reass_init_function (vlib_main_t * vm)
1219 {
1220   ip4_reass_main_t *rm = &ip4_reass_main;
1221   clib_error_t *error = 0;
1222   u32 nbuckets;
1223   vlib_node_t *node;
1224
1225   rm->vlib_main = vm;
1226   rm->vnet_main = vnet_get_main ();
1227
1228   vec_validate (rm->per_thread_data, vlib_num_workers ());
1229   ip4_reass_per_thread_t *rt;
1230   vec_foreach (rt, rm->per_thread_data)
1231   {
1232     clib_spinlock_init (&rt->lock);
1233     pool_alloc (rt->pool, rm->max_reass_n);
1234   }
1235
1236   node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
1237   ASSERT (node);
1238   rm->ip4_reass_expire_node_idx = node->index;
1239
1240   ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
1241                         IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
1242                         IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1243
1244   nbuckets = ip4_reass_get_nbuckets ();
1245   clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);
1246
1247   node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
1248   ASSERT (node);
1249   rm->ip4_drop_idx = node->index;
1250
1251   rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
1252   rm->fq_feature_index =
1253     vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);
1254
1255
1256   return error;
1257 }
1258
1259 VLIB_INIT_FUNCTION (ip4_reass_init_function);
1260
1261 static uword
1262 ip4_reass_walk_expired (vlib_main_t * vm,
1263                         vlib_node_runtime_t * node, vlib_frame_t * f)
1264 {
1265   ip4_reass_main_t *rm = &ip4_reass_main;
1266   uword event_type, *event_data = 0;
1267
1268   while (true)
1269     {
1270       vlib_process_wait_for_event_or_clock (vm,
1271                                             (f64)
1272                                             rm->expire_walk_interval_ms /
1273                                             (f64) MSEC_PER_SEC);
1274       event_type = vlib_process_get_events (vm, &event_data);
1275
1276       switch (event_type)
1277         {
1278         case ~0:                /* no events => timeout */
1279           /* nothing to do here */
1280           break;
1281         case IP4_EVENT_CONFIG_CHANGED:
1282           break;
1283         default:
1284           clib_warning ("BUG: event type 0x%wx", event_type);
1285           break;
1286         }
1287       f64 now = vlib_time_now (vm);
1288
1289       ip4_reass_t *reass;
1290       int *pool_indexes_to_free = NULL;
1291
1292       uword thread_index = 0;
1293       int index;
1294       const uword nthreads = vlib_num_workers () + 1;
1295       for (thread_index = 0; thread_index < nthreads; ++thread_index)
1296         {
1297           ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1298           clib_spinlock_lock (&rt->lock);
1299
1300           vec_reset_length (pool_indexes_to_free);
1301           /* *INDENT-OFF* */
1302           pool_foreach_index (index, rt->pool, ({
1303                                 reass = pool_elt_at_index (rt->pool, index);
1304                                 if (now > reass->last_heard + rm->timeout)
1305                                   {
1306                                     vec_add1 (pool_indexes_to_free, index);
1307                                   }
1308                               }));
1309           /* *INDENT-ON* */
1310           int *i;
1311           /* *INDENT-OFF* */
1312           vec_foreach (i, pool_indexes_to_free)
1313           {
1314             ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
1315             ip4_reass_on_timeout (vm, rm, reass);
1316             ip4_reass_free (rm, rt, reass);
1317           }
1318           /* *INDENT-ON* */
1319
1320           clib_spinlock_unlock (&rt->lock);
1321         }
1322
1323       vec_free (pool_indexes_to_free);
1324       if (event_data)
1325         {
1326           _vec_len (event_data) = 0;
1327         }
1328     }
1329
1330   return 0;
1331 }
1332
/* *INDENT-OFF* */
/* Expiry walk registration; VLIB_NODE_TYPE_PROCESS makes this a
 * cooperative process rather than a per-packet graph node. */
VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1344
/* Format helper: prints one ip4_reass_key_t for CLI/trace output. */
static u8 *
format_ip4_reass_key (u8 * s, va_list * args)
{
  ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
  s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
              key->xx_id, format_ip4_address, &key->src, format_ip4_address,
              &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}
1354
1355 static u8 *
1356 format_ip4_reass (u8 * s, va_list * args)
1357 {
1358   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1359   ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
1360
1361   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1362               "last_packet_octet: %u, trace_op_counter: %u\n",
1363               reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
1364               reass->data_len, reass->last_packet_octet,
1365               reass->trace_op_counter);
1366   u32 bi = reass->first_bi;
1367   u32 counter = 0;
1368   while (~0 != bi)
1369     {
1370       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1371       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1372       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1373                   "fragment[%u, %u]\n",
1374                   counter, vnb->ip.reass.range_first,
1375                   vnb->ip.reass.range_last, bi,
1376                   ip4_reass_buffer_get_data_offset (b),
1377                   ip4_reass_buffer_get_data_len (b),
1378                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1379       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1380         {
1381           bi = b->next_buffer;
1382         }
1383       else
1384         {
1385           bi = ~0;
1386         }
1387     }
1388   return s;
1389 }
1390
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the active reassembly count summed over all threads; with
 * "details" also dumps every context via format_ip4_reass.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  /* workers plus the main thread */
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1434
/* *INDENT-OFF* */
/* CLI command registration for "show ip4-reassembly". */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1442
1443 vnet_api_error_t
1444 ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1445 {
1446   return vnet_feature_enable_disable ("ip4-unicast",
1447                                       "ip4-reassembly-feature", sw_if_index,
1448                                       enable_disable, 0, 0);
1449 }
1450
1451
/* Error counters of the handoff nodes; only congestion drops for now. */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;
1463
/* String table matching ip4_reassembly_handoff_error_t, generated from
 * the same macro so the ordering stays in sync. */
static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1469
/* Trace record for the handoff nodes: which worker got the buffer. */
typedef struct
{
  u32 next_worker_index;
} ip4_reassembly_handoff_trace_t;
1474
1475 static u8 *
1476 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1477 {
1478   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1479   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1480   ip4_reassembly_handoff_trace_t *t =
1481     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1482
1483   s =
1484     format (s, "ip4-reassembly-handoff: next-worker %d",
1485             t->next_worker_index);
1486
1487   return s;
1488 }
1489
/**
 * @brief Shared body of both handoff nodes.
 *
 * Reads the owning thread index recorded on each buffer (feature or
 * non-feature field, depending on the variant) and enqueues the whole
 * frame to the matching frame queue; anything the enqueue could not
 * place is counted as a congestion drop.
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      /* destination thread was stored by ip4_reassembly_inline when it
       * detected the flow belongs to another thread */
      ti[0] =
        (is_feature) ? vnet_buffer (b[0])->ip.
        reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
        reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* NOTE(review): trailing 1 presumably enables drop-on-congestion inside
   * vlib_buffer_enqueue_to_thread - confirm against the vlib API */
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1541
/* Entry point of the non-feature handoff node. */
VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1549
1550
/* *INDENT-OFF* */
/* Handoff node registration; next node 0 ("error-drop") is the only exit. */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1566
1567
/* *INDENT-OFF* */
/* Entry point of the feature-arc handoff node. */
VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1578
1579
/* *INDENT-OFF* */
/* Feature-arc handoff node registration; next node 0 ("error-drop") is
 * the only exit. */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1595
1596 /*
1597  * fd.io coding-style-patch-verification: ON
1598  *
1599  * Local Variables:
1600  * eval: (c-set-style "gnu")
1601  * End:
1602  */