/* vpp: src/vnet/ip/ip4_reassembly.c */
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28
/* milliseconds per second - used to convert configured ms to seconds */
#define MSEC_PER_SEC 1000
/* default timeout after which an incomplete reassembly is expired */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* default cap on concurrent reassemblies (per thread pool) */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* default cap on number of fragments in a single reassembly */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* target load factor used when sizing the bihash */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

/* set to 1 to printf buffer chains for debugging */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* print buffer index 'bi' and every chained buffer index to stdout */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
57
/* Return codes used internally by the reassembly helpers. */
typedef enum
{
  IP4_REASS_RC_OK,
  /* fragment count exceeded the configured per-reassembly maximum */
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  /* an internal invariant was violated */
  IP4_REASS_RC_INTERNAL_ERROR,
  /* buffer shortage while linearizing the reassembled chain */
  IP4_REASS_RC_NO_BUF,
} ip4_reass_rc_t;
65
/* Hash table key identifying one reassembly: fib/interface id + 5 IP header
 * fields that identify a fragmented datagram (RFC 791: src, dst, id, proto).
 * The union view as two u64s is what gets stored in the 16_8 bihash. */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_reass_key_t;

/* Hash table value: which thread owns the reassembly and its pool index. */
typedef union
{
  struct
  {
    u32 reass_index;
    u32 thread_index;
  };
  u64 as_u64;
} ip4_reass_val_t;

/* Convenience overlay of key+value onto a bihash key/value pair. */
typedef union
{
  struct
  {
    ip4_reass_key_t k;
    ip4_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_reass_kv_t;
102
103 always_inline u32
104 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
105 {
106   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
107   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
108 }
109
110 always_inline u16
111 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
112 {
113   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
114   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
115     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
116 }
117
/* One in-progress reassembly context. Ranges of received data are kept as a
 * linked list of buffer chains, threaded through vnet_buffer opaque data
 * starting at first_bi. */
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  // NOTE(review): u8 here vs wider next_index in buffer metadata - assumes
  // next indices fit in 8 bits; confirm against vnet_buffer opaque layout
  u8 next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
} ip4_reass_t;
141
/* Per-worker-thread reassembly state. */
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_reass_t *pool;
  // number of reassemblies currently allocated from the pool
  u32 reass_n;
  // monotonic counter used when generating reassembly ids
  u32 id_counter;
  // lock protecting this thread's pool
  clib_spinlock_t lock;
} ip4_reass_per_thread_t;
149
/* Global state for the IPv4 reassembly feature. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout in seconds (timeout_ms / MSEC_PER_SEC), cached for comparisons
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  // key -> (thread index, reassembly index) lookup table
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

} ip4_reass_main_t;
179
/* single global instance holding all reassembly state */
extern ip4_reass_main_t ip4_reass_main;

#ifndef CLIB_MARCH_VARIANT
/* define the global only in the default march variant to avoid duplicates */
ip4_reass_main_t ip4_reass_main;
#endif /* CLIB_MARCH_VARIANT */
185
/* Next-node indices used by the reassembly graph nodes. */
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,
  IP4_REASSEMBLY_NEXT_DROP,
  IP4_REASSEMBLY_NEXT_HANDOFF,
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;

/* Operations recorded in packet traces. */
typedef enum
{
  RANGE_NEW,
  RANGE_SHRINK,
  RANGE_DISCARD,
  RANGE_OVERLAP,
  FINALIZE,
} ip4_reass_trace_operation_e;
202
/* Snapshot of one data range, captured into a packet trace record. */
typedef struct
{
  u16 range_first;
  u16 range_last;
  u32 range_bi;
  i32 data_offset;
  u32 data_len;
  u32 first_bi;
} ip4_reass_range_trace_t;

/* One packet-trace record emitted by the reassembly nodes. */
typedef struct
{
  ip4_reass_trace_operation_e action;
  u32 reass_id;
  ip4_reass_range_trace_t trace_range;
  // bytes removed from an existing range (RANGE_SHRINK only)
  u32 size_diff;
  u32 op_id;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
} ip4_reass_trace_t;
224
/* graph node registrations - defined later in this file */
extern vlib_node_registration_t ip4_reass_node;
extern vlib_node_registration_t ip4_reass_node_feature;
227
228 static void
229 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
230                          ip4_reass_range_trace_t * trace)
231 {
232   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
233   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
234   trace->range_first = vnb->ip.reass.range_first;
235   trace->range_last = vnb->ip.reass.range_last;
236   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
237   trace->data_len = ip4_reass_buffer_get_data_len (b);
238   trace->range_bi = bi;
239 }
240
241 static u8 *
242 format_ip4_reass_range_trace (u8 * s, va_list * args)
243 {
244   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
245   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
246               trace->range_last, trace->data_offset, trace->data_len,
247               trace->range_bi);
248   return s;
249 }
250
251 static u8 *
252 format_ip4_reass_trace (u8 * s, va_list * args)
253 {
254   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
255   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
256   ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
257   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
258   u32 indent = format_get_indent (s);
259   s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
260               t->trace_range.first_bi, t->total_data_len, t->fragment_first,
261               t->fragment_last);
262   switch (t->action)
263     {
264     case RANGE_SHRINK:
265       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
266                   format_ip4_reass_range_trace, &t->trace_range,
267                   t->size_diff);
268       break;
269     case RANGE_DISCARD:
270       s = format (s, "\n%Udiscard %U", format_white_space, indent,
271                   format_ip4_reass_range_trace, &t->trace_range);
272       break;
273     case RANGE_NEW:
274       s = format (s, "\n%Unew %U", format_white_space, indent,
275                   format_ip4_reass_range_trace, &t->trace_range);
276       break;
277     case RANGE_OVERLAP:
278       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
279                   format_ip4_reass_range_trace, &t->trace_range);
280       break;
281     case FINALIZE:
282       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
283       break;
284     }
285   return s;
286 }
287
/* Add a trace record for buffer 'bi' describing 'action' on 'reass'.
 * Silently clears the traced flag if the buffer's trace slot is gone. */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, ip4_reass_t * reass, u32 bi,
                     ip4_reass_trace_operation_e action, u32 size_diff)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  if (pool_is_free_index (vm->trace_main.trace_buffer_pool, b->trace_index))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass->id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  /* per-reassembly sequence number so trace records can be ordered */
  t->op_id = reass->trace_op_counter;
  ++reass->trace_op_counter;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  t->trace_range.first_bi = reass->first_bi;
  t->total_data_len = reass->data_len;
#if 0
  /* dead debug code: print the trace record to stdout */
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
320
321
322 always_inline void
323 ip4_reass_free (ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
324                 ip4_reass_t * reass)
325 {
326   clib_bihash_kv_16_8_t kv;
327   kv.key[0] = reass->key.as_u64[0];
328   kv.key[1] = reass->key.as_u64[1];
329   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
330   pool_put (rt->pool, reass);
331   --rt->reass_n;
332 }
333
334 always_inline void
335 ip4_reass_on_timeout (vlib_main_t * vm, ip4_reass_main_t * rm,
336                       ip4_reass_t * reass)
337 {
338   u32 range_bi = reass->first_bi;
339   vlib_buffer_t *range_b;
340   vnet_buffer_opaque_t *range_vnb;
341   u32 *to_free = NULL;
342   while (~0 != range_bi)
343     {
344       range_b = vlib_get_buffer (vm, range_bi);
345       range_vnb = vnet_buffer (range_b);
346       u32 bi = range_bi;
347       while (~0 != bi)
348         {
349           vec_add1 (to_free, bi);
350           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
351           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
352             {
353               bi = b->next_buffer;
354               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
355             }
356           else
357             {
358               bi = ~0;
359             }
360         }
361       range_bi = range_vnb->ip.reass.next_range_bi;
362     }
363   vlib_buffer_free (vm, to_free, vec_len (to_free));
364   vec_free (to_free);
365 }
366
/* Look up the reassembly for key kv->k, creating one if none exists.
 * Returns NULL and sets *do_handoff when the reassembly belongs to another
 * worker thread, or NULL (without handoff) when the per-thread limit is hit
 * or the hash insert fails. A stale (timed-out) reassembly is dropped and
 * replaced. On success kv->v is filled with the (thread, index) value. */
static ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, ip4_reass_main_t * rm,
                          ip4_reass_per_thread_t * rt, ip4_reass_kv_t * kv,
                          u8 * do_handoff)
{
  ip4_reass_t *reass = NULL;
  f64 now = vlib_time_now (rm->vlib_main);

  /* search overwrites kv->v in place on hit */
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          /* owned by another worker - caller must hand the packet off */
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      if (now > reass->last_heard + rm->timeout)
        {
          /* stale reassembly - free its buffers and start from scratch */
          ip4_reass_on_timeout (vm, rm, reass);
          ip4_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      /* per-thread reassembly limit reached - refuse to create a new one */
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      /* id made unique across threads by scaling the thread index */
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  if (clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 1))
    {
      /* hash insert failed - undo the pool allocation */
      ip4_reass_free (rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
430
/* Collapse all collected ranges into a single buffer chain forming the
 * reassembled datagram: trim per-range overlap bytes and inner IP headers,
 * splice the kept buffers together, rewrite the first buffer's IP header
 * (length, cleared fragment fields, recomputed checksum) and linearize.
 * On success stores the head buffer in *bi0, the next node in *next0,
 * IP4_ERROR_NONE in *error0 and frees the reassembly context. */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_feature)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  /* walk the list of ranges; each range is itself a buffer chain */
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* NOTE(review): invariant check only fails when BOTH conditions are
       * violated (&&); an || would reject either violation - confirm intent */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      /* drop this range's IP header plus any leading overlap bytes */
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      /* walk this range's buffer chain, trimming and splicing */
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              /* link this buffer after the previously kept one */
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  /* last kept buffer of this range - truncate to fit */
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              /* trailing trimmed data - free remaining buffers in chain */
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  /* rewrite the header of the now-complete datagram */
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass, reass->first_bi, FINALIZE, 0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (is_feature)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  /* smallest fragment seen approximates the path MTU for downstream use */
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
632
/* Insert buffer new_next_bi as a new range after prev_range_bi (or at the
 * head of the range list when prev_range_bi is ~0) and account its data
 * length into the reassembly total. */
always_inline ip4_reass_rc_t
ip4_reass_insert_range_in_chain (vlib_main_t * vm,
                                 ip4_reass_main_t * rm,
                                 ip4_reass_per_thread_t * rt,
                                 ip4_reass_t * reass,
                                 u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      /* splice after the given predecessor */
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      /* insert at the head of the range list */
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  /* NOTE(review): only fails when BOTH invariants are violated (&&);
   * confirm whether || was intended */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
666
/* Unlink range discard_bi from the range list (prev_range_bi is its
 * predecessor, or ~0 when it is the head), subtract its data length from
 * the reassembly total and free every buffer in its chain. */
always_inline ip4_reass_rc_t
ip4_reass_remove_range_from_chain (vlib_main_t * vm,
                                   vlib_node_runtime_t * node,
                                   ip4_reass_main_t * rm,
                                   ip4_reass_t * reass, u32 prev_range_bi,
                                   u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      /* discarding the head range */
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  /* NOTE(review): only fails when BOTH invariants are violated (&&);
   * confirm whether || was intended */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
  /* free the discarded range's buffer chain one buffer at a time */
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, discard_bi, RANGE_DISCARD,
                               0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
722
/* Process one fragment (buffer *bi0) against reassembly 'reass': insert it
 * into the sorted range list, resolving overlaps by shrinking or discarding
 * existing ranges. When the fragment is consumed, *bi0 is set to ~0; when it
 * completes the datagram, finalization is attempted and *bi0/*next0/*error0
 * describe the reassembled packet; a duplicate fragment is sent to drop. */
always_inline ip4_reass_rc_t
ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                  ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                  ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                  bool is_feature)
{
  ip4_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  reass->next_index = fvnb->ip.reass.next_index;        // store next_index before it's overwritten
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  /* the fragment initially contributes its full extent as a range */
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      /* last fragment tells us the total datagram payload length */
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
        ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                         *bi0);
      if (IP4_REASS_RC_OK != rc)
        {
          return rc;
        }
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      reass->fragments_n = 1;
      return IP4_REASS_RC_OK;
    }
  /* track the smallest fragment seen as an MTU estimate */
  reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
                                         fvnb->ip.reass.estimated_mtu);
  /* walk the sorted range list to find where this fragment belongs */
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
        {
          // this fragments starts after candidate range
          prev_range_bi = candidate_range_bi;
          candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
          if (candidate_vnb->ip.reass.range_last < fragment_last &&
              ~0 == candidate_range_bi)
            {
              // special case - this fragment falls beyond all known ranges
              rc =
                ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                 prev_range_bi, *bi0);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              consumed = 1;
              break;
            }
          continue;
        }
      if (fragment_last < candidate_vnb->ip.reass.range_first)
        {
          // this fragment ends before candidate range without any overlap
          rc =
            ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
                                             *bi0);
          if (IP4_REASS_RC_OK != rc)
            {
              return rc;
            }
          consumed = 1;
        }
      else
        {
          if (fragment_first >= candidate_vnb->ip.reass.range_first &&
              fragment_last <= candidate_vnb->ip.reass.range_last)
            {
              // this fragment is a (sub)part of existing range, ignore it
              if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_reass_add_trace (vm, node, rm, reass, *bi0,
                                       RANGE_OVERLAP, 0);
                }
              break;
            }
          int discard_candidate = 0;
          if (fragment_first < candidate_vnb->ip.reass.range_first)
            {
              /* partial overlap at the candidate's front */
              u32 overlap =
                fragment_last - candidate_vnb->ip.reass.range_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  /* shrink candidate's front, insert fragment before it */
                  candidate_vnb->ip.reass.range_first += overlap;
                  if (reass->data_len < overlap)
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  reass->data_len -= overlap;
                  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_reass_add_trace (vm, node, rm, reass,
                                           candidate_range_bi, RANGE_SHRINK,
                                           overlap);
                    }
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
              else
                {
                  /* fragment fully covers candidate - drop the candidate */
                  discard_candidate = 1;
                }
            }
          else if (fragment_last > candidate_vnb->ip.reass.range_last)
            {
              /* partial overlap at the candidate's back - shrink the
               * fragment's contribution instead */
              u32 overlap =
                candidate_vnb->ip.reass.range_last - fragment_first + 1;
              if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
                {
                  fvnb->ip.reass.range_first += overlap;
                  if (~0 != candidate_vnb->ip.reass.next_range_bi)
                    {
                      prev_range_bi = candidate_range_bi;
                      candidate_range_bi =
                        candidate_vnb->ip.reass.next_range_bi;
                      continue;
                    }
                  else
                    {
                      // special case - last range discarded
                      rc =
                        ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                         candidate_range_bi,
                                                         *bi0);
                      if (IP4_REASS_RC_OK != rc)
                        {
                          return rc;
                        }
                      consumed = 1;
                    }
                }
              else
                {
                  discard_candidate = 1;
                }
            }
          else
            {
              discard_candidate = 1;
            }
          if (discard_candidate)
            {
              u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
              // discard candidate range, probe next range
              rc =
                ip4_reass_remove_range_from_chain (vm, node, rm, reass,
                                                   prev_range_bi,
                                                   candidate_range_bi);
              if (IP4_REASS_RC_OK != rc)
                {
                  return rc;
                }
              if (~0 != next_range_bi)
                {
                  candidate_range_bi = next_range_bi;
                  continue;
                }
              else
                {
                  // special case - last range discarded
                  rc =
                    ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
                                                     prev_range_bi, *bi0);
                  if (IP4_REASS_RC_OK != rc)
                    {
                      return rc;
                    }
                  consumed = 1;
                }
            }
        }
      break;
    }
  ++reass->fragments_n;
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0);
        }
    }
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      /* all bytes present - attempt to build the reassembled datagram */
      return ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
                                 is_feature);
    }
  else
    {
      if (consumed)
        {
          *bi0 = ~0;
          if (reass->fragments_n > rm->max_reass_len)
            {
              rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
            }
        }
      else
        {
          /* not consumed and not finalized -> duplicate, drop it */
          *next0 = IP4_REASSEMBLY_NEXT_DROP;
          *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
        }
    }
  return rc;
}
957
/**
 * @brief Worker loop shared by the "ip4-reassembly" node and its
 * feature-arc variant.
 *
 * For each buffer in the frame: whole (unfragmented) packets pass straight
 * through; fragments are sanity-checked, matched to (or allocated) a
 * reassembly context keyed on fib-index/src/dst/fragment-id/protocol,
 * possibly handed off to the owning worker thread, and otherwise fed into
 * ip4_reass_update().
 *
 * @param vm vlib main
 * @param node this node's runtime
 * @param frame frame of buffer indices to process
 * @param is_feature true when running as an ip4-unicast feature
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm,
                       vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* guards this thread's reassembly pool/hash state for the whole frame */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (is_feature)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
              else
                {
                  /* non-feature path: continue to whatever next node was
                   * recorded in the buffer opaque by the caller */
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
            }
          else
            {
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              const u32 fragment_length =
                clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
              const u32 fragment_last = fragment_first + fragment_length - 1;
              /* sanity checks: offset/length must be coherent, the total must
               * fit a 16-bit IP length minus a minimal header, and non-last
               * fragments must carry at least 8 bytes of payload */
              if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0))) // 8 is minimum frag length per RFC 791
                {
                  next0 = IP4_REASSEMBLY_NEXT_DROP;
                  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
                }
              else
                {
                  ip4_reass_kv_t kv;
                  u8 do_handoff = 0;

                  /* flow key: fib index + source address in word 0,
                   * dst address + fragment id + protocol in word 1 */
                  kv.k.as_u64[0] =
                    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                                   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
                    (u64) ip0->src_address.as_u32 << 32;
                  kv.k.as_u64[1] =
                    (u64) ip0->dst_address.as_u32 |
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

                  ip4_reass_t *reass =
                    ip4_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

                  if (PREDICT_FALSE (do_handoff))
                    {
                      /* context is owned by another worker - hand the buffer
                       * over to that thread's reassembly node */
                      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                      if (is_feature)
                        vnet_buffer (b0)->ip.
                          reass.owner_feature_thread_index =
                          kv.v.thread_index;
                      else
                        vnet_buffer (b0)->ip.reass.owner_thread_index =
                          kv.v.thread_index;
                    }
                  else if (reass)
                    {
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
                               &error0, is_feature))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
                        case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                                       1);
                          /* on any update failure: drop all buffered
                           * fragments and release the context */
                          ip4_reass_on_timeout (vm, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_NO_BUF:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_NO_BUF,
                                                       1);
                          ip4_reass_on_timeout (vm, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_INTERNAL_ERROR,
                                                       1);
                          ip4_reass_on_timeout (vm, rm, reass);
                          ip4_reass_free (rm, rt, reass);
                          goto next_packet;
                          break;
                        }
                    }
                  else
                    {
                      /* could not find nor create a context */
                      next0 = IP4_REASSEMBLY_NEXT_DROP;
                      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
                    }
                }

              b0->error = node->errors[error0];
            }

          /* bi0 == ~0 means ip4_reass_update() consumed the fragment
           * (it is now buffered inside the reassembly context) */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (is_feature && IP4_ERROR_NONE == error0)
                {
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1108
/* error strings indexed by ip4 error code, shared by the reassembly nodes */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1114
/**
 * @brief Node function for the standalone "ip4-reassembly" graph node.
 */
VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ );
}
1120
/* *INDENT-OFF* */
/* registration of the standalone reassembly node */
VLIB_REGISTER_NODE (ip4_reass_node) = {
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
1138
/**
 * @brief Node function for the feature-arc "ip4-reassembly-feature" node.
 */
VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
                                       vlib_node_runtime_t * node,
                                       vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ );
}
1145
/* *INDENT-OFF* */
/* registration of the feature-arc reassembly node */
VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
1162
/* *INDENT-OFF* */
/* hook the feature node into the ip4-unicast arc, before ip4-lookup */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1171
1172 #ifndef CLIB_MARCH_VARIANT
1173 always_inline u32
1174 ip4_reass_get_nbuckets ()
1175 {
1176   ip4_reass_main_t *rm = &ip4_reass_main;
1177   u32 nbuckets;
1178   u8 i;
1179
1180   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1181
1182   for (i = 0; i < 31; i++)
1183     if ((1 << i) >= nbuckets)
1184       break;
1185   nbuckets = 1 << i;
1186
1187   return nbuckets;
1188 }
1189 #endif /* CLIB_MARCH_VARIANT */
1190
/* events understood by the expiry-walk process node */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_reass_event_t;
1195
/* context passed to ip4_rehash_cb while copying entries to a new hash */
typedef struct
{
  int failure;                  /* set to 1 if any insert failed */
  clib_bihash_16_8_t *new_hash; /* destination hash table */
} ip4_rehash_cb_ctx;
1201
1202 #ifndef CLIB_MARCH_VARIANT
1203 static void
1204 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1205 {
1206   ip4_rehash_cb_ctx *ctx = _ctx;
1207   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1208     {
1209       ctx->failure = 1;
1210     }
1211 }
1212
1213 static void
1214 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1215                       u32 max_reassembly_length, u32 expire_walk_interval_ms)
1216 {
1217   ip4_reass_main.timeout_ms = timeout_ms;
1218   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1219   ip4_reass_main.max_reass_n = max_reassemblies;
1220   ip4_reass_main.max_reass_len = max_reassembly_length;
1221   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1222 }
1223
/**
 * @brief Apply new reassembly configuration (API/CLI entry point).
 *
 * Stores the parameters, notifies the expiry-walk process of the config
 * change, and - if the new maximum requires more hash buckets - rehashes
 * all existing entries into a bigger table.
 *
 * @return 0 on success, -1 if rehashing into the bigger table failed
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
               u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
                        expire_walk_interval_ms);
  vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  /* the table only ever grows - shrinking is never attempted */
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy every existing entry into the new table before swapping */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* NOTE(review): params were already updated above, so a failed
           * rehash leaves the config and the (old, small) table
           * inconsistent - confirm whether params should be rolled back */
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
        }
    }
  return 0;
}
1260
1261 vnet_api_error_t
1262 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1263                u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1264 {
1265   *timeout_ms = ip4_reass_main.timeout_ms;
1266   *max_reassemblies = ip4_reass_main.max_reass_n;
1267   *max_reassembly_length = ip4_reass_main.max_reass_len;
1268   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1269   return 0;
1270 }
1271
/**
 * @brief One-time initialization of the ip4 reassembly subsystem.
 *
 * Allocates per-thread state (spinlock and reassembly pool), resolves the
 * node indices used at runtime, applies the default parameters, sizes the
 * flow hash and sets up the handoff frame queues.
 *
 * @return 0 on success, error otherwise
 */
static clib_error_t *
ip4_reass_init_function (vlib_main_t * vm)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  /* one per-thread slot per worker, plus one for the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    /* NOTE(review): rm->max_reass_n is only set by ip4_reass_set_params
     * further below, so at this point it still holds the zero-initialized
     * default and this preallocation is a no-op - confirm the ordering */
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_reass_expire_node_idx = node->index;

  ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                        IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                        IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                        IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used to hand buffers to the owning worker threads */
  rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);


  return error;
}
1314
1315 VLIB_INIT_FUNCTION (ip4_reass_init_function);
1316 #endif /* CLIB_MARCH_VARIANT */
1317
/**
 * @brief Process node that periodically reaps expired reassembly contexts.
 *
 * Wakes up every expire_walk_interval_ms (or earlier on a config-change
 * event) and, for each thread's pool, drops and frees every context whose
 * last_heard timestamp is older than the configured timeout.
 */
static uword
ip4_reass_walk_expired (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* interval may have changed - simply fall through and re-arm
           * the timer with the new value on the next loop iteration */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect indexes first - freeing elements while iterating the
           * pool would invalidate the iteration */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_reass_on_timeout (vm, rm, reass);
            ip4_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1389
/* *INDENT-OFF* */
/* registration of the expiry-walk process node */
VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    /* NOTE(review): a process node does not trace packets - this
     * .format_trace looks copy-pasted from the data-path nodes; confirm */
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1401
/**
 * @brief Format the reassembly flow key (fib/src/dst/frag-id/proto)
 * for display.
 */
static u8 *
format_ip4_reass_key (u8 * s, va_list * args)
{
  ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
  s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
              key->xx_id, format_ip4_address, &key->src, format_ip4_address,
              &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}
1411
1412 static u8 *
1413 format_ip4_reass (u8 * s, va_list * args)
1414 {
1415   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1416   ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
1417
1418   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1419               "last_packet_octet: %u, trace_op_counter: %u\n",
1420               reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
1421               reass->data_len, reass->last_packet_octet,
1422               reass->trace_op_counter);
1423   u32 bi = reass->first_bi;
1424   u32 counter = 0;
1425   while (~0 != bi)
1426     {
1427       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1428       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1429       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1430                   "fragment[%u, %u]\n",
1431                   counter, vnb->ip.reass.range_first,
1432                   vnb->ip.reass.range_last, bi,
1433                   ip4_reass_buffer_get_data_offset (b),
1434                   ip4_reass_buffer_get_data_len (b),
1435                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1436       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1437         {
1438           bi = b->next_buffer;
1439         }
1440       else
1441         {
1442           bi = ~0;
1443         }
1444     }
1445   return s;
1446 }
1447
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the active reassembly count summed over all threads; with the
 * "details" keyword, also dumps every in-progress reassembly context.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  /* workers plus the main thread */
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      /* hold the per-thread lock while reading its pool/counters */
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1491
/* *INDENT-OFF* */
/* CLI command registration for the status dump above */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1499
#ifndef CLIB_MARCH_VARIANT
/**
 * @brief Enable or disable the reassembly feature on an interface.
 *
 * @param sw_if_index interface to (un)configure
 * @param enable_disable non-zero to enable, zero to disable
 * @return result of vnet_feature_enable_disable()
 */
vnet_api_error_t
ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-reassembly-feature", sw_if_index,
                                      enable_disable, 0, 0);
}
#endif /* CLIB_MARCH_VARIANT */
1509
1510
/* errors specific to the reassembly handoff nodes */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


/* error enum generated from the list above */
typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;
1522
/* human-readable counterparts of the handoff error enum */
static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1528
/* per-packet trace record for the handoff nodes */
typedef struct
{
  u32 next_worker_index;        /* thread the buffer is handed off to */
} ip4_reassembly_handoff_trace_t;
1533
1534 static u8 *
1535 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1536 {
1537   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1538   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1539   ip4_reassembly_handoff_trace_t *t =
1540     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1541
1542   s =
1543     format (s, "ip4-reassembly-handoff: next-worker %d",
1544             t->next_worker_index);
1545
1546   return s;
1547 }
1548
/**
 * @brief Hand buffers off to the worker thread owning their reassembly
 * context.
 *
 * Reads the destination thread index (stored in the buffer opaque by the
 * reassembly node) for every buffer in the frame, optionally records a
 * trace entry, then enqueues the whole frame to the appropriate frame
 * queue.  Buffers that could not be enqueued are counted as congestion
 * drops.
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* feature and non-feature paths use separate frame queues */
  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      /* destination thread was stashed in the buffer opaque */
      ti[0] =
        (is_feature) ? vnet_buffer (b[0])->ip.
        reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
        reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* account for any buffers that did not make it onto a frame queue */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1600
/**
 * @brief Node function for the non-feature handoff node.
 */
VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             false /* is_feature */ );
}
1608
1609
/* *INDENT-OFF* */
/* registration of the non-feature handoff node */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1625
1626
/* *INDENT-OFF* */
/**
 * @brief Node function for the feature-arc handoff node.
 */
VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1637
1638
/* *INDENT-OFF* */
/* registration of the feature-arc handoff node */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1654
1655 /*
1656  * fd.io coding-style-patch-verification: ON
1657  *
1658  * Local Variables:
1659  * eval: (c-set-style "gnu")
1660  * End:
1661  */