682cad965afcd672876524345e6c90712502126c
[vpp.git] / src / vnet / ip / ip4_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28 #include <stddef.h>
29
/* Default configuration knobs for the IPv4 reassembly feature. */
#define MSEC_PER_SEC 1000
/* how long to wait for missing fragments before expiring a reassembly */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* upper bound on concurrently tracked reassemblies (per thread pool) */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* maximum number of fragments accepted per reassembly */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* bihash sizing: target load factor for the lookup hash table */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)
36
/*
 * Compile-time debug aid: when IP4_REASS_DEBUG_BUFFERS is non-zero,
 * IP4_REASS_DEBUG_BUFFER(bi, what) prints the buffer index and every
 * chained buffer index (following VLIB_BUFFER_NEXT_PRESENT links) to
 * stdout.  Compiles to nothing otherwise.  Relies on a `vm` variable
 * being in scope at the expansion site.
 */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
58
/** Internal return codes of the reassembly helpers. */
typedef enum
{
  IP4_REASS_RC_OK,
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  IP4_REASS_RC_INTERNAL_ERROR,
  IP4_REASS_RC_NO_BUF,
  IP4_REASS_RC_HANDOFF,
} ip4_reass_rc_t;
67
/**
 * @brief 16-byte lookup key identifying one reassembly flow.
 *
 * The anonymous struct overlays as_u64[2] so the key can be copied into a
 * clib_bihash_kv_16_8_t key verbatim (see ip4_reass_free).
 */
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_reass_key_t;
84
/**
 * @brief Hash table value: where the reassembly context lives.
 *
 * reass_index is an index into the pool of the thread identified by
 * memory_owner_thread_index (see ip4_reass_find_or_create).
 */
typedef union
{
  struct
  {
    u32 reass_index;
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_reass_val_t;
94
/**
 * @brief Combined key/value pair, bit-compatible with clib_bihash_kv_16_8_t
 * so it can be passed directly to the bihash search/add APIs via a cast.
 */
typedef union
{
  struct
  {
    ip4_reass_key_t k;
    ip4_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_reass_kv_t;
104
105 always_inline u32
106 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
107 {
108   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
109   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
110 }
111
112 always_inline u16
113 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
114 {
115   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
116   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
117     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
118 }
119
/**
 * @brief Reassembly context tracking a single in-progress IPv4 reassembly.
 *
 * Received data ranges form a linked list of buffer chains threaded through
 * vnet_buffer (b)->ip.reass.next_range_bi, starting at first_bi.
 */
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_reass_t;
150
/** Per-worker-thread reassembly state. */
typedef struct
{
  // pool of reassembly contexts
  ip4_reass_t *pool;
  // number of contexts currently allocated from the pool
  u32 reass_n;
  // monotonically increasing counter used to build unique reassembly ids
  u32 id_counter;
  // protects this thread's pool/counters from cross-thread access
  clib_spinlock_t lock;
} ip4_reass_per_thread_t;
159
/** Global state for the IPv4 reassembly feature. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout_ms converted to seconds for comparison against vlib_time_now()
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  // maps ip4_reass_key_t -> ip4_reass_val_t (16-byte key, 8-byte value)
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;
} ip4_reass_main_t;
187
188 extern ip4_reass_main_t ip4_reass_main;
189
190 #ifndef CLIB_MARCH_VARIANT
191 ip4_reass_main_t ip4_reass_main;
192 #endif /* CLIB_MARCH_VARIANT */
193
/** Next-node indices for the reassembly graph nodes. */
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,
  IP4_REASSEMBLY_NEXT_DROP,
  IP4_REASSEMBLY_NEXT_HANDOFF,
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;
201
/** Operations recorded in packet traces (see format_ip4_reass_trace). */
typedef enum
{
  RANGE_NEW,
  RANGE_SHRINK,
  RANGE_DISCARD,
  RANGE_OVERLAP,
  FINALIZE,
  HANDOFF,
} ip4_reass_trace_operation_e;
211
/** Snapshot of a single data range, captured for tracing. */
typedef struct
{
  u16 range_first;
  u16 range_last;
  // buffer index heading this range's chain
  u32 range_bi;
  i32 data_offset;
  u32 data_len;
  u32 first_bi;
} ip4_reass_range_trace_t;
221
/** Per-packet trace record emitted by ip4_reass_add_trace. */
typedef struct
{
  ip4_reass_trace_operation_e action;
  u32 reass_id;
  ip4_reass_range_trace_t trace_range;
  // bytes trimmed off a range by RANGE_SHRINK
  u32 size_diff;
  u32 op_id;
  u32 thread_id;
  // destination thread for HANDOFF actions
  u32 thread_id_to;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
} ip4_reass_trace_t;
235
236 extern vlib_node_registration_t ip4_reass_node;
237 extern vlib_node_registration_t ip4_reass_node_feature;
238
239 static void
240 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
241                          ip4_reass_range_trace_t * trace)
242 {
243   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
244   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
245   trace->range_first = vnb->ip.reass.range_first;
246   trace->range_last = vnb->ip.reass.range_last;
247   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
248   trace->data_len = ip4_reass_buffer_get_data_len (b);
249   trace->range_bi = bi;
250 }
251
252 static u8 *
253 format_ip4_reass_range_trace (u8 * s, va_list * args)
254 {
255   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
256   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
257               trace->range_last, trace->data_offset, trace->data_len,
258               trace->range_bi);
259   return s;
260 }
261
/**
 * @brief Format an ip4_reass_trace_t record for `show trace` output.
 *
 * Prints a common header (reass id, op id, first bi, lengths) unless the
 * record has no reassembly id (~0, e.g. pure handoffs), then an
 * action-specific line.
 */
static u8 *
format_ip4_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
  u32 indent = 0;
  if (~0 != t->reass_id)
    {
      s = format (s, "reass id: %u, op id: %u, ", t->reass_id, t->op_id);
      /* remember the indent so action lines align under the header */
      indent = format_get_indent (s);
      s =
        format (s,
                "first bi: %u, data len: %u, ip/fragment[%u, %u]",
                t->trace_range.first_bi, t->total_data_len, t->fragment_first,
                t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range,
                  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
        format (s, "handoff from thread #%u to thread #%u", t->thread_id,
                t->thread_id_to);
      break;
    }
  return s;
}
309
/**
 * @brief Record a reassembly trace event on buffer @a bi.
 *
 * @param reass_id    reassembly id (~0 when there is no context, e.g. handoff)
 * @param op_id       per-reassembly operation sequence number
 * @param bi          buffer the event refers to (range details taken from it)
 * @param first_bi    head buffer index of the reassembly chain
 * @param data_len    total data collected so far
 * @param action      which operation is being traced
 * @param size_diff   bytes trimmed (RANGE_SHRINK), 0 otherwise
 * @param thread_id_to destination thread for HANDOFF, ~0 otherwise
 */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, u32 reass_id, u32 op_id,
                     u32 bi, u32 first_bi, u32 data_len,
                     ip4_reass_trace_operation_e action, u32 size_diff,
                     u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass_id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->op_id = op_id;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  t->trace_range.first_bi = first_bi;
  t->total_data_len = data_len;
#if 0
  /* debug aid: dump each trace record to stdout as it is created */
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
339
340 always_inline void
341 ip4_reass_free_ctx (ip4_reass_per_thread_t * rt, ip4_reass_t * reass)
342 {
343   pool_put (rt->pool, reass);
344   --rt->reass_n;
345 }
346
347 always_inline void
348 ip4_reass_free (vlib_main_t * vm, ip4_reass_main_t * rm,
349                 ip4_reass_per_thread_t * rt, ip4_reass_t * reass)
350 {
351   clib_bihash_kv_16_8_t kv;
352   kv.key[0] = reass->key.as_u64[0];
353   kv.key[1] = reass->key.as_u64[1];
354   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
355   return ip4_reass_free_ctx (rt, reass);
356 }
357
/**
 * @brief Dispose of every buffer held by a reassembly context.
 *
 * Walks the range list (next_range_bi) and, within each range, the buffer
 * chain (next_buffer), collecting all buffer indices.  If the context has a
 * custom error_next_index the buffers are enqueued to that node; otherwise
 * they are freed outright.  Does not free the context itself.
 */
always_inline void
ip4_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      /* collect every buffer in this range's chain */
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              /* unlink so later frees treat each buffer individually */
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      /* enqueue collected buffers to the custom error node, one frame at
       * a time until the vector is drained */
      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (vec_len (to_free) > 0 && n_left_to_next > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  to_next[0] = bi;
                  to_next += 1;
                  n_left_to_next -= 1;
                }
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
}
418
/**
 * @brief Look up the reassembly context for key @a kv, creating it if absent.
 *
 * On lookup hit owned by another thread, sets *do_handoff and returns the
 * foreign context without touching it.  A hit past the timeout is dropped
 * and recreated.  Returns NULL when the per-thread context limit is reached
 * or the hash insert fails for a reason other than a benign race.
 *
 * @param kv        in: key to search; out: value fields filled on create
 * @param do_handoff out: set to 1 when the context belongs to another thread
 */
static ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                          ip4_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      /* hit - context lives in the pool of the memory-owner thread */
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);
      if (vm->thread_index != reass->memory_owner_thread_index)
        {
          *do_handoff = 1;
          return reass;
        }

      /* stale context - drop its buffers and start over */
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_reass_drop_all (vm, node, rm, reass);
          ip4_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      /* allocate and initialize a fresh context owned by this thread */
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      reass->next_index = ~0;
      reass->error_next_index = ~0;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  /* is_add == 2: add only if the key is not already present */
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
497
/**
 * @brief Stitch all collected ranges into a single packet and release the
 * reassembly context.
 *
 * Walks the range list; for each range trims the IP header and any
 * overlapping front/back bytes, links the surviving buffers into one chain,
 * then rewrites the first buffer's IP header (length, cleared fragment
 * fields, recomputed checksum) and linearizes the chain.
 *
 * @param bi0    out: buffer index of the reassembled packet
 * @param next0  out: next node (IP4_REASSEMBLY_NEXT_INPUT, or the custom
 *               app's stored next_index)
 * @param error0 out: IP4_ERROR_NONE on success
 * @return IP4_REASS_RC_OK, IP4_REASS_RC_NO_BUF if linearization fails, or
 *         IP4_REASS_RC_INTERNAL_ERROR on inconsistent internal state
 */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* sanity: range must start at/after its fragment and end beyond it */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      /* bytes to strip before the payload: IP header + overlap offset */
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      /* walk this range's buffer chain, trimming and relinking */
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              /* append this buffer to the packet chain being built */
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  /* truncate: rest of this buffer is trim_end bytes */
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              /* no payload left to keep - free trailing buffers */
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  /* total_length excludes the first buffer, per vlib chain convention */
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  /* the packet is whole again: clear fragmentation fields, fix length
   * and checksum */
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass->id, reass->trace_op_counter,
                           reass->first_bi, reass->first_bi, reass->data_len,
                           FINALIZE, 0, ~0);
      ++reass->trace_op_counter;
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (vm, rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
702
/**
 * @brief Link buffer @a new_next_bi into the reassembly's range list.
 *
 * Inserts after @a prev_range_bi, or at the head when prev_range_bi is ~0,
 * and accounts its data length into reass->data_len.
 *
 * @return IP4_REASS_RC_OK, or IP4_REASS_RC_INTERNAL_ERROR when the
 *         buffer's range/fragment bounds are inconsistent
 */
always_inline ip4_reass_rc_t
ip4_reass_insert_range_in_chain (vlib_main_t * vm,
                                 ip4_reass_main_t * rm,
                                 ip4_reass_per_thread_t * rt,
                                 ip4_reass_t * reass,
                                 u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      /* splice between prev and prev's successor */
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      /* insert at head of list */
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
736
/**
 * @brief Unlink range @a discard_bi from the range list and free its buffers.
 *
 * Subtracts the range's data length from reass->data_len, then frees every
 * buffer in the range's chain, emitting RANGE_DISCARD trace records for
 * traced buffers along the way.
 *
 * @param prev_range_bi predecessor range, or ~0 if discard_bi is the head
 * @return IP4_REASS_RC_OK, or IP4_REASS_RC_INTERNAL_ERROR when the list
 *         linkage or range bounds are inconsistent
 */
always_inline ip4_reass_rc_t
ip4_reass_remove_range_from_chain (vlib_main_t * vm,
                                   vlib_node_runtime_t * node,
                                   ip4_reass_main_t * rm,
                                   ip4_reass_t * reass, u32 prev_range_bi,
                                   u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      /* discarding the head range */
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
  /* free the whole buffer chain of this range */
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass->id,
                               reass->trace_op_counter, discard_bi,
                               reass->first_bi, reass->data_len,
                               RANGE_DISCARD, 0, ~0);
          ++reass->trace_op_counter;
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
795
796 always_inline ip4_reass_rc_t
797 ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
798                   ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
799                   ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
800                   bool is_custom_app, u32 * handoff_thread_idx)
801 {
802   ip4_reass_rc_t rc = IP4_REASS_RC_OK;
803   int consumed = 0;
804   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
805   ip4_header_t *fip = vlib_buffer_get_current (fb);
806   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
807   if (is_custom_app)
808     {
809       // store (error_)next_index before it's overwritten
810       reass->next_index = fvnb->ip.reass.next_index;
811       reass->error_next_index = fvnb->ip.reass.error_next_index;
812     }
813   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
814   const u32 fragment_length =
815     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
816   const u32 fragment_last = fragment_first + fragment_length - 1;
817   fvnb->ip.reass.fragment_first = fragment_first;
818   fvnb->ip.reass.fragment_last = fragment_last;
819   int more_fragments = ip4_get_fragment_more (fip);
820   u32 candidate_range_bi = reass->first_bi;
821   u32 prev_range_bi = ~0;
822   fvnb->ip.reass.range_first = fragment_first;
823   fvnb->ip.reass.range_last = fragment_last;
824   fvnb->ip.reass.next_range_bi = ~0;
825   if (!more_fragments)
826     {
827       reass->last_packet_octet = fragment_last;
828     }
829   if (~0 == reass->first_bi)
830     {
831       // starting a new reassembly
832       rc =
833         ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
834                                          *bi0);
835       if (IP4_REASS_RC_OK != rc)
836         {
837           return rc;
838         }
839       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
840         {
841           ip4_reass_add_trace (vm, node, rm, reass->id,
842                                reass->trace_op_counter, *bi0, reass->first_bi,
843                                reass->data_len, RANGE_NEW, 0, ~0);
844           ++reass->trace_op_counter;
845         }
846       *bi0 = ~0;
847       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
848       reass->fragments_n = 1;
849       return IP4_REASS_RC_OK;
850     }
851   reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
852                                          fvnb->ip.reass.estimated_mtu);
853   while (~0 != candidate_range_bi)
854     {
855       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
856       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
857       if (fragment_first > candidate_vnb->ip.reass.range_last)
858         {
859           // this fragments starts after candidate range
860           prev_range_bi = candidate_range_bi;
861           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
862           if (candidate_vnb->ip.reass.range_last < fragment_last &&
863               ~0 == candidate_range_bi)
864             {
865               // special case - this fragment falls beyond all known ranges
866               rc =
867                 ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
868                                                  prev_range_bi, *bi0);
869               if (IP4_REASS_RC_OK != rc)
870                 {
871                   return rc;
872                 }
873               consumed = 1;
874               break;
875             }
876           continue;
877         }
878       if (fragment_last < candidate_vnb->ip.reass.range_first)
879         {
880           // this fragment ends before candidate range without any overlap
881           rc =
882             ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
883                                              *bi0);
884           if (IP4_REASS_RC_OK != rc)
885             {
886               return rc;
887             }
888           consumed = 1;
889         }
890       else
891         {
892           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
893               fragment_last <= candidate_vnb->ip.reass.range_last)
894             {
895               // this fragment is a (sub)part of existing range, ignore it
896               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
897                 {
898                   ip4_reass_add_trace (vm, node, rm, reass->id,
899                                        reass->trace_op_counter, *bi0,
900                                        reass->first_bi, reass->data_len,
901                                        RANGE_OVERLAP, 0, ~0);
902                   ++reass->trace_op_counter;
903                 }
904               break;
905             }
906           int discard_candidate = 0;
907           if (fragment_first < candidate_vnb->ip.reass.range_first)
908             {
909               u32 overlap =
910                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
911               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
912                 {
913                   candidate_vnb->ip.reass.range_first += overlap;
914                   if (reass->data_len < overlap)
915                     {
916                       return IP4_REASS_RC_INTERNAL_ERROR;
917                     }
918                   reass->data_len -= overlap;
919                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
920                     {
921                       ip4_reass_add_trace (vm, node, rm, reass->id,
922                                            reass->trace_op_counter,
923                                            candidate_range_bi,
924                                            reass->first_bi, reass->data_len,
925                                            RANGE_SHRINK, 0, ~0);
926                       ++reass->trace_op_counter;
927                     }
928                   rc =
929                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
930                                                      prev_range_bi, *bi0);
931                   if (IP4_REASS_RC_OK != rc)
932                     {
933                       return rc;
934                     }
935                   consumed = 1;
936                 }
937               else
938                 {
939                   discard_candidate = 1;
940                 }
941             }
942           else if (fragment_last > candidate_vnb->ip.reass.range_last)
943             {
944               u32 overlap =
945                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
946               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
947                 {
948                   fvnb->ip.reass.range_first += overlap;
949                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
950                     {
951                       prev_range_bi = candidate_range_bi;
952                       candidate_range_bi =
953                         candidate_vnb->ip.reass.next_range_bi;
954                       continue;
955                     }
956                   else
957                     {
958                       // special case - last range discarded
959                       rc =
960                         ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
961                                                          candidate_range_bi,
962                                                          *bi0);
963                       if (IP4_REASS_RC_OK != rc)
964                         {
965                           return rc;
966                         }
967                       consumed = 1;
968                     }
969                 }
970               else
971                 {
972                   discard_candidate = 1;
973                 }
974             }
975           else
976             {
977               discard_candidate = 1;
978             }
979           if (discard_candidate)
980             {
981               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
982               // discard candidate range, probe next range
983               rc =
984                 ip4_reass_remove_range_from_chain (vm, node, rm, reass,
985                                                    prev_range_bi,
986                                                    candidate_range_bi);
987               if (IP4_REASS_RC_OK != rc)
988                 {
989                   return rc;
990                 }
991               if (~0 != next_range_bi)
992                 {
993                   candidate_range_bi = next_range_bi;
994                   continue;
995                 }
996               else
997                 {
998                   // special case - last range discarded
999                   rc =
1000                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
1001                                                      prev_range_bi, *bi0);
1002                   if (IP4_REASS_RC_OK != rc)
1003                     {
1004                       return rc;
1005                     }
1006                   consumed = 1;
1007                 }
1008             }
1009         }
1010       break;
1011     }
1012   ++reass->fragments_n;
1013   if (consumed)
1014     {
1015       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1016         {
1017           ip4_reass_add_trace (vm, node, rm, reass->id,
1018                                reass->trace_op_counter, *bi0, reass->first_bi,
1019                                reass->data_len, RANGE_NEW, 0, ~0);
1020           ++reass->trace_op_counter;
1021         }
1022     }
1023   if (~0 != reass->last_packet_octet &&
1024       reass->data_len == reass->last_packet_octet + 1)
1025     {
1026       *handoff_thread_idx = reass->sendout_thread_index;
1027       rc =
1028         ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1029                             is_custom_app);
1030       if (IP4_REASS_RC_OK == rc
1031           && reass->memory_owner_thread_index != reass->sendout_thread_index)
1032         {
1033           rc = IP4_REASS_RC_HANDOFF;
1034         }
1035     }
1036   else
1037     {
1038       if (consumed)
1039         {
1040           *bi0 = ~0;
1041           if (reass->fragments_n > rm->max_reass_len)
1042             {
1043               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1044             }
1045         }
1046       else
1047         {
1048           *next0 = IP4_REASSEMBLY_NEXT_DROP;
1049           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1050         }
1051     }
1052   return rc;
1053 }
1054
/**
 * @brief Shared worker for the ip4 reassembly graph nodes.
 *
 * Processes the frame one buffer at a time while holding this thread's
 * per-thread reassembly spinlock.  Unfragmented packets pass straight
 * through; fragments are keyed into the reassembly hash and handed to
 * ip4_reass_update().  A buffer may be consumed into a reassembly
 * (bi0 becomes ~0 and is not enqueued), handed off to the owning worker,
 * dropped, or forwarded once reassembly completes.
 *
 * @param vm            vlib main
 * @param node          this node's runtime
 * @param frame         input frame of buffer indices
 * @param is_feature    true when running as the ip4-unicast feature node;
 *                      next node is then chosen via vnet_feature_next()
 * @param is_custom_app true when the post-reassembly next index is taken
 *                      from the buffer metadata instead of
 *                      IP4_REASSEMBLY_NEXT_INPUT
 * @return frame->n_vectors (all input buffers are accounted for)
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
		       vlib_frame_t * frame, bool is_feature,
		       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serialize against the expire-walk process and CLI touching this
   * thread's reassembly pool */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0;
	  u32 error0 = IP4_ERROR_NONE;

	  bi0 = from[0];
	  b0 = vlib_get_buffer (vm, bi0);

	  ip4_header_t *ip0 = vlib_buffer_get_current (b0);
	  if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
	    {
	      // this is a whole packet - no fragmentation
	      if (!is_custom_app)
		{
		  next0 = IP4_REASSEMBLY_NEXT_INPUT;
		}
	      else
		{
		  /* custom app supplies its own next index in buffer
		   * metadata */
		  next0 = vnet_buffer (b0)->ip.reass.next_index;
		}
	    }
	  else
	    {
	      const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
	      const u32 fragment_length =
		clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
	      const u32 fragment_last = fragment_first + fragment_length - 1;
	      if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))	// 8 is minimum frag length per RFC 791
		{
		  next0 = IP4_REASSEMBLY_NEXT_DROP;
		  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
		}
	      else
		{
		  ip4_reass_kv_t kv;
		  u8 do_handoff = 0;

		  /* reassembly key: (fib index, src) | (dst, frag id,
		   * protocol) */
		  kv.k.as_u64[0] =
		    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
				   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
		    (u64) ip0->src_address.as_u32 << 32;
		  kv.k.as_u64[1] =
		    (u64) ip0->dst_address.as_u32 |
		    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

		  ip4_reass_t *reass =
		    ip4_reass_find_or_create (vm, node, rm, rt, &kv,
					      &do_handoff);
		  if (reass)
		    {
		      const u32 fragment_first =
			ip4_get_fragment_offset_bytes (ip0);
		      /* the thread that saw the first fragment sends the
		       * reassembled packet out */
		      if (0 == fragment_first)
			{
			  reass->sendout_thread_index = vm->thread_index;
			}
		    }
		  if (PREDICT_FALSE (do_handoff))
		    {
		      /* reassembly is owned by another worker - hand the
		       * buffer off to it */
		      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
		      if (is_feature)
			vnet_buffer (b0)->ip.
			  reass.owner_feature_thread_index =
			  kv.v.memory_owner_thread_index;
		      else
			vnet_buffer (b0)->ip.reass.owner_thread_index =
			  kv.v.memory_owner_thread_index;
		    }
		  else if (reass)
		    {
		      u32 handoff_thread_idx;
		      switch (ip4_reass_update
			      (vm, node, rm, rt, reass, &bi0, &next0,
			       &error0, is_custom_app, &handoff_thread_idx))
			{
			case IP4_REASS_RC_OK:
			  /* nothing to do here */
			  break;
			case IP4_REASS_RC_HANDOFF:
			  /* reassembly finished but must be sent out by a
			   * different thread */
			  next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
			  /* bi0 may have changed during finalize - refetch */
			  b0 = vlib_get_buffer (vm, bi0);
			  if (is_feature)
			    vnet_buffer (b0)->ip.
			      reass.owner_feature_thread_index =
			      handoff_thread_idx;
			  else
			    vnet_buffer (b0)->ip.reass.owner_thread_index =
			      handoff_thread_idx;
			  break;
			case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
			  vlib_node_increment_counter (vm, node->node_index,
						       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
						       1);
			  ip4_reass_drop_all (vm, node, rm, reass);
			  ip4_reass_free (vm, rm, rt, reass);
			  goto next_packet;
			  break;
			case IP4_REASS_RC_NO_BUF:
			  vlib_node_increment_counter (vm, node->node_index,
						       IP4_ERROR_REASS_NO_BUF,
						       1);
			  ip4_reass_drop_all (vm, node, rm, reass);
			  ip4_reass_free (vm, rm, rt, reass);
			  goto next_packet;
			  break;
			case IP4_REASS_RC_INTERNAL_ERROR:
			  /* drop everything and start with a clean slate */
			  vlib_node_increment_counter (vm, node->node_index,
						       IP4_ERROR_REASS_INTERNAL_ERROR,
						       1);
			  ip4_reass_drop_all (vm, node, rm, reass);
			  ip4_reass_free (vm, rm, rt, reass);
			  goto next_packet;
			  break;
			}
		    }
		  else
		    {
		      /* could not find nor create a reassembly context */
		      next0 = IP4_REASSEMBLY_NEXT_DROP;
		      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
		    }
		}

	      b0->error = node->errors[error0];
	    }

	  /* bi0 == ~0 means the fragment was consumed into a pending
	   * reassembly and must not be enqueued */
	  if (bi0 != ~0)
	    {
	      to_next[0] = bi0;
	      to_next += 1;
	      n_left_to_next -= 1;
	      if (next0 == IP4_REASSEMBLY_NEXT_HANDOFF)
		{
		  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
		    {
		      if (is_feature)
			ip4_reass_add_trace (vm, node, rm, ~0,
					     ~0,
					     bi0, ~0, ~0, HANDOFF, 0,
					     vnet_buffer (b0)->ip.
					     reass.owner_feature_thread_index);
		      else
			ip4_reass_add_trace (vm, node, rm, ~0, ~0, bi0,
					     ~0, ~0, HANDOFF, 0,
					     vnet_buffer (b0)->ip.
					     reass.owner_thread_index);
		    }
		}
	      else if (is_feature && IP4_ERROR_NONE == error0)
		{
		  b0 = vlib_get_buffer (vm, bi0);
		  vnet_feature_next (&next0, b0);
		}
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       bi0, next0);
	      IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
	    }

	next_packet:
	  from += 1;
	  n_left_from -= 1;
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1244
/* Error counter strings for the reassembly nodes, generated from the
 * master ip4 error list. */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1250
/* Entry point for the "ip4-reassembly" node (non-feature, non-custom-app
 * variant); all work happens in the shared inline. */
VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
			       vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ ,
				false /* is_custom_app */ );
}
1257
/* *INDENT-OFF* */
/* Graph node registration for the standalone reassembly path.  The
 * HANDOFF next node must match the "ip4-reassembly-handoff" node
 * registered below. */
VLIB_REGISTER_NODE (ip4_reass_node) = {
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1275
/* Entry point for the "ip4-reassembly-feature" node, used when reassembly
 * is enabled as an ip4-unicast feature on an interface. */
VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
				       vlib_node_runtime_t * node,
				       vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ ,
				false /* is_custom_app */ );
}
1283
/* *INDENT-OFF* */
/* Graph node registration for the feature-arc reassembly path; hands off
 * to the feature-specific handoff node. */
VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1300
/* *INDENT-OFF* */
/* Register the reassembly feature on the ip4-unicast arc so it runs
 * before lookup and IPsec input. */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1310
1311 #ifndef CLIB_MARCH_VARIANT
1312 always_inline u32
1313 ip4_reass_get_nbuckets ()
1314 {
1315   ip4_reass_main_t *rm = &ip4_reass_main;
1316   u32 nbuckets;
1317   u8 i;
1318
1319   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1320
1321   for (i = 0; i < 31; i++)
1322     if ((1 << i) >= nbuckets)
1323       break;
1324   nbuckets = 1 << i;
1325
1326   return nbuckets;
1327 }
1328 #endif /* CLIB_MARCH_VARIANT */
1329
/* Events signalled to the expire-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_reass_event_t;
1334
/* Context passed to the bihash rehash callback: destination table and a
 * sticky failure flag. */
typedef struct
{
  int failure;			/* set to 1 if any insertion into new_hash fails */
  clib_bihash_16_8_t *new_hash;	/* table being populated during rehash */
} ip4_rehash_cb_ctx;
1340
1341 #ifndef CLIB_MARCH_VARIANT
1342 static void
1343 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1344 {
1345   ip4_rehash_cb_ctx *ctx = _ctx;
1346   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1347     {
1348       ctx->failure = 1;
1349     }
1350 }
1351
1352 static void
1353 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1354                       u32 max_reassembly_length, u32 expire_walk_interval_ms)
1355 {
1356   ip4_reass_main.timeout_ms = timeout_ms;
1357   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1358   ip4_reass_main.max_reass_n = max_reassemblies;
1359   ip4_reass_main.max_reass_len = max_reassembly_length;
1360   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1361 }
1362
/**
 * @brief Apply a new reassembly configuration (API/CLI entry point).
 *
 * Stores the parameters, wakes the expire-walk process so it picks up the
 * new walk interval, and - if the required bucket count grew - rehashes
 * the reassembly table into a larger bihash.
 *
 * @return 0 on success, -1 if rehashing into the larger table failed
 *         (the old table is kept in that case)
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
	       u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  /* capture the bucket count before the parameters change */
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
			expire_walk_interval_ms);
  /* nudge the expire-walk process so the new interval takes effect */
  vlib_process_signal_event (ip4_reass_main.vlib_main,
			     ip4_reass_main.ip4_reass_expire_node_idx,
			     IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  /* only grow the table; shrinking is never done */
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
			     new_nbuckets * 1024);
      /* copy every existing entry into the new table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
					       ip4_rehash_cb, &ctx);
      if (ctx.failure)
	{
	  clib_bihash_free_16_8 (&new_hash);
	  return -1;
	}
      else
	{
	  /* swap the new table in place of the old one */
	  clib_bihash_free_16_8 (&ip4_reass_main.hash);
	  clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
			    sizeof (ip4_reass_main.hash));
	  clib_bihash_copied (&ip4_reass_main.hash, &new_hash);
	}
    }
  return 0;
}
1400
1401 vnet_api_error_t
1402 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1403                u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1404 {
1405   *timeout_ms = ip4_reass_main.timeout_ms;
1406   *max_reassemblies = ip4_reass_main.max_reass_n;
1407   *max_reassembly_length = ip4_reass_main.max_reass_len;
1408   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1409   return 0;
1410 }
1411
/**
 * @brief Plugin-style init: set up per-thread state, defaults, hash table
 * and handoff frame queues for ip4 reassembly.
 *
 * @return NULL on success (no failure paths set the error)
 */
static clib_error_t *
ip4_reass_init_function (vlib_main_t * vm)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one pool + lock per worker (plus main thread) */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_reass_expire_node_idx = node->index;

  /* defaults must be in place before sizing the hash below */
  ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
			IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
			IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
			IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes to move buffers between
   * workers */
  rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);

  return error;
}
1452
1453 VLIB_INIT_FUNCTION (ip4_reass_init_function);
1454 #endif /* CLIB_MARCH_VARIANT */
1455
/**
 * @brief Process node: periodically reap timed-out reassemblies.
 *
 * Sleeps for expire_walk_interval_ms (or until a config-changed event),
 * then walks every thread's reassembly pool under its lock, dropping all
 * buffers of any reassembly not heard from within the timeout.
 * Never returns.
 */
static uword
ip4_reass_walk_expired (vlib_main_t * vm,
			vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
					    (f64)
					    rm->expire_walk_interval_ms /
					    (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
	{
	case ~0:		/* no events => timeout */
	  /* nothing to do here */
	  break;
	case IP4_EVENT_CONFIG_CHANGED:
	  /* new interval is read on the next wait; fall through to a walk */
	  break;
	default:
	  clib_warning ("BUG: event type 0x%wx", event_type);
	  break;
	}
      f64 now = vlib_time_now (vm);

      ip4_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
	{
	  ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
	  clib_spinlock_lock (&rt->lock);

	  /* collect indexes first - freeing while pool_foreach'ing the
	   * same pool is not safe */
	  vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
	  int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_reass_drop_all (vm, node, rm, reass);
            ip4_reass_free (vm, rm, rt, reass);
          }
          /* *INDENT-ON* */

	  clib_spinlock_unlock (&rt->lock);
	}

      vec_free (pool_indexes_to_free);
      if (event_data)
	{
	  /* keep the vector allocated for the next iteration */
	  _vec_len (event_data) = 0;
	}
    }

  return 0;
}
1527
/* *INDENT-OFF* */
/* Registration of the expire-walk process node. */
VLIB_REGISTER_NODE (ip4_reass_expire_node) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1539
/* vppinfra format helper: pretty-print a reassembly hash key
 * (fib/xx id, addresses, fragment id, protocol). */
static u8 *
format_ip4_reass_key (u8 * s, va_list * args)
{
  ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
  s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
	      key->xx_id, format_ip4_address, &key->src, format_ip4_address,
	      &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}
1549
/* vppinfra format helper: dump one reassembly context and every buffer
 * range in its chain (used by "show ip4-reassembly details"). */
static u8 *
format_ip4_reass (u8 * s, va_list * args)
{
  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
  ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);

  s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
	      "last_packet_octet: %u, trace_op_counter: %u\n",
	      reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
	      reass->data_len, reass->last_packet_octet,
	      reass->trace_op_counter);
  /* walk the buffer chain starting at first_bi */
  u32 bi = reass->first_bi;
  u32 counter = 0;
  while (~0 != bi)
    {
      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
      vnet_buffer_opaque_t *vnb = vnet_buffer (b);
      s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
		  "fragment[%u, %u]\n",
		  counter, vnb->ip.reass.range_first,
		  vnb->ip.reass.range_last, bi,
		  ip4_reass_buffer_get_data_offset (b),
		  ip4_reass_buffer_get_data_len (b),
		  vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
	{
	  bi = b->next_buffer;
	}
      else
	{
	  /* end of chain */
	  bi = ~0;
	}
    }
  return s;
}
1585
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the per-worker reassembly counts (summed); with "details", also
 * dumps every active reassembly context.  Takes each thread's lock while
 * reading its pool.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
		unformat_input_t * input,
		CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
	{
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
	}
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
		   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
		   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
		   (long unsigned) rm->max_reass_n);
  return 0;
}
1629
/* *INDENT-OFF* */
/* CLI command registration for the status dump above. */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1637
1638 #ifndef CLIB_MARCH_VARIANT
/**
 * @brief Enable or disable reassembly as an ip4-unicast feature on an
 * interface.
 *
 * @param sw_if_index     interface to (un)configure
 * @param enable_disable  non-zero to enable, zero to disable
 * @return result of vnet_feature_enable_disable()
 */
vnet_api_error_t
ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
				      "ip4-reassembly-feature", sw_if_index,
				      enable_disable, 0, 0);
}
1646 #endif /* CLIB_MARCH_VARIANT */
1647
1648
/* Error counters specific to the handoff nodes. */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")
1651
1652
/* Enum of handoff-node error counters, generated from the list above. */
typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;
1660
/* Counter strings matching ip4_reassembly_handoff_error_t. */
static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1666
/* Packet trace record for the handoff nodes. */
typedef struct
{
  u32 next_worker_index;	/* thread the buffer was handed off to */
} ip4_reassembly_handoff_trace_t;
1671
1672 static u8 *
1673 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1674 {
1675   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1676   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1677   ip4_reassembly_handoff_trace_t *t =
1678     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1679
1680   s =
1681     format (s, "ip4-reassembly-handoff: next-worker %d",
1682             t->next_worker_index);
1683
1684   return s;
1685 }
1686
/**
 * @brief Shared worker for the handoff nodes.
 *
 * For every buffer in the frame, reads the destination worker index from
 * the buffer metadata (the feature flavor uses a different metadata
 * field) and enqueues the whole frame to the matching frame queue.
 * Buffers that cannot be enqueued are counted as congestion drops.
 *
 * @param is_feature selects the feature frame queue and metadata field
 * @return frame->n_vectors
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
				    vlib_node_runtime_t * node,
				    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* first pass: collect per-buffer destination threads (and trace) */
  while (n_left_from > 0)
    {
      ti[0] =
	(is_feature) ? vnet_buffer (b[0])->ip.
	reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
	reass.owner_thread_index;

      if (PREDICT_FALSE
	  ((node->flags & VLIB_NODE_FLAG_TRACE)
	   && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
	{
	  ip4_reassembly_handoff_trace_t *t =
	    vlib_add_trace (vm, node, b[0], sizeof (*t));
	  t->next_worker_index = ti[0];
	}

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* second pass: enqueue everything to the destination threads at once */
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
				   frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
				 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
				 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1738
/*
 * Entry point for the non-feature (ip4 input path) handoff node; thin
 * wrapper selecting the non-feature variant of the shared inline.
 */
VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
					     false /* is_feature */ );
}
1746
1747
/* *INDENT-OFF* */
/*
 * Registration for the non-feature handoff node: handoff error strings,
 * trace formatter, and a single next node (error-drop) for buffers that
 * cannot be handed off.
 */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1763
1764
/* *INDENT-OFF* */
/*
 * Entry point for the feature-arc variant of the handoff node; thin
 * wrapper selecting the feature variant of the shared inline.
 */
VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
						    vlib_node_runtime_t *
						    node,
						    vlib_frame_t * frame)
{
  return ip4_reassembly_handoff_node_inline (vm, node, frame,
					     true /* is_feature */ );
}
/* *INDENT-ON* */
1775
1776
/* *INDENT-OFF* */
/*
 * Registration for the feature-arc handoff node; mirrors the non-feature
 * registration (same errors, trace formatter, and error-drop next node)
 * under a distinct graph-node name.
 */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1792
1793 /*
1794  * fd.io coding-style-patch-verification: ON
1795  *
1796  * Local Variables:
1797  * eval: (c-set-style "gnu")
1798  * End:
1799  */