ip: reassembly: send packet out on correct worker
[vpp.git] / src / vnet / ip / ip4_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Reassembly.
19  *
20  * This file contains the source code for IPv4 reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_16_8.h>
27 #include <vnet/ip/ip4_reassembly.h>
28 #include <stddef.h>
29
/* milliseconds per second - used to convert the ms config values */
#define MSEC_PER_SEC 1000
/* default idle timeout; contexts not heard from for longer are expired */
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
/* default cap on concurrent reassembly contexts (enforced per thread) */
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* default cap on the number of fragments in a single reassembly */
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* NOTE(review): load factor presumably sizes the bihash - usage not
 * visible in this part of the file, confirm against the init code */
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

/* set to 1 to enable stdout dumps of buffer chains for debugging */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* print buffer index bi and every chained buffer index that follows it */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
58
// internal return codes of the reassembly helpers
typedef enum
{
  IP4_REASS_RC_OK,
  // more fragments than allowed arrived for one reassembly
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  // inconsistent internal state detected (sanity check failed)
  IP4_REASS_RC_INTERNAL_ERROR,
  // buffer shortage (e.g. while linearizing the reassembled chain)
  IP4_REASS_RC_NO_BUF,
  // fragment belongs to a context owned by another thread - hand it off
  IP4_REASS_RC_HANDOFF,
} ip4_reass_rc_t;
67
// 16-byte bihash key identifying one in-progress reassembly
typedef struct
{
  union
  {
    struct
    {
      // NOTE(review): presumably a fib/interface discriminator - confirm
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    // flat view used when copying the key into bihash kv structures
    u64 as_u64[2];
  };
} ip4_reass_key_t;
84
// 8-byte bihash value: locates the reassembly context
typedef union
{
  struct
  {
    // index into the owning thread's context pool
    u32 reass_index;
    // thread whose per-thread pool holds the context
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_reass_val_t;
94
// convenience overlay of (key, value) on top of a clib_bihash_kv_16_8_t
typedef union
{
  struct
  {
    ip4_reass_key_t k;
    ip4_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_reass_kv_t;
104
105 always_inline u32
106 ip4_reass_buffer_get_data_offset (vlib_buffer_t * b)
107 {
108   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
109   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
110 }
111
112 always_inline u16
113 ip4_reass_buffer_get_data_len (vlib_buffer_t * b)
114 {
115   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
116   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
117     (vnb->ip.reass.fragment_first + ip4_reass_buffer_get_data_offset (b)) + 1;
118 }
119
// per-reassembly context - one instance per in-flight fragmented packet
typedef struct
{
  // hash table key
  ip4_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_reass_t;
150
// per-thread reassembly state
typedef struct
{
  // pool of reassembly contexts
  ip4_reass_t *pool;
  // number of contexts currently allocated from the pool
  u32 reass_n;
  // source of per-thread-unique reassembly ids
  u32 id_counter;
  // NOTE(review): lock usage is not visible in this chunk - presumably
  // guards the pool; confirm against the node dispatch functions
  clib_spinlock_t lock;
} ip4_reass_per_thread_t;
159
// global reassembly state (config + runtime)
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout in vlib clock units - compared against vlib_time_now ()
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  // maps ip4_reass_key_t -> ip4_reass_val_t (owner thread + pool index)
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;
} ip4_reass_main_t;
187
188 extern ip4_reass_main_t ip4_reass_main;
189
190 #ifndef CLIB_MARCH_VARIANT
191 ip4_reass_main_t ip4_reass_main;
192 #endif /* CLIB_MARCH_VARIANT */
193
// next-node indices for the reassembly graph nodes
typedef enum
{
  IP4_REASSEMBLY_NEXT_INPUT,
  IP4_REASSEMBLY_NEXT_DROP,
  // hand the fragment off to the thread owning its reassembly context
  IP4_REASSEMBLY_NEXT_HANDOFF,
  // number of next nodes
  IP4_REASSEMBLY_N_NEXT,
} ip4_reass_next_t;
201
// operations recorded in packet traces (rendered by format_ip4_reass_trace)
typedef enum
{
  RANGE_NEW,      // new fragment range inserted into the chain
  RANGE_SHRINK,   // existing range shrunk to resolve a partial overlap
  RANGE_DISCARD,  // range removed from the chain and freed
  RANGE_OVERLAP,  // fully-overlapping fragment ignored
  FINALIZE,       // reassembly completed
  HANDOFF,        // fragment handed from one thread to another
} ip4_reass_trace_operation_e;
211
// snapshot of one fragment range, captured for tracing
typedef struct
{
  u16 range_first;  // first octet covered by the range
  u16 range_last;   // last octet covered by the range
  u32 range_bi;     // buffer index heading the range
  i32 data_offset;  // ip4_reass_buffer_get_data_offset () at capture time
  u32 data_len;     // ip4_reass_buffer_get_data_len () at capture time
  u32 first_bi;     // first buffer index of the whole reassembly
} ip4_reass_range_trace_t;
221
// one trace record added per reassembly operation
typedef struct
{
  ip4_reass_trace_operation_e action;
  u32 reass_id;      // reassembly id, ~0 suppresses the header line
  ip4_reass_range_trace_t trace_range;
  u32 size_diff;     // bytes trimmed, only meaningful for RANGE_SHRINK
  u32 op_id;         // per-reassembly operation sequence number
  u32 thread_id;     // thread recording the trace
  u32 thread_id_to;  // destination thread, only meaningful for HANDOFF
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
} ip4_reass_trace_t;
235
236 extern vlib_node_registration_t ip4_reass_node;
237 extern vlib_node_registration_t ip4_reass_node_feature;
238
239 static void
240 ip4_reass_trace_details (vlib_main_t * vm, u32 bi,
241                          ip4_reass_range_trace_t * trace)
242 {
243   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
244   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
245   trace->range_first = vnb->ip.reass.range_first;
246   trace->range_last = vnb->ip.reass.range_last;
247   trace->data_offset = ip4_reass_buffer_get_data_offset (b);
248   trace->data_len = ip4_reass_buffer_get_data_len (b);
249   trace->range_bi = bi;
250 }
251
252 static u8 *
253 format_ip4_reass_range_trace (u8 * s, va_list * args)
254 {
255   ip4_reass_range_trace_t *trace = va_arg (*args, ip4_reass_range_trace_t *);
256   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
257               trace->range_last, trace->data_offset, trace->data_len,
258               trace->range_bi);
259   return s;
260 }
261
/**
 * @brief vppinfra format callback rendering one ip4_reass_trace_t record.
 *
 * Prints an optional header (suppressed when reass_id is ~0, i.e. handoff
 * records without a context) followed by an action-specific line.
 */
static u8 *
format_ip4_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_reass_trace_t *t = va_arg (*args, ip4_reass_trace_t *);
  u32 indent = 0;
  if (~0 != t->reass_id)
    {
      s = format (s, "reass id: %u, op id: %u, ", t->reass_id, t->op_id);
      // indent the action line to align under the header
      indent = format_get_indent (s);
      s =
        format (s,
                "first bi: %u, data len: %u, ip/fragment[%u, %u]",
                t->trace_range.first_bi, t->total_data_len, t->fragment_first,
                t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range,
                  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
                  format_ip4_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
        format (s, "handoff from thread #%u to thread #%u", t->thread_id,
                t->thread_id_to);
      break;
    }
  return s;
}
309
/**
 * @brief Add one ip4_reass_trace_t record to buffer @c bi's packet trace.
 *
 * @param reass_id reassembly id (~0 when no context, e.g. pre-handoff)
 * @param op_id per-reassembly operation sequence number
 * @param bi buffer the trace is attached to and whose range is captured
 * @param first_bi first buffer of the reassembly at the time of the event
 * @param data_len total data collected so far
 * @param action which ip4_reass_trace_operation_e happened
 * @param size_diff bytes trimmed (RANGE_SHRINK only)
 * @param thread_id_to destination thread (HANDOFF only, otherwise ~0)
 */
static void
ip4_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_reass_main_t * rm, u32 reass_id, u32 op_id,
                     u32 bi, u32 first_bi, u32 data_len,
                     ip4_reass_trace_operation_e action, u32 size_diff,
                     u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  ip4_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->reass_id = reass_id;
  t->action = action;
  ip4_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->op_id = op_id;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  t->trace_range.first_bi = first_bi;
  t->total_data_len = data_len;
#if 0
  // disabled debug aid: also print the freshly formatted trace to stdout
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
339
340 always_inline void
341 ip4_reass_free_ctx (ip4_reass_per_thread_t * rt, ip4_reass_t * reass)
342 {
343   pool_put (rt->pool, reass);
344   --rt->reass_n;
345 }
346
347 always_inline void
348 ip4_reass_free (vlib_main_t * vm, ip4_reass_main_t * rm,
349                 ip4_reass_per_thread_t * rt, ip4_reass_t * reass)
350 {
351   clib_bihash_kv_16_8_t kv;
352   kv.key[0] = reass->key.as_u64[0];
353   kv.key[1] = reass->key.as_u64[1];
354   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
355   return ip4_reass_free_ctx (rt, reass);
356 }
357
/**
 * @brief Dispose of every buffer held by a reassembly.
 *
 * Walks the chain of fragment ranges and, within each range, the chained
 * buffers, collecting all buffer indices (and breaking the chains so each
 * buffer can be handled individually). If a custom app registered an
 * error_next_index the buffers are enqueued there; otherwise they are
 * simply freed.
 */
always_inline void
ip4_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      // collect every buffer chained under this range head
      while (~0 != bi)
        {
          vec_add1 (to_free, bi);
          vlib_buffer_t *b = vlib_get_buffer (vm, bi);
          if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
            {
              bi = b->next_buffer;
              // unlink so each buffer is freed/enqueued on its own
              b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
            }
          else
            {
              bi = ~0;
            }
        }
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      while (vec_len (to_free) > 0)
        {
          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (vec_len (to_free) > 0 && n_left_to_next > 0)
            {
              bi = vec_pop (to_free);

              if (~0 != bi)
                {
                  to_next[0] = bi;
                  to_next += 1;
                  n_left_to_next -= 1;
                  // NOTE(review): next_index is passed as both the frame's
                  // next and the per-buffer next0 - confirm intended
                  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                                   to_next, n_left_to_next,
                                                   bi, next_index);
                }
            }
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
}
421
/**
 * @brief Look up the reassembly context for key @c kv, creating one if
 * none exists.
 *
 * On lookup hit owned by another thread, sets *do_handoff and returns the
 * (foreign) context without touching it. Expired contexts are dropped and
 * recreated. Returns NULL when the per-thread context limit is reached or
 * the bihash insert fails for a reason other than a concurrent insert.
 */
static ip4_reass_t *
ip4_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                          ip4_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      // hit - the value tells us which thread's pool holds the context
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);
      if (vm->thread_index != reass->memory_owner_thread_index)
        {
          // context lives on another thread - caller must hand off
          *do_handoff = 1;
          return reass;
        }

      if (now > reass->last_heard + rm->timeout)
        {
          // stale context - drop its fragments and start over below
          ip4_reass_drop_all (vm, node, rm, reass);
          ip4_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n)
    {
      // per-thread context limit reached - refuse to create a new one
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      // id unique across threads: thread index scaled out of counter range
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      reass->first_bi = ~0;
      reass->last_packet_octet = ~0;
      reass->data_len = 0;
      reass->next_index = ~0;
      reass->error_next_index = ~0;
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  // is_add = 2: add only if not present, so a concurrent insert loses
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
500
/**
 * @brief Glue the collected fragments into one packet and emit it.
 *
 * Walks the range chain, trimming each fragment's IP header/overlap bytes
 * and linking the kept buffers into a single chain headed by the first
 * fragment (whose IP header is kept and rewritten: fragmentation fields
 * cleared, length and checksum recomputed). On success *bi0/*next0/*error0
 * describe the reassembled packet and the context is freed.
 *
 * @return IP4_REASS_RC_OK, IP4_REASS_RC_NO_BUF if linearization fails, or
 *         IP4_REASS_RC_INTERNAL_ERROR on any sanity-check failure.
 */
always_inline ip4_reass_rc_t
ip4_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                    ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
                    ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
                    bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  // outer loop: one iteration per fragment range in the chain
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      // NOTE(review): errors only when BOTH invariants fail - confirm the
      // intended polarity of this sanity check
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_reass_buffer_get_data_len (tmp);
      // trim_front: IP header plus any overlap already covered by a
      // previous range; trim_end: bytes past this range's usable data
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // inner loop: walk the buffers chained under this range
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  // partial trim - just move current_data forward
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // append this buffer to the growing reassembled chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // last useful buffer of this range - truncate tail
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              // past the kept data - free the remaining tail buffers
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  // total_length counted every kept buffer including the head; convert to
  // "not including first buffer" form expected by vlib
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the head IP header: no longer a fragment
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_reass_add_trace (vm, node, rm, reass->id, reass->trace_op_counter,
                           reass->first_bi, reass->first_bi, reass->data_len,
                           FINALIZE, 0, ~0);
      ++reass->trace_op_counter;
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_REASSEMBLY_NEXT_INPUT;
    }
  else
    {
      // custom apps get the next index they registered on the first frag
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_reass_free (vm, rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
705
/**
 * @brief Link buffer @c new_next_bi into the reassembly's range chain
 * after @c prev_range_bi (or at the head when prev_range_bi is ~0), and
 * account its payload bytes in reass->data_len.
 *
 * @return IP4_REASS_RC_OK, or IP4_REASS_RC_INTERNAL_ERROR on sanity-check
 *         failure.
 */
always_inline ip4_reass_rc_t
ip4_reass_insert_range_in_chain (vlib_main_t * vm,
                                 ip4_reass_main_t * rm,
                                 ip4_reass_per_thread_t * rt,
                                 ip4_reass_t * reass,
                                 u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      // splice after the previous range
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      // insert at the head of the chain
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  // NOTE(review): errors only when BOTH invariants fail - confirm the
  // intended polarity of this sanity check
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
739
/**
 * @brief Unlink range @c discard_bi from the chain (successor of
 * @c prev_range_bi, or the head when prev_range_bi is ~0), subtract its
 * bytes from reass->data_len, and free every buffer chained under it.
 *
 * @return IP4_REASS_RC_OK, or IP4_REASS_RC_INTERNAL_ERROR when the chain
 *         linkage or range invariants do not hold.
 */
always_inline ip4_reass_rc_t
ip4_reass_remove_range_from_chain (vlib_main_t * vm,
                                   vlib_node_runtime_t * node,
                                   ip4_reass_main_t * rm,
                                   ip4_reass_t * reass, u32 prev_range_bi,
                                   u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      // the caller must pass the actual predecessor of discard_bi
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      // removing the head range
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  // NOTE(review): errors only when BOTH invariants fail - confirm the
  // intended polarity of this sanity check
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_reass_buffer_get_data_len (discard_b);
  // free the discarded range's buffer chain one buffer at a time,
  // tracing each discard when the buffer is traced
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_reass_add_trace (vm, node, rm, reass->id,
                               reass->trace_op_counter, discard_bi,
                               reass->first_bi, reass->data_len,
                               RANGE_DISCARD, 0, ~0);
          ++reass->trace_op_counter;
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
798
799 always_inline ip4_reass_rc_t
800 ip4_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
801                   ip4_reass_main_t * rm, ip4_reass_per_thread_t * rt,
802                   ip4_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
803                   bool is_custom_app, u32 * handoff_thread_idx)
804 {
805   ip4_reass_rc_t rc = IP4_REASS_RC_OK;
806   int consumed = 0;
807   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
808   ip4_header_t *fip = vlib_buffer_get_current (fb);
809   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
810   if (is_custom_app)
811     {
812       // store (error_)next_index before it's overwritten
813       reass->next_index = fvnb->ip.reass.next_index;
814       reass->error_next_index = fvnb->ip.reass.error_next_index;
815     }
816   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
817   const u32 fragment_length =
818     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
819   const u32 fragment_last = fragment_first + fragment_length - 1;
820   fvnb->ip.reass.fragment_first = fragment_first;
821   fvnb->ip.reass.fragment_last = fragment_last;
822   int more_fragments = ip4_get_fragment_more (fip);
823   u32 candidate_range_bi = reass->first_bi;
824   u32 prev_range_bi = ~0;
825   fvnb->ip.reass.range_first = fragment_first;
826   fvnb->ip.reass.range_last = fragment_last;
827   fvnb->ip.reass.next_range_bi = ~0;
828   if (!more_fragments)
829     {
830       reass->last_packet_octet = fragment_last;
831     }
832   if (~0 == reass->first_bi)
833     {
834       // starting a new reassembly
835       rc =
836         ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
837                                          *bi0);
838       if (IP4_REASS_RC_OK != rc)
839         {
840           return rc;
841         }
842       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
843         {
844           ip4_reass_add_trace (vm, node, rm, reass->id,
845                                reass->trace_op_counter, *bi0, reass->first_bi,
846                                reass->data_len, RANGE_NEW, 0, ~0);
847           ++reass->trace_op_counter;
848         }
849       *bi0 = ~0;
850       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
851       reass->fragments_n = 1;
852       return IP4_REASS_RC_OK;
853     }
854   reass->min_fragment_length = clib_min (clib_net_to_host_u16 (fip->length),
855                                          fvnb->ip.reass.estimated_mtu);
856   while (~0 != candidate_range_bi)
857     {
858       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
859       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
860       if (fragment_first > candidate_vnb->ip.reass.range_last)
861         {
862           // this fragments starts after candidate range
863           prev_range_bi = candidate_range_bi;
864           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
865           if (candidate_vnb->ip.reass.range_last < fragment_last &&
866               ~0 == candidate_range_bi)
867             {
868               // special case - this fragment falls beyond all known ranges
869               rc =
870                 ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
871                                                  prev_range_bi, *bi0);
872               if (IP4_REASS_RC_OK != rc)
873                 {
874                   return rc;
875                 }
876               consumed = 1;
877               break;
878             }
879           continue;
880         }
881       if (fragment_last < candidate_vnb->ip.reass.range_first)
882         {
883           // this fragment ends before candidate range without any overlap
884           rc =
885             ip4_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
886                                              *bi0);
887           if (IP4_REASS_RC_OK != rc)
888             {
889               return rc;
890             }
891           consumed = 1;
892         }
893       else
894         {
895           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
896               fragment_last <= candidate_vnb->ip.reass.range_last)
897             {
898               // this fragment is a (sub)part of existing range, ignore it
899               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
900                 {
901                   ip4_reass_add_trace (vm, node, rm, reass->id,
902                                        reass->trace_op_counter, *bi0,
903                                        reass->first_bi, reass->data_len,
904                                        RANGE_OVERLAP, 0, ~0);
905                   ++reass->trace_op_counter;
906                 }
907               break;
908             }
909           int discard_candidate = 0;
910           if (fragment_first < candidate_vnb->ip.reass.range_first)
911             {
912               u32 overlap =
913                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
914               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
915                 {
916                   candidate_vnb->ip.reass.range_first += overlap;
917                   if (reass->data_len < overlap)
918                     {
919                       return IP4_REASS_RC_INTERNAL_ERROR;
920                     }
921                   reass->data_len -= overlap;
922                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
923                     {
924                       ip4_reass_add_trace (vm, node, rm, reass->id,
925                                            reass->trace_op_counter,
926                                            candidate_range_bi,
927                                            reass->first_bi, reass->data_len,
928                                            RANGE_SHRINK, 0, ~0);
929                       ++reass->trace_op_counter;
930                     }
931                   rc =
932                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
933                                                      prev_range_bi, *bi0);
934                   if (IP4_REASS_RC_OK != rc)
935                     {
936                       return rc;
937                     }
938                   consumed = 1;
939                 }
940               else
941                 {
942                   discard_candidate = 1;
943                 }
944             }
945           else if (fragment_last > candidate_vnb->ip.reass.range_last)
946             {
947               u32 overlap =
948                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
949               if (overlap < ip4_reass_buffer_get_data_len (candidate_b))
950                 {
951                   fvnb->ip.reass.range_first += overlap;
952                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
953                     {
954                       prev_range_bi = candidate_range_bi;
955                       candidate_range_bi =
956                         candidate_vnb->ip.reass.next_range_bi;
957                       continue;
958                     }
959                   else
960                     {
961                       // special case - last range discarded
962                       rc =
963                         ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
964                                                          candidate_range_bi,
965                                                          *bi0);
966                       if (IP4_REASS_RC_OK != rc)
967                         {
968                           return rc;
969                         }
970                       consumed = 1;
971                     }
972                 }
973               else
974                 {
975                   discard_candidate = 1;
976                 }
977             }
978           else
979             {
980               discard_candidate = 1;
981             }
982           if (discard_candidate)
983             {
984               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
985               // discard candidate range, probe next range
986               rc =
987                 ip4_reass_remove_range_from_chain (vm, node, rm, reass,
988                                                    prev_range_bi,
989                                                    candidate_range_bi);
990               if (IP4_REASS_RC_OK != rc)
991                 {
992                   return rc;
993                 }
994               if (~0 != next_range_bi)
995                 {
996                   candidate_range_bi = next_range_bi;
997                   continue;
998                 }
999               else
1000                 {
1001                   // special case - last range discarded
1002                   rc =
1003                     ip4_reass_insert_range_in_chain (vm, rm, rt, reass,
1004                                                      prev_range_bi, *bi0);
1005                   if (IP4_REASS_RC_OK != rc)
1006                     {
1007                       return rc;
1008                     }
1009                   consumed = 1;
1010                 }
1011             }
1012         }
1013       break;
1014     }
1015   ++reass->fragments_n;
1016   if (consumed)
1017     {
1018       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1019         {
1020           ip4_reass_add_trace (vm, node, rm, reass->id,
1021                                reass->trace_op_counter, *bi0, reass->first_bi,
1022                                reass->data_len, RANGE_NEW, 0, ~0);
1023           ++reass->trace_op_counter;
1024         }
1025     }
1026   if (~0 != reass->last_packet_octet &&
1027       reass->data_len == reass->last_packet_octet + 1)
1028     {
1029       *handoff_thread_idx = reass->sendout_thread_index;
1030       rc =
1031         ip4_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1032                             is_custom_app);
1033       if (IP4_REASS_RC_OK == rc
1034           && reass->memory_owner_thread_index != reass->sendout_thread_index)
1035         {
1036           rc = IP4_REASS_RC_HANDOFF;
1037         }
1038     }
1039   else
1040     {
1041       if (consumed)
1042         {
1043           *bi0 = ~0;
1044           if (reass->fragments_n > rm->max_reass_len)
1045             {
1046               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1047             }
1048         }
1049       else
1050         {
1051           *next0 = IP4_REASSEMBLY_NEXT_DROP;
1052           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1053         }
1054     }
1055   return rc;
1056 }
1057
/**
 * @brief Shared worker body for the ip4-reassembly graph nodes.
 *
 * Walks one frame of buffers under the per-thread reassembly lock.
 * Non-fragmented packets pass straight through; fragments are validated,
 * matched to (or create) a reassembly context keyed on
 * fib_index/src/dst/frag_id/proto, and fed to ip4_reass_update().
 * Buffers owned by another thread's reassembly are redirected to the
 * handoff node.
 *
 * @param vm            vlib main for the current thread
 * @param node          this node's runtime
 * @param frame         frame of buffer indices to process
 * @param is_feature    true when running as an ip4-unicast feature
 * @param is_custom_app true when a custom app supplies the next index
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_reassembly_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature,
                       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_reass_main_t *rm = &ip4_reass_main;
  ip4_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* serialize access to this thread's reassembly pool for the whole frame */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (!is_custom_app)
                {
                  next0 = IP4_REASSEMBLY_NEXT_INPUT;
                }
              else
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
            }
          else
            {
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              const u32 fragment_length =
                clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
              const u32 fragment_last = fragment_first + fragment_length - 1;
              /* sanity: reject fragments that underflow, would exceed the
               * maximum datagram size, or are shorter than the minimum
               * non-final fragment length */
              if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))        // 8 is minimum frag length per RFC 791
                {
                  next0 = IP4_REASSEMBLY_NEXT_DROP;
                  error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
                }
              else
                {
                  ip4_reass_kv_t kv;
                  u8 do_handoff = 0;

                  /* 16-byte lookup key: fib index + src addr in word 0,
                   * dst addr + fragment id + protocol in word 1 */
                  kv.k.as_u64[0] =
                    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                                   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
                    (u64) ip0->src_address.as_u32 << 32;
                  kv.k.as_u64[1] =
                    (u64) ip0->dst_address.as_u32 |
                    (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

                  ip4_reass_t *reass =
                    ip4_reass_find_or_create (vm, node, rm, rt, &kv,
                                              &do_handoff);
                  if (reass)
                    {
                      const u32 fragment_first =
                        ip4_get_fragment_offset_bytes (ip0);
                      /* the thread that saw the first fragment is the one
                       * that must send the reassembled packet out */
                      if (0 == fragment_first)
                        {
                          reass->sendout_thread_index = vm->thread_index;
                        }
                    }
                  if (PREDICT_FALSE (do_handoff))
                    {
                      /* reassembly context lives on another thread - record
                       * the owner and hand the buffer off */
                      next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                      if (is_feature)
                        vnet_buffer (b0)->ip.
                          reass.owner_feature_thread_index =
                          kv.v.memory_owner_thread_index;
                      else
                        vnet_buffer (b0)->ip.reass.owner_thread_index =
                          kv.v.memory_owner_thread_index;
                    }
                  else if (reass)
                    {
                      u32 handoff_thread_idx;
                      switch (ip4_reass_update
                              (vm, node, rm, rt, reass, &bi0, &next0,
                               &error0, is_custom_app, &handoff_thread_idx))
                        {
                        case IP4_REASS_RC_OK:
                          /* nothing to do here */
                          break;
                        case IP4_REASS_RC_HANDOFF:
                          /* packet finalized on this thread but must be sent
                           * out on the thread that saw the first fragment */
                          next0 = IP4_REASSEMBLY_NEXT_HANDOFF;
                          b0 = vlib_get_buffer (vm, bi0);
                          if (is_feature)
                            vnet_buffer (b0)->ip.
                              reass.owner_feature_thread_index =
                              handoff_thread_idx;
                          else
                            vnet_buffer (b0)->ip.reass.owner_thread_index =
                              handoff_thread_idx;
                          break;
                        case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass);
                          ip4_reass_free (vm, rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_NO_BUF:
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_NO_BUF,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass);
                          ip4_reass_free (vm, rm, rt, reass);
                          goto next_packet;
                          break;
                        case IP4_REASS_RC_INTERNAL_ERROR:
                          /* drop everything and start with a clean slate */
                          vlib_node_increment_counter (vm, node->node_index,
                                                       IP4_ERROR_REASS_INTERNAL_ERROR,
                                                       1);
                          ip4_reass_drop_all (vm, node, rm, reass);
                          ip4_reass_free (vm, rm, rt, reass);
                          goto next_packet;
                          break;
                        }
                    }
                  else
                    {
                      /* no context and no handoff => reassembly limit hit */
                      next0 = IP4_REASSEMBLY_NEXT_DROP;
                      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
                    }
                }

              b0->error = node->errors[error0];
            }

          /* bi0 == ~0 means the fragment was consumed into a reassembly
           * and there is nothing to enqueue for this packet */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              if (next0 == IP4_REASSEMBLY_NEXT_HANDOFF)
                {
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      if (is_feature)
                        ip4_reass_add_trace (vm, node, rm, ~0,
                                             ~0,
                                             bi0, ~0, ~0, HANDOFF, 0,
                                             vnet_buffer (b0)->ip.
                                             reass.owner_feature_thread_index);
                      else
                        ip4_reass_add_trace (vm, node, rm, ~0, ~0, bi0,
                                             ~0, ~0, HANDOFF, 0,
                                             vnet_buffer (b0)->ip.
                                             reass.owner_thread_index);
                    }
                }
              else if (is_feature && IP4_ERROR_NONE == error0)
                {
                  /* advance along the feature arc only for clean packets */
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1247
/* Human-readable counter strings, expanded from the ip4 error list and
 * indexed by ip4_error_t - shared by all reassembly node registrations. */
static char *ip4_reassembly_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1253
/* Node function for the standalone (non-feature) ip4-reassembly node;
 * delegates to the shared inline with both variant flags cleared. */
VLIB_NODE_FN (ip4_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, false /* is_feature */ ,
                                false /* is_custom_app */ );
}
1260
/* *INDENT-OFF* */
/* Registration of the standalone "ip4-reassembly" graph node; reassembled
 * packets continue to ip4-input, failures go to ip4-drop, and buffers owned
 * by another thread go to the handoff node. */
VLIB_REGISTER_NODE (ip4_reass_node) = {
    .name = "ip4-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1278
/* Node function for the feature-arc variant of ip4 reassembly;
 * delegates to the shared inline with is_feature set. */
VLIB_NODE_FN (ip4_reass_node_feature) (vlib_main_t * vm,
                                       vlib_node_runtime_t * node,
                                       vlib_frame_t * frame)
{
  return ip4_reassembly_inline (vm, node, frame, true /* is_feature */ ,
                                false /* is_custom_app */ );
}
1286
/* *INDENT-OFF* */
/* Registration of the feature-arc "ip4-reassembly-feature" node; identical
 * to the standalone node except handoff goes to the feature handoff node. */
VLIB_REGISTER_NODE (ip4_reass_node_feature) = {
    .name = "ip4-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,
    .n_next_nodes = IP4_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_REASSEMBLY_NEXT_HANDOFF] = "ip4-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1303
/* *INDENT-OFF* */
/* Hook the feature node into the ip4-unicast arc, ahead of ip4-lookup and
 * ipsec4-input-feature so those always see whole packets. */
VNET_FEATURE_INIT (ip4_reassembly_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1313
1314 #ifndef CLIB_MARCH_VARIANT
1315 always_inline u32
1316 ip4_reass_get_nbuckets ()
1317 {
1318   ip4_reass_main_t *rm = &ip4_reass_main;
1319   u32 nbuckets;
1320   u8 i;
1321
1322   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1323
1324   for (i = 0; i < 31; i++)
1325     if ((1 << i) >= nbuckets)
1326       break;
1327   nbuckets = 1 << i;
1328
1329   return nbuckets;
1330 }
1331 #endif /* CLIB_MARCH_VARIANT */
1332
/* Events delivered to the expiration-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1, /* parameters changed via ip4_reass_set() */
} ip4_reass_event_t;
1337
/* Context passed to ip4_rehash_cb while copying entries into a bigger
 * hash table during resize. */
typedef struct
{
  int failure;                  /* set to 1 if any insert into new_hash fails */
  clib_bihash_16_8_t *new_hash; /* destination table being populated */
} ip4_rehash_cb_ctx;
1343
1344 #ifndef CLIB_MARCH_VARIANT
1345 static void
1346 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1347 {
1348   ip4_rehash_cb_ctx *ctx = _ctx;
1349   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1350     {
1351       ctx->failure = 1;
1352     }
1353 }
1354
1355 static void
1356 ip4_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1357                       u32 max_reassembly_length, u32 expire_walk_interval_ms)
1358 {
1359   ip4_reass_main.timeout_ms = timeout_ms;
1360   ip4_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1361   ip4_reass_main.max_reass_n = max_reassemblies;
1362   ip4_reass_main.max_reass_len = max_reassembly_length;
1363   ip4_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1364 }
1365
/**
 * @brief API entry point: apply new reassembly parameters.
 *
 * Stores the parameters, wakes the expiration-walk process so it picks up
 * the new interval, and - if the configured maximum grew enough to need a
 * bigger hash table - rebuilds the lookup hash by copying every entry into
 * a freshly sized table.
 *
 * @return 0 on success, -1 if rehashing into the larger table failed
 */
vnet_api_error_t
ip4_reass_set (u32 timeout_ms, u32 max_reassemblies,
               u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_reass_get_nbuckets ();
  ip4_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
                        expire_walk_interval_ms);
  /* nudge the expire-walk process so it re-reads the walk interval */
  vlib_process_signal_event (ip4_reass_main.vlib_main,
                             ip4_reass_main.ip4_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_reass_get_nbuckets ();
  /* only grow the table, never shrink it */
  if (ip4_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-reass", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy every live entry into the bigger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* keep the old table intact; discard the partial copy */
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          /* swap the new table in place of the old one */
          clib_bihash_free_16_8 (&ip4_reass_main.hash);
          clib_memcpy_fast (&ip4_reass_main.hash, &new_hash,
                            sizeof (ip4_reass_main.hash));
          clib_bihash_copied (&ip4_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1403
1404 vnet_api_error_t
1405 ip4_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1406                u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1407 {
1408   *timeout_ms = ip4_reass_main.timeout_ms;
1409   *max_reassemblies = ip4_reass_main.max_reass_n;
1410   *max_reassembly_length = ip4_reass_main.max_reass_len;
1411   *expire_walk_interval_ms = ip4_reass_main.expire_walk_interval_ms;
1412   return 0;
1413 }
1414
1415 static clib_error_t *
1416 ip4_reass_init_function (vlib_main_t * vm)
1417 {
1418   ip4_reass_main_t *rm = &ip4_reass_main;
1419   clib_error_t *error = 0;
1420   u32 nbuckets;
1421   vlib_node_t *node;
1422
1423   rm->vlib_main = vm;
1424
1425   vec_validate (rm->per_thread_data, vlib_num_workers ());
1426   ip4_reass_per_thread_t *rt;
1427   vec_foreach (rt, rm->per_thread_data)
1428   {
1429     clib_spinlock_init (&rt->lock);
1430     pool_alloc (rt->pool, rm->max_reass_n);
1431   }
1432
1433   node = vlib_get_node_by_name (vm, (u8 *) "ip4-reassembly-expire-walk");
1434   ASSERT (node);
1435   rm->ip4_reass_expire_node_idx = node->index;
1436
1437   ip4_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
1438                         IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
1439                         IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
1440                         IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1441
1442   nbuckets = ip4_reass_get_nbuckets ();
1443   clib_bihash_init_16_8 (&rm->hash, "ip4-reass", nbuckets, nbuckets * 1024);
1444
1445   node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
1446   ASSERT (node);
1447   rm->ip4_drop_idx = node->index;
1448
1449   rm->fq_index = vlib_frame_queue_main_init (ip4_reass_node.index, 0);
1450   rm->fq_feature_index =
1451     vlib_frame_queue_main_init (ip4_reass_node_feature.index, 0);
1452
1453   return error;
1454 }
1455
1456 VLIB_INIT_FUNCTION (ip4_reass_init_function);
1457 #endif /* CLIB_MARCH_VARIANT */
1458
/**
 * @brief Process node: periodically expire stale reassemblies.
 *
 * Sleeps for the configured walk interval (or until a config-changed
 * event), then, per thread, collects timed-out reassemblies under the
 * thread's lock and frees them along with their buffers. Collection and
 * freeing are two phases because freeing mutates the pool being walked.
 *
 * @return never returns (infinite process loop); 0 to satisfy signature
 */
static uword
ip4_reass_walk_expired (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_reass_main_t *rm = &ip4_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* wake-up only; the new interval is used on the next wait */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* phase 1: collect indices of expired reassemblies; the vector
           * is reused across threads to avoid churn */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* phase 2: drop buffers and return each expired context */
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_reass_drop_all (vm, node, rm, reass);
            ip4_reass_free (vm, rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1530
/* *INDENT-OFF* */
/* Registration of the expiration-walk process node. */
VLIB_REGISTER_NODE (ip4_reass_expire_node, static) = {
    .function = ip4_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-reassembly-expire-walk",
    .format_trace = format_ip4_reass_trace,
    .n_errors = ARRAY_LEN (ip4_reassembly_error_strings),
    .error_strings = ip4_reassembly_error_strings,

};
/* *INDENT-ON* */
1542
1543 static u8 *
1544 format_ip4_reass_key (u8 * s, va_list * args)
1545 {
1546   ip4_reass_key_t *key = va_arg (*args, ip4_reass_key_t *);
1547   s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1548               key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1549               &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1550   return s;
1551 }
1552
1553 static u8 *
1554 format_ip4_reass (u8 * s, va_list * args)
1555 {
1556   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1557   ip4_reass_t *reass = va_arg (*args, ip4_reass_t *);
1558
1559   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1560               "last_packet_octet: %u, trace_op_counter: %u\n",
1561               reass->id, format_ip4_reass_key, &reass->key, reass->first_bi,
1562               reass->data_len, reass->last_packet_octet,
1563               reass->trace_op_counter);
1564   u32 bi = reass->first_bi;
1565   u32 counter = 0;
1566   while (~0 != bi)
1567     {
1568       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1569       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1570       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1571                   "fragment[%u, %u]\n",
1572                   counter, vnb->ip.reass.range_first,
1573                   vnb->ip.reass.range_last, bi,
1574                   ip4_reass_buffer_get_data_offset (b),
1575                   ip4_reass_buffer_get_data_len (b),
1576                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1577       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1578         {
1579           bi = b->next_buffer;
1580         }
1581       else
1582         {
1583           bi = ~0;
1584         }
1585     }
1586   return s;
1587 }
1588
/**
 * @brief CLI handler for "show ip4-reassembly [details]".
 *
 * Prints the per-thread reassembly totals; with "details", also dumps
 * every active reassembly context. Each thread's pool is read under its
 * own lock.
 *
 * @return NULL (no CLI error paths)
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  return 0;
}
1632
/* *INDENT-OFF* */
/* CLI command registration for "show ip4-reassembly". */
VLIB_CLI_COMMAND (show_ip4_reassembly_cmd, static) = {
    .path = "show ip4-reassembly",
    .short_help = "show ip4-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1640
1641 #ifndef CLIB_MARCH_VARIANT
/**
 * @brief Enable or disable the reassembly feature on an interface.
 *
 * @param sw_if_index     interface to (un)configure
 * @param enable_disable  non-zero to enable, zero to disable
 * @return result of vnet_feature_enable_disable()
 */
vnet_api_error_t
ip4_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-reassembly-feature", sw_if_index,
                                      enable_disable, 0, 0);
}
1649 #endif /* CLIB_MARCH_VARIANT */
1650
1651
/* Error counters for the handoff nodes; only drop-on-congestion exists. */
#define foreach_ip4_reassembly_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_reassembly_handoff_error
#undef _
    IP4_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_reassembly_handoff_error_t;

/* Counter strings matching ip4_reassembly_handoff_error_t. */
static char *ip4_reassembly_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_reassembly_handoff_error
#undef _
};
1669
/* Trace record for the handoff nodes. */
typedef struct
{
  u32 next_worker_index;        /* thread the buffer was handed off to */
} ip4_reassembly_handoff_trace_t;
1674
1675 static u8 *
1676 format_ip4_reassembly_handoff_trace (u8 * s, va_list * args)
1677 {
1678   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1679   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1680   ip4_reassembly_handoff_trace_t *t =
1681     va_arg (*args, ip4_reassembly_handoff_trace_t *);
1682
1683   s =
1684     format (s, "ip4-reassembly-handoff: next-worker %d",
1685             t->next_worker_index);
1686
1687   return s;
1688 }
1689
/**
 * @brief Shared body of the reassembly handoff nodes.
 *
 * For every buffer in the frame, reads the owner thread index previously
 * stored in the buffer opaque (feature or non-feature field depending on
 * the variant), optionally records a trace, then enqueues the whole frame
 * to the per-thread frame queue. Buffers that do not fit are counted as
 * congestion drops.
 *
 * @param vm         vlib main for the current thread
 * @param node       this node's runtime
 * @param frame      frame of buffer indices to hand off
 * @param is_feature selects the feature-arc frame queue and opaque field
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_reassembly_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_reass_main_t *rm = &ip4_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* collect the destination thread index for each buffer */
  while (n_left_from > 0)
    {
      ti[0] =
        (is_feature) ? vnet_buffer (b[0])->ip.
        reass.owner_feature_thread_index : vnet_buffer (b[0])->ip.
        reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_reassembly_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* enqueue the whole frame to the per-thread queues in one call */
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything that did not fit was dropped by the enqueue - count it */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1741
1742 VLIB_NODE_FN (ip4_reassembly_handoff_node) (vlib_main_t * vm,
1743                                             vlib_node_runtime_t * node,
1744                                             vlib_frame_t * frame)
1745 {
1746   return ip4_reassembly_handoff_node_inline (vm, node, frame,
1747                                              false /* is_feature */ );
1748 }
1749
1750
/* *INDENT-OFF* */
/* Graph-node registration for the non-feature handoff path.  The node
 * forwards everything via vlib_buffer_enqueue_to_thread; its only
 * explicit next node is error-drop for congestion drops. */
VLIB_REGISTER_NODE (ip4_reassembly_handoff_node) = {
  .name = "ip4-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1766
1767
1768 /* *INDENT-OFF* */
1769 VLIB_NODE_FN (ip4_reassembly_feature_handoff_node) (vlib_main_t * vm,
1770                                                     vlib_node_runtime_t *
1771                                                     node,
1772                                                     vlib_frame_t * frame)
1773 {
1774   return ip4_reassembly_handoff_node_inline (vm, node, frame,
1775                                              true /* is_feature */ );
1776 }
1777 /* *INDENT-ON* */
1778
1779
/* *INDENT-OFF* */
/* Graph-node registration for the feature-arc handoff path; mirrors
 * ip4-reassembly-handoff but traffic enters via the ip4-unicast
 * feature arc.  Only explicit next node is error-drop for congestion
 * drops. */
VLIB_REGISTER_NODE (ip4_reassembly_feature_handoff_node) = {
  .name = "ip4-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_reassembly_handoff_error_strings),
  .error_strings = ip4_reassembly_handoff_error_strings,
  .format_trace = format_ip4_reassembly_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1795
1796 /*
1797  * fd.io coding-style-patch-verification: ON
1798  *
1799  * Local Variables:
1800  * eval: (c-set-style "gnu")
1801  * End:
1802  */