ip: more detailed show reassembly commands
[vpp.git] / src / vnet / ip / reass / ip4_full_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Full Reassembly.
19  *
20  * This file contains the source code for IPv4 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/fifo.h>
27 #include <vppinfra/bihash_16_8.h>
28 #include <vnet/ip/reass/ip4_full_reass.h>
29 #include <stddef.h>
30
#define MSEC_PER_SEC 1000
// default idle time before an incomplete reassembly is expired
// (enforced in ip4_full_reass_find_or_create via last_heard + timeout)
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
// default cap on concurrent reassemblies (checked per thread against
// rt->reass_n in ip4_full_reass_find_or_create)
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
// default cap on number of fragments per reassembly
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
// target load factor for the reassembly bihash
#define IP4_REASS_HT_LOAD_FACTOR (0.75)
37
// Compile-time debug switch: when IP4_REASS_DEBUG_BUFFERS is non-zero,
// IP4_REASS_DEBUG_BUFFER(bi, what) prints the full chain of buffer
// indices starting at bi to stdout; otherwise it expands to nothing.
// Relies on a variable named 'vm' being in scope at the expansion site.
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
// Return codes for internal reassembly operations
typedef enum
{
  IP4_REASS_RC_OK,		   // operation succeeded
  IP4_REASS_RC_TOO_MANY_FRAGMENTS, // fragment limit reached for a reassembly
  IP4_REASS_RC_INTERNAL_ERROR,	   // an internal invariant was violated
  IP4_REASS_RC_NO_BUF,		   // buffer allocation failed (e.g. linearize)
  IP4_REASS_RC_HANDOFF,		   // fragment belongs to another thread
} ip4_full_reass_rc_t;
68
// Bihash key identifying one reassembly: the IPv4 fragment tuple
// (src, dst, frag_id, proto) plus a context id. The as_u64 view is the
// flat representation used for bihash operations.
typedef struct
{
  union
  {
    struct
    {
      u32 xx_id;		// presumably fib/interface context id - TODO confirm
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];		// flat 16-byte view for the bihash
  };
} ip4_full_reass_key_t;
85
// Bihash value locating a reassembly context
typedef union
{
  struct
  {
    u32 reass_index;		   // index into the owning thread's pool
    u32 memory_owner_thread_index; // thread whose pool holds the context
  };
  u64 as_u64;
} ip4_full_reass_val_t;
95
// Combined key/value pair as stored in (and returned by) the bihash
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106 always_inline u32
107 ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108 {
109   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111 }
112
113 always_inline u16
114 ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115 {
116   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118     (vnb->ip.reass.fragment_first +
119      ip4_full_reass_buffer_get_data_offset (b)) + 1;
120 }
121
// Context describing one in-progress reassembly (one fragmented datagram)
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
// Per-worker-thread reassembly state
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_full_reass_t *pool;
  // number of contexts currently allocated from the pool
  u32 reass_n;
  // source of per-thread-unique reassembly ids
  u32 id_counter;
  // lock protecting this thread's state
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
160
// Global reassembly state: configuration, lookup hash, per-thread pools
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout in seconds (derived from timeout_ms)
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;
} ip4_full_reass_main_t;
191
192 extern ip4_full_reass_main_t ip4_full_reass_main;
193
194 #ifndef CLIB_MARCH_VARIANT
195 ip4_full_reass_main_t ip4_full_reass_main;
196 #endif /* CLIB_MARCH_VARIANT */
197
// Next-node indices used by the reassembly graph nodes
typedef enum
{
  IP4_FULL_REASS_NEXT_INPUT,	// continue regular IPv4 processing
  IP4_FULL_REASS_NEXT_DROP,	// drop the buffer
  IP4_FULL_REASS_NEXT_HANDOFF,	// hand the buffer to the owning thread
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
205
// Operations recorded in packet traces (see format_ip4_full_reass_trace)
typedef enum
{
  RANGE_NEW,	  // a new fragment range was inserted into the chain
  RANGE_SHRINK,	  // an existing range was shrunk by an overlap
  RANGE_DISCARD,  // a range was removed from the chain and freed
  RANGE_OVERLAP,  // fragment fully overlapped an existing range; ignored
  FINALIZE,	  // reassembly completed
  HANDOFF,	  // buffer handed off to another thread
} ip4_full_reass_trace_operation_e;
215
// Snapshot of a single fragment range, captured for tracing
typedef struct
{
  u16 range_first;   // first data octet covered by the range
  u16 range_last;    // last data octet covered by the range
  u32 range_bi;	     // buffer index heading the range
  i32 data_offset;   // offset of usable data within the fragment
  u32 data_len;	     // number of usable octets in the range
  u32 first_bi;	     // first buffer index of the whole reassembly
} ip4_full_reass_range_trace_t;
225
// Per-packet trace record produced by ip4_full_reass_add_trace
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  u32 reass_id;			// reassembly id, ~0 when no context exists
  ip4_full_reass_range_trace_t trace_range;
  u32 size_diff;		// octets trimmed (RANGE_SHRINK)
  u32 op_id;			// operation sequence number in the reassembly
  u32 thread_id;		// thread that recorded the trace
  u32 thread_id_to;		// destination thread (HANDOFF)
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
  bool is_after_handoff;	// buffer was first traced on another thread
  ip4_header_t ip4_header;	// header copy, valid when is_after_handoff
} ip4_full_reass_trace_t;
241
242 extern vlib_node_registration_t ip4_full_reass_node;
243 extern vlib_node_registration_t ip4_full_reass_node_feature;
244
245 static void
246 ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
247                               ip4_full_reass_range_trace_t * trace)
248 {
249   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
250   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
251   trace->range_first = vnb->ip.reass.range_first;
252   trace->range_last = vnb->ip.reass.range_last;
253   trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
254   trace->data_len = ip4_full_reass_buffer_get_data_len (b);
255   trace->range_bi = bi;
256 }
257
258 static u8 *
259 format_ip4_full_reass_range_trace (u8 * s, va_list * args)
260 {
261   ip4_full_reass_range_trace_t *trace =
262     va_arg (*args, ip4_full_reass_range_trace_t *);
263   s =
264     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
265             trace->range_last, trace->data_offset, trace->data_len,
266             trace->range_bi);
267   return s;
268 }
269
// format() callback rendering one ip4_full_reass_trace_t record
static u8 *
format_ip4_full_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
  u32 indent = 0;
  // ~0 reass_id means the trace was recorded without a reassembly context
  if (~0 != t->reass_id)
    {
      if (t->is_after_handoff)
	{
	  // print the saved header copy first, indent the rest under it
	  s =
	    format (s, "%U\n", format_ip4_header, &t->ip4_header,
		    sizeof (t->ip4_header));
	  indent = 2;
	}
      s =
	format (s, "%Ureass id: %u, op id: %u, ", format_white_space, indent,
		t->reass_id, t->op_id);
      indent = format_get_indent (s);
      s =
	format (s,
		"first bi: %u, data len: %u, ip/fragment[%u, %u]",
		t->trace_range.first_bi, t->total_data_len, t->fragment_first,
		t->fragment_last);
    }
  // one extra line describing the specific operation
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range,
		  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
	format (s, "handoff from thread #%u to thread #%u", t->thread_id,
		t->thread_id_to);
      break;
    }
  return s;
}
326
// Record a trace entry for buffer bi describing the given operation.
// size_diff is only meaningful for RANGE_SHRINK, thread_id_to only for
// HANDOFF (~0 otherwise); reass may be NULL when no context exists.
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
                          ip4_full_reass_main_t * rm,
                          ip4_full_reass_t * reass, u32 bi,
                          ip4_full_reass_trace_operation_e action,
                          u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  bool is_after_handoff = false;
  // the buffer was originally traced on a different thread, so this
  // trace entry comes after a handoff
  if (vlib_buffer_get_trace_thread (b) != vm->thread_index)
    {
      is_after_handoff = true;
    }
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->is_after_handoff = is_after_handoff;
  if (t->is_after_handoff)
    {
      // keep a copy of the (possibly partial) header so the trace can
      // still print it later
      clib_memcpy (&t->ip4_header, vlib_buffer_get_current (b),
                   clib_min (sizeof (t->ip4_header), b->current_length));
    }
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      ++reass->trace_op_counter;
    }
  else
    {
      // no reassembly context - fill in sentinel values
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
378
379 always_inline void
380 ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
381                          ip4_full_reass_t * reass)
382 {
383   pool_put (rt->pool, reass);
384   --rt->reass_n;
385 }
386
387 always_inline void
388 ip4_full_reass_free (ip4_full_reass_main_t * rm,
389                      ip4_full_reass_per_thread_t * rt,
390                      ip4_full_reass_t * reass)
391 {
392   clib_bihash_kv_16_8_t kv;
393   kv.key[0] = reass->key.as_u64[0];
394   kv.key[1] = reass->key.as_u64[1];
395   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
396   return ip4_full_reass_free_ctx (rt, reass);
397 }
398
399 always_inline void
400 ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
401                          ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
402 {
403   u32 range_bi = reass->first_bi;
404   vlib_buffer_t *range_b;
405   vnet_buffer_opaque_t *range_vnb;
406   u32 *to_free = NULL;
407   while (~0 != range_bi)
408     {
409       range_b = vlib_get_buffer (vm, range_bi);
410       range_vnb = vnet_buffer (range_b);
411       u32 bi = range_bi;
412       while (~0 != bi)
413         {
414           vec_add1 (to_free, bi);
415           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
416           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
417             {
418               bi = b->next_buffer;
419               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
420             }
421           else
422             {
423               bi = ~0;
424             }
425         }
426       range_bi = range_vnb->ip.reass.next_range_bi;
427     }
428   /* send to next_error_index */
429   if (~0 != reass->error_next_index)
430     {
431       u32 n_left_to_next, *to_next, next_index;
432
433       next_index = reass->error_next_index;
434       u32 bi = ~0;
435
436       while (vec_len (to_free) > 0)
437         {
438           vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
439
440           while (vec_len (to_free) > 0 && n_left_to_next > 0)
441             {
442               bi = vec_pop (to_free);
443
444               if (~0 != bi)
445                 {
446                   to_next[0] = bi;
447                   to_next += 1;
448                   n_left_to_next -= 1;
449                 }
450             }
451           vlib_put_next_frame (vm, node, next_index, n_left_to_next);
452         }
453     }
454   else
455     {
456       vlib_buffer_free (vm, to_free, vec_len (to_free));
457     }
458 }
459
460 always_inline void
461 ip4_full_reass_init (ip4_full_reass_t * reass)
462 {
463   reass->first_bi = ~0;
464   reass->last_packet_octet = ~0;
465   reass->data_len = 0;
466   reass->next_index = ~0;
467   reass->error_next_index = ~0;
468 }
469
// Look up the reassembly context for the key in *kv, or create one.
// Sets *do_handoff (and returns the context) when the context is owned
// by a different thread. Returns NULL when the per-thread limit is hit
// or a concurrent bihash insert could not be resolved.
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
                               ip4_full_reass_main_t * rm,
                               ip4_full_reass_per_thread_t * rt,
                               ip4_full_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      reass =
        pool_elt_at_index (rm->per_thread_data
                           [kv->v.memory_owner_thread_index].pool,
                           kv->v.reass_index);
      // context lives on another thread - caller must hand the buffer off
      if (vm->thread_index != reass->memory_owner_thread_index)
        {
          *do_handoff = 1;
          return reass;
        }

      // expire a stale context; a fresh one is created below
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_full_reass_drop_all (vm, node, rm, reass);
          ip4_full_reass_free (rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  // this thread is at its reassembly limit - refuse to create a new one
  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      // ids stay unique across threads: thread index scales the base
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  // is_add == 2: add-without-overwrite - presumably returns -2 when the
  // key already exists (see comment below) - TODO confirm against bihash docs
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
545
// Stitch the collected fragment ranges into one buffer chain, trim
// per-fragment IP headers and overlap, fix up the first fragment's IPv4
// header (length, checksum, fragment bits cleared) and release the
// reassembly context. On success *bi0 is the reassembled packet's buffer
// index, *next0 the next node and *error0 IP4_ERROR_NONE.
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  // outer loop: one iteration per fragment range in the chain
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      // sanity-check the range invariants before touching the buffers
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      // octets to keep, and how much to cut from the front (per-fragment
      // IP header plus trimmed overlap) and from the back of this range
      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // inner loop: walk the buffers making up this range
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // append this buffer to the reassembled chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // last useful buffer of the range - truncate to fit
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              // trailing buffers past the kept data - free them
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  // total_length counted every buffer incl. the first; convert it to
  // "length not including first buffer" as vlib expects
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the header: no fragment bits/offset, full length, checksum
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
                                FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      // custom apps get the next index they registered
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
749
// Link the range headed by new_next_bi into the reassembly's range chain
// right after prev_range_bi (or at the head when prev_range_bi is ~0),
// and add its usable data length to the running total.
always_inline ip4_full_reass_rc_t
ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
                                      ip4_full_reass_main_t * rm,
                                      ip4_full_reass_per_thread_t * rt,
                                      ip4_full_reass_t * reass,
                                      u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      // splice between prev and prev's old successor
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      // insert at the head of the chain
      if (~0 != reass->first_bi)
        {
          new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
        }
      reass->first_bi = new_next_bi;
    }
  // sanity-check the range invariants before accounting the data
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
783
// Unlink the range headed by discard_bi from the chain (prev_range_bi is
// its predecessor, or ~0 when it is the head), subtract its data length
// from the total, and free every buffer belonging to the range.
always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
                                        vlib_node_runtime_t * node,
                                        ip4_full_reass_main_t * rm,
                                        ip4_full_reass_t * reass,
                                        u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      // caller must pass the true predecessor of discard_bi
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  // sanity-check the range invariants before un-accounting the data
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  // free the discarded range's buffers one by one, tracing each
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
                                    RANGE_DISCARD, 0, ~0);
        }
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
        {
          discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
          discard_bi = discard_b->next_buffer;
          discard_b->next_buffer = 0;
          discard_b = vlib_get_buffer (vm, discard_bi);
          vlib_buffer_free_one (vm, to_be_freed_bi);
        }
      else
        {
          discard_b->next_buffer = 0;
          vlib_buffer_free_one (vm, to_be_freed_bi);
          break;
        }
    }
  return IP4_REASS_RC_OK;
}
839
840 always_inline ip4_full_reass_rc_t
841 ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
842                        ip4_full_reass_main_t * rm,
843                        ip4_full_reass_per_thread_t * rt,
844                        ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
845                        u32 * error0, bool is_custom_app,
846                        u32 * handoff_thread_idx)
847 {
848   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
849   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
850   if (is_custom_app)
851     {
852       // store (error_)next_index before it's overwritten
853       reass->next_index = fvnb->ip.reass.next_index;
854       reass->error_next_index = fvnb->ip.reass.error_next_index;
855     }
856   ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
857   int consumed = 0;
858   ip4_header_t *fip = vlib_buffer_get_current (fb);
859   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
860   const u32 fragment_length =
861     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
862   const u32 fragment_last = fragment_first + fragment_length - 1;
863   fvnb->ip.reass.fragment_first = fragment_first;
864   fvnb->ip.reass.fragment_last = fragment_last;
865   int more_fragments = ip4_get_fragment_more (fip);
866   u32 candidate_range_bi = reass->first_bi;
867   u32 prev_range_bi = ~0;
868   fvnb->ip.reass.range_first = fragment_first;
869   fvnb->ip.reass.range_last = fragment_last;
870   fvnb->ip.reass.next_range_bi = ~0;
871   if (!more_fragments)
872     {
873       reass->last_packet_octet = fragment_last;
874     }
875   if (~0 == reass->first_bi)
876     {
877       // starting a new reassembly
878       rc =
879         ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
880                                               prev_range_bi, *bi0);
881       if (IP4_REASS_RC_OK != rc)
882         {
883           return rc;
884         }
885       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
886         {
887           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
888                                     ~0);
889         }
890       *bi0 = ~0;
891       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
892       reass->fragments_n = 1;
893       return IP4_REASS_RC_OK;
894     }
895   reass->min_fragment_length =
896     clib_min (clib_net_to_host_u16 (fip->length),
897               fvnb->ip.reass.estimated_mtu);
898   while (~0 != candidate_range_bi)
899     {
900       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
901       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
902       if (fragment_first > candidate_vnb->ip.reass.range_last)
903         {
904           // this fragments starts after candidate range
905           prev_range_bi = candidate_range_bi;
906           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
907           if (candidate_vnb->ip.reass.range_last < fragment_last &&
908               ~0 == candidate_range_bi)
909             {
910               // special case - this fragment falls beyond all known ranges
911               rc =
912                 ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
913                                                       prev_range_bi, *bi0);
914               if (IP4_REASS_RC_OK != rc)
915                 {
916                   return rc;
917                 }
918               consumed = 1;
919               break;
920             }
921           continue;
922         }
923       if (fragment_last < candidate_vnb->ip.reass.range_first)
924         {
925           // this fragment ends before candidate range without any overlap
926           rc =
927             ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
928                                                   prev_range_bi, *bi0);
929           if (IP4_REASS_RC_OK != rc)
930             {
931               return rc;
932             }
933           consumed = 1;
934         }
935       else
936         {
937           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
938               fragment_last <= candidate_vnb->ip.reass.range_last)
939             {
940               // this fragment is a (sub)part of existing range, ignore it
941               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
942                 {
943                   ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
944                                             RANGE_OVERLAP, 0, ~0);
945                 }
946               break;
947             }
948           int discard_candidate = 0;
949           if (fragment_first < candidate_vnb->ip.reass.range_first)
950             {
951               u32 overlap =
952                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
953               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
954                 {
955                   candidate_vnb->ip.reass.range_first += overlap;
956                   if (reass->data_len < overlap)
957                     {
958                       return IP4_REASS_RC_INTERNAL_ERROR;
959                     }
960                   reass->data_len -= overlap;
961                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
962                     {
963                       ip4_full_reass_add_trace (vm, node, rm, reass,
964                                                 candidate_range_bi,
965                                                 RANGE_SHRINK, 0, ~0);
966                     }
967                   rc =
968                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
969                                                           prev_range_bi,
970                                                           *bi0);
971                   if (IP4_REASS_RC_OK != rc)
972                     {
973                       return rc;
974                     }
975                   consumed = 1;
976                 }
977               else
978                 {
979                   discard_candidate = 1;
980                 }
981             }
982           else if (fragment_last > candidate_vnb->ip.reass.range_last)
983             {
984               u32 overlap =
985                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
986               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
987                 {
988                   fvnb->ip.reass.range_first += overlap;
989                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
990                     {
991                       prev_range_bi = candidate_range_bi;
992                       candidate_range_bi =
993                         candidate_vnb->ip.reass.next_range_bi;
994                       continue;
995                     }
996                   else
997                     {
998                       // special case - last range discarded
999                       rc =
1000                         ip4_full_reass_insert_range_in_chain (vm, rm, rt,
1001                                                               reass,
1002                                                               candidate_range_bi,
1003                                                               *bi0);
1004                       if (IP4_REASS_RC_OK != rc)
1005                         {
1006                           return rc;
1007                         }
1008                       consumed = 1;
1009                     }
1010                 }
1011               else
1012                 {
1013                   discard_candidate = 1;
1014                 }
1015             }
1016           else
1017             {
1018               discard_candidate = 1;
1019             }
1020           if (discard_candidate)
1021             {
1022               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
1023               // discard candidate range, probe next range
1024               rc =
1025                 ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
1026                                                         prev_range_bi,
1027                                                         candidate_range_bi);
1028               if (IP4_REASS_RC_OK != rc)
1029                 {
1030                   return rc;
1031                 }
1032               if (~0 != next_range_bi)
1033                 {
1034                   candidate_range_bi = next_range_bi;
1035                   continue;
1036                 }
1037               else
1038                 {
1039                   // special case - last range discarded
1040                   rc =
1041                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
1042                                                           prev_range_bi,
1043                                                           *bi0);
1044                   if (IP4_REASS_RC_OK != rc)
1045                     {
1046                       return rc;
1047                     }
1048                   consumed = 1;
1049                 }
1050             }
1051         }
1052       break;
1053     }
1054   ++reass->fragments_n;
1055   if (consumed)
1056     {
1057       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1058         {
1059           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
1060                                     ~0);
1061         }
1062     }
1063   if (~0 != reass->last_packet_octet &&
1064       reass->data_len == reass->last_packet_octet + 1)
1065     {
1066       *handoff_thread_idx = reass->sendout_thread_index;
1067       int handoff =
1068         reass->memory_owner_thread_index != reass->sendout_thread_index;
1069       rc =
1070         ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1071                                  is_custom_app);
1072       if (IP4_REASS_RC_OK == rc && handoff)
1073         {
1074           rc = IP4_REASS_RC_HANDOFF;
1075         }
1076     }
1077   else
1078     {
1079       if (consumed)
1080         {
1081           *bi0 = ~0;
1082           if (reass->fragments_n > rm->max_reass_len)
1083             {
1084               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1085             }
1086         }
1087       else
1088         {
1089           *next0 = IP4_FULL_REASS_NEXT_DROP;
1090           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1091         }
1092     }
1093   return rc;
1094 }
1095
/**
 * @brief Main loop shared by all ip4 full reassembly node variants.
 *
 * For each buffer in the frame:
 *  - whole (unfragmented) packets bypass reassembly entirely,
 *  - malformed fragments are dropped,
 *  - valid fragments look up (or create) a reassembly context keyed by
 *    fib index, src/dst address, fragment id and protocol, and are folded
 *    in via ip4_full_reass_update().
 * Buffers may be redirected to the handoff node when the reassembly
 * context is owned by another thread.  All per-thread reassembly state is
 * guarded by rt->lock for the duration of the frame.
 *
 * @param vm vlib main
 * @param node this node's runtime
 * @param frame frame of buffer indices to process
 * @param is_feature true when running as an ip4-unicast feature node
 *                   (forward via vnet_feature_next on success)
 * @param is_custom_app true when the next index is supplied by the
 *                      application in the buffer's opaque data
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                       vlib_frame_t * frame, bool is_feature,
                       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* single lock held across the whole frame - released before return */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a whole packet - no fragmentation
              if (!is_custom_app)
                {
                  next0 = IP4_FULL_REASS_NEXT_INPUT;
                }
              else
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              goto packet_enqueue;
            }
          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
          /* sanity checks: offset/length must fit in a 16-bit datagram and
           * non-last fragments must carry at least 8 bytes of payload */
          if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
            {
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              goto packet_enqueue;
            }
          ip4_full_reass_kv_t kv;
          u8 do_handoff = 0;

          /* 16-byte lookup key: fib index + src address in word 0,
           * dst address + fragment id + protocol in word 1 */
          kv.k.as_u64[0] =
            (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                           vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
            (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] =
            (u64) ip0->dst_address.
            as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

          ip4_full_reass_t *reass =
            ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
                                           &do_handoff);

          if (reass)
            {
              const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
              /* the thread that saw the first fragment sends the result out */
              if (0 == fragment_first)
                {
                  reass->sendout_thread_index = vm->thread_index;
                }
            }

          if (PREDICT_FALSE (do_handoff))
            {
              next0 = IP4_FULL_REASS_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.memory_owner_thread_index;
            }
          else if (reass)
            {
              u32 handoff_thread_idx;
              switch (ip4_full_reass_update
                      (vm, node, rm, rt, reass, &bi0, &next0,
                       &error0, is_custom_app, &handoff_thread_idx))
                {
                case IP4_REASS_RC_OK:
                  /* nothing to do here */
                  break;
                case IP4_REASS_RC_HANDOFF:
                  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
                  b0 = vlib_get_buffer (vm, bi0);
                  vnet_buffer (b0)->ip.reass.owner_thread_index =
                    handoff_thread_idx;
                  break;
                case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                               1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                case IP4_REASS_RC_NO_BUF:
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_NO_BUF, 1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                case IP4_REASS_RC_INTERNAL_ERROR:
                  /* drop everything and start with a clean slate */
                  vlib_node_increment_counter (vm, node->node_index,
                                               IP4_ERROR_REASS_INTERNAL_ERROR,
                                               1);
                  ip4_full_reass_drop_all (vm, node, rm, reass);
                  ip4_full_reass_free (rm, rt, reass);
                  goto next_packet;
                  break;
                }
            }
          else
            {
              /* no context available - reassembly limit reached */
              next0 = IP4_FULL_REASS_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
            }


        packet_enqueue:

          /* bi0 == ~0 means the fragment was consumed by the reassembly
           * and there is nothing to enqueue for this iteration */
          if (bi0 != ~0)
            {
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;

              /* bi0 might have been updated by reass_finalize, reload */
              b0 = vlib_get_buffer (vm, bi0);
              b0->error = node->errors[error0];

              if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
                {
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_full_reass_add_trace (vm, node, rm, NULL, bi0,
                                                HANDOFF, 0,
                                                vnet_buffer (b0)->ip.
                                                reass.owner_thread_index);
                    }
                }
              else if (is_feature && IP4_ERROR_NONE == error0)
                {
                  vnet_feature_next (&next0, b0);
                }
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
              IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
            }

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1269
/* per-error-counter description strings, generated from foreach_ip4_error
 * so indices line up with the ip4 error enum */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1275
1276 VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
1277                                     vlib_node_runtime_t * node,
1278                                     vlib_frame_t * frame)
1279 {
1280   return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
1281                                 false /* is_custom_app */ );
1282 }
1283
/* graph node registration for the plain reassembly variant */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1301
1302 VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
1303                                             vlib_node_runtime_t * node,
1304                                             vlib_frame_t * frame)
1305 {
1306   return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
1307                                 false /* is_custom_app */ );
1308 }
1309
/* graph node registration for the feature-arc reassembly variant */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1326
/* hook the feature node into the ip4-unicast arc, ahead of lookup and
 * ipsec input so those features always see whole packets */
/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1336
1337 #ifndef CLIB_MARCH_VARIANT
1338 always_inline u32
1339 ip4_full_reass_get_nbuckets ()
1340 {
1341   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1342   u32 nbuckets;
1343   u8 i;
1344
1345   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1346
1347   for (i = 0; i < 31; i++)
1348     if ((1 << i) >= nbuckets)
1349       break;
1350   nbuckets = 1 << i;
1351
1352   return nbuckets;
1353 }
1354 #endif /* CLIB_MARCH_VARIANT */
1355
/* events understood by the expire-walk process node */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;
1360
/* context passed through the bihash walk while copying entries into a
 * resized hash table */
typedef struct
{
  int failure;                  /* set to 1 if any insert into new_hash fails */
  clib_bihash_16_8_t *new_hash; /* destination table being populated */
} ip4_rehash_cb_ctx;
1366
1367 #ifndef CLIB_MARCH_VARIANT
1368 static int
1369 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1370 {
1371   ip4_rehash_cb_ctx *ctx = _ctx;
1372   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1373     {
1374       ctx->failure = 1;
1375     }
1376   return (BIHASH_WALK_CONTINUE);
1377 }
1378
1379 static void
1380 ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1381                            u32 max_reassembly_length,
1382                            u32 expire_walk_interval_ms)
1383 {
1384   ip4_full_reass_main.timeout_ms = timeout_ms;
1385   ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1386   ip4_full_reass_main.max_reass_n = max_reassemblies;
1387   ip4_full_reass_main.max_reass_len = max_reassembly_length;
1388   ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1389 }
1390
/**
 * @brief API handler: apply new reassembly parameters.
 *
 * Stores the parameters, wakes the expire-walk process so it picks up a
 * changed walk interval, and - if the configured maximum grew enough to
 * need more buckets - rebuilds the lookup hash by copying every entry
 * into a freshly sized table.
 *
 * @return 0 on success, -1 if rehashing failed (old hash kept intact)
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
                    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
                             max_reassembly_length, expire_walk_interval_ms);
  /* nudge the expire-walk process so a new walk interval takes effect */
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
                             ip4_full_reass_main.ip4_full_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy all live entries into the bigger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* keep the old hash on failure */
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          /* swap in the new table and tell bihash it was moved */
          clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
          clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
                            sizeof (ip4_full_reass_main.hash));
          clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1428
1429 vnet_api_error_t
1430 ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1431                     u32 * max_reassembly_length,
1432                     u32 * expire_walk_interval_ms)
1433 {
1434   *timeout_ms = ip4_full_reass_main.timeout_ms;
1435   *max_reassemblies = ip4_full_reass_main.max_reass_n;
1436   *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1437   *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1438   return 0;
1439 }
1440
/**
 * @brief Plugin/module init: set defaults and allocate per-thread state.
 *
 * Creates one per-thread pool (workers + main), records the node indices
 * used later for event signalling and drops, installs the default
 * parameters, sizes the lookup hash accordingly and sets up the handoff
 * frame queues.
 *
 * @return 0 on success (errors are not currently generated)
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one entry per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  /* defaults must be set before sizing the hash below */
  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
                             IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
                             IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                             IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes */
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  return error;
}

VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1484 #endif /* CLIB_MARCH_VARIANT */
1485
/**
 * @brief Process node: periodically drop timed-out reassemblies.
 *
 * Sleeps for the configured walk interval (or until a config-changed
 * event arrives), then walks every per-thread pool under its lock,
 * collects the indices of reassemblies whose last_heard is older than
 * the timeout, and drops/frees them.  Never returns.
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
                             vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          /* wake-up only; new interval is used on the next wait above */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_full_reass_per_thread_t *rt =
            &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect first, free second - pool_foreach_index must not see
           * the pool being modified underneath it */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, rm, reass);
            ip4_full_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          /* reuse the event vector on the next iteration */
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1558
/* process-node registration for the periodic expiry walk */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1570
1571 static u8 *
1572 format_ip4_full_reass_key (u8 * s, va_list * args)
1573 {
1574   ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1575   s =
1576     format (s,
1577             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1578             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1579             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1580   return s;
1581 }
1582
1583 static u8 *
1584 format_ip4_reass (u8 * s, va_list * args)
1585 {
1586   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1587   ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1588
1589   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1590               "last_packet_octet: %u, trace_op_counter: %u\n",
1591               reass->id, format_ip4_full_reass_key, &reass->key,
1592               reass->first_bi, reass->data_len,
1593               reass->last_packet_octet, reass->trace_op_counter);
1594
1595   u32 bi = reass->first_bi;
1596   u32 counter = 0;
1597   while (~0 != bi)
1598     {
1599       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1600       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1601       s =
1602         format (s,
1603                 "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1604                 "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1605                 vnb->ip.reass.range_last, bi,
1606                 ip4_full_reass_buffer_get_data_offset (b),
1607                 ip4_full_reass_buffer_get_data_len (b),
1608                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1609       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1610         {
1611           bi = b->next_buffer;
1612         }
1613       else
1614         {
1615           bi = ~0;
1616         }
1617     }
1618   return s;
1619 }
1620
/**
 * @brief CLI handler for "show ip4-full-reassembly [details]".
 *
 * Prints the configured limits and the total number of in-progress
 * reassemblies summed over all threads; with "details" also dumps every
 * reassembly context via format_ip4_reass.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_full_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  /* per-thread pools are guarded by per-thread locks */
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current full IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent full IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured full IP4 reassembly timeout: %lums\n",
                   (long unsigned) rm->timeout_ms);
  vlib_cli_output (vm,
                   "Maximum configured full IP4 reassembly expire walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
  return 0;
}
1670
/* CLI command registration for the status/details display above */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1678
1679 #ifndef CLIB_MARCH_VARIANT
1680 vnet_api_error_t
1681 ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1682 {
1683   return vnet_feature_enable_disable ("ip4-unicast",
1684                                       "ip4-full-reassembly-feature",
1685                                       sw_if_index, enable_disable, 0, 0);
1686 }
1687 #endif /* CLIB_MARCH_VARIANT */
1688
1689
/* error counters specific to the handoff nodes */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")
1692
1693
/* handoff error enum, generated from the macro above */
typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;
1701
/* handoff error description strings, indices match the enum above */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1707
/* trace record captured by the handoff nodes */
typedef struct
{
  u32 next_worker_index;        /* thread the buffer was handed off to */
} ip4_full_reass_handoff_trace_t;
1712
1713 static u8 *
1714 format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1715 {
1716   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1717   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1718   ip4_full_reass_handoff_trace_t *t =
1719     va_arg (*args, ip4_full_reass_handoff_trace_t *);
1720
1721   s =
1722     format (s, "ip4-full-reassembly-handoff: next-worker %d",
1723             t->next_worker_index);
1724
1725   return s;
1726 }
1727
/**
 * @brief Handoff node main loop, shared by feature and non-feature
 * variants.
 *
 * Collects each buffer's owner thread index (set by the reassembly node)
 * into a parallel array, optionally records a trace entry, and enqueues
 * the whole frame to the owning threads via the appropriate frame queue.
 * Buffers that cannot be enqueued are counted as congestion drops.
 *
 * @param vm vlib main
 * @param node this node's runtime
 * @param frame frame of buffer indices to hand off
 * @param is_feature selects the feature frame queue over the plain one
 * @return number of vectors processed
 */
always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame, bool is_feature)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      /* destination thread was chosen by the reassembly node */
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_full_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
                                   frame->n_vectors, 1);

  /* anything not enqueued was dropped due to congestion */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1776
/* Entry point for the non-feature (custom next-node) handoff variant. */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
					     false /* is_feature */ );
}
1784
1785
/* *INDENT-OFF* */
/* Node registration for the non-feature handoff variant. */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1801
1802
/* *INDENT-OFF* */
/* Entry point for the feature-arc handoff variant. */
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1813
1814
/* *INDENT-OFF* */
/* Node registration for the feature-arc handoff variant. */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1830
1831 #ifndef CLIB_MARCH_VARIANT
1832 int
1833 ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1834 {
1835   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1836   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1837   if (is_enable)
1838     {
1839       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1840         {
1841           ++rm->feature_use_refcount_per_intf[sw_if_index];
1842           return vnet_feature_enable_disable ("ip4-unicast",
1843                                               "ip4-full-reassembly-feature",
1844                                               sw_if_index, 1, 0, 0);
1845         }
1846       ++rm->feature_use_refcount_per_intf[sw_if_index];
1847     }
1848   else
1849     {
1850       --rm->feature_use_refcount_per_intf[sw_if_index];
1851       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1852         return vnet_feature_enable_disable ("ip4-unicast",
1853                                             "ip4-full-reassembly-feature",
1854                                             sw_if_index, 0, 0, 0);
1855     }
1856   return -1;
1857 }
1858 #endif
1859
1860 /*
1861  * fd.io coding-style-patch-verification: ON
1862  *
1863  * Local Variables:
1864  * eval: (c-set-style "gnu")
1865  * End:
1866  */