vppinfra: selectively disable false-positive GCC-10 warnings
[vpp.git] / src / vnet / ip / reass / ip4_full_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Full Reassembly.
19  *
20  * This file contains the source code for IPv4 full reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/fifo.h>
27 #include <vppinfra/bihash_16_8.h>
28 #include <vnet/ip/reass/ip4_full_reass.h>
29 #include <stddef.h>
30
31 #define MSEC_PER_SEC 1000
32 #define IP4_REASS_TIMEOUT_DEFAULT_MS 100
33 #define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
34 #define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
35 #define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
36 #define IP4_REASS_HT_LOAD_FACTOR (0.75)
37
/* Set to 1 to compile in the stdout buffer-chain dump below. */
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/*
 * Print "<what> buffer <bi>" followed by "[<bi>]" for every buffer chained
 * behind it via next_buffer, then a newline.
 *
 * bi   - index of the first buffer of the chain (evaluated once)
 * what - bare token, stringified and used as a label
 *
 * Relies on a variable named `vm` being in scope at the expansion site.
 */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = (bi);                                \
      printf (#what " buffer %u", _bi);              \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
/* Debugging disabled: the macro expands to nothing. */
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
/* Result codes returned by the reassembly helper functions in this file. */
typedef enum
{
  // operation succeeded
  IP4_REASS_RC_OK,
  // presumably: fragment count exceeded the configured limit
  // (not returned by the code visible here - confirm against callers)
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  // a consistency check on the reassembly state failed
  IP4_REASS_RC_INTERNAL_ERROR,
  // returned by finalize when vlib_buffer_chain_linearize () fails
  IP4_REASS_RC_NO_BUF,
  // presumably: fragment must be handed to another thread
  // (not returned by the code visible here - confirm against callers)
  IP4_REASS_RC_HANDOFF,
} ip4_full_reass_rc_t;
68
/*
 * Hash key identifying one reassembly. The named fields are overlaid with
 * as_u64[2], which is what gets copied into the 16-byte bihash key.
 */
typedef struct
{
  union
  {
    struct
    {
      // NOTE(review): meaning not visible in this part of the file
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    // raw view used to move the key in and out of the bihash
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;
85
/*
 * Hash value stored for each key: locates the reassembly context.
 * Overlaid with as_u64 so it fits the 8-byte bihash value.
 */
typedef union
{
  struct
  {
    // index of the context inside the owning thread's pool
    u32 reass_index;
    // thread whose per-thread pool holds the context
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_full_reass_val_t;
95
/*
 * Typed view (k, v) over a raw clib_bihash_kv_16_8_t entry, so the same
 * storage can be passed to the bihash API and read through named fields.
 */
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106 always_inline u32
107 ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108 {
109   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111 }
112
113 always_inline u16
114 ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115 {
116   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118     (vnb->ip.reass.fragment_first +
119      ip4_full_reass_buffer_get_data_offset (b)) + 1;
120 }
121
/*
 * State of one in-progress reassembly. Instances live in a per-thread pool
 * and are located through the hash table via their key.
 */
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
/* Per-thread reassembly bookkeeping. */
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_full_reass_t *pool;
  // number of contexts currently allocated from the pool
  u32 reass_n;
  // running counter used to build the id of each new context
  u32 id_counter;
  // NOTE(review): usage not visible in this part of the file
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
160
/* Global configuration and runtime state of IPv4 full reassembly. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout compared against vlib_time_now () when expiring contexts
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  // maps ip4_full_reass_key_t to ip4_full_reass_val_t
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;
} ip4_full_reass_main_t;
191
/*
 * Single shared instance of the reassembly state. It is declared for every
 * build of this file but only defined when CLIB_MARCH_VARIANT is not set.
 */
extern ip4_full_reass_main_t ip4_full_reass_main;

#ifndef CLIB_MARCH_VARIANT
ip4_full_reass_main_t ip4_full_reass_main;
#endif /* CLIB_MARCH_VARIANT */
197
/* Next-node indices used by the reassembly nodes. */
typedef enum
{
  // used for a completed reassembly when not running as a custom app
  IP4_FULL_REASS_NEXT_INPUT,
  IP4_FULL_REASS_NEXT_DROP,
  IP4_FULL_REASS_NEXT_HANDOFF,
  // number of next indices - keep last
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
205
/* Operation recorded in a trace entry; selects the text printed for it. */
typedef enum
{
  // printed as "new <range>"
  RANGE_NEW,
  // printed as "shrink <range> by <size_diff>"
  RANGE_SHRINK,
  // printed as "discard <range>"
  RANGE_DISCARD,
  // printed as "overlapping/ignored <range>"
  RANGE_OVERLAP,
  // printed as "finalize reassembly"
  FINALIZE,
  // printed as "handoff from thread #x to thread #y"
  HANDOFF,
} ip4_full_reass_trace_operation_e;
215
/* Snapshot of one range (taken from a buffer's ip.reass metadata). */
typedef struct
{
  u16 range_first;
  u16 range_last;
  // index of the buffer the snapshot was taken from
  u32 range_bi;
  // range_first - fragment_first at the time of the snapshot
  i32 data_offset;
  u32 data_len;
  // first buffer of the reassembly (0 when there is no context)
  u32 first_bi;
} ip4_full_reass_range_trace_t;
225
/* One trace record written by ip4_full_reass_add_trace (). */
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  // id of the reassembly context, ~0 when recorded without a context
  u32 reass_id;
  ip4_full_reass_range_trace_t trace_range;
  u32 size_diff;
  // per-reassembly sequence number of this trace operation
  u32 op_id;
  // thread that recorded the entry
  u32 thread_id;
  // printed as the destination thread for HANDOFF entries
  u32 thread_id_to;
  u32 fragment_first;
  u32 fragment_last;
  // data_len of the reassembly context at the time of recording
  u32 total_data_len;
  // true when the buffer's trace thread differs from the recording thread;
  // ip4_header is only filled in (and printed) in that case
  bool is_after_handoff;
  ip4_header_t ip4_header;
} ip4_full_reass_trace_t;
241
/* Node registrations; their definitions are not in this part of the file. */
extern vlib_node_registration_t ip4_full_reass_node;
extern vlib_node_registration_t ip4_full_reass_node_feature;
244
245 static void
246 ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
247                               ip4_full_reass_range_trace_t * trace)
248 {
249   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
250   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
251   trace->range_first = vnb->ip.reass.range_first;
252   trace->range_last = vnb->ip.reass.range_last;
253   trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
254   trace->data_len = ip4_full_reass_buffer_get_data_len (b);
255   trace->range_bi = bi;
256 }
257
258 static u8 *
259 format_ip4_full_reass_range_trace (u8 * s, va_list * args)
260 {
261   ip4_full_reass_range_trace_t *trace =
262     va_arg (*args, ip4_full_reass_range_trace_t *);
263   s =
264     format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
265             trace->range_last, trace->data_offset, trace->data_len,
266             trace->range_bi);
267   return s;
268 }
269
270 static u8 *
271 format_ip4_full_reass_trace (u8 * s, va_list * args)
272 {
273   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
274   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
275   ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
276   u32 indent = 0;
277   if (~0 != t->reass_id)
278     {
279       if (t->is_after_handoff)
280         {
281           s =
282             format (s, "%U\n", format_ip4_header, &t->ip4_header,
283                     sizeof (t->ip4_header));
284           indent = 2;
285         }
286       s =
287         format (s, "%Ureass id: %u, op id: %u, ", format_white_space, indent,
288                 t->reass_id, t->op_id);
289       indent = format_get_indent (s);
290       s =
291         format (s,
292                 "first bi: %u, data len: %u, ip/fragment[%u, %u]",
293                 t->trace_range.first_bi, t->total_data_len, t->fragment_first,
294                 t->fragment_last);
295     }
296   switch (t->action)
297     {
298     case RANGE_SHRINK:
299       s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
300                   format_ip4_full_reass_range_trace, &t->trace_range,
301                   t->size_diff);
302       break;
303     case RANGE_DISCARD:
304       s = format (s, "\n%Udiscard %U", format_white_space, indent,
305                   format_ip4_full_reass_range_trace, &t->trace_range);
306       break;
307     case RANGE_NEW:
308       s = format (s, "\n%Unew %U", format_white_space, indent,
309                   format_ip4_full_reass_range_trace, &t->trace_range);
310       break;
311     case RANGE_OVERLAP:
312       s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
313                   format_ip4_full_reass_range_trace, &t->trace_range);
314       break;
315     case FINALIZE:
316       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
317       break;
318     case HANDOFF:
319       s =
320         format (s, "handoff from thread #%u to thread #%u", t->thread_id,
321                 t->thread_id_to);
322       break;
323     }
324   return s;
325 }
326
327 static void
328 ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
329                           ip4_full_reass_main_t * rm,
330                           ip4_full_reass_t * reass, u32 bi,
331                           ip4_full_reass_trace_operation_e action,
332                           u32 size_diff, u32 thread_id_to)
333 {
334   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
335   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
336   bool is_after_handoff = false;
337   if (vlib_buffer_get_trace_thread (b) != vm->thread_index)
338     {
339       is_after_handoff = true;
340     }
341   ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
342   t->is_after_handoff = is_after_handoff;
343   if (t->is_after_handoff)
344     {
345       clib_memcpy (&t->ip4_header, vlib_buffer_get_current (b),
346                    clib_min (sizeof (t->ip4_header), b->current_length));
347     }
348   if (reass)
349     {
350       t->reass_id = reass->id;
351       t->op_id = reass->trace_op_counter;
352       t->trace_range.first_bi = reass->first_bi;
353       t->total_data_len = reass->data_len;
354       ++reass->trace_op_counter;
355     }
356   else
357     {
358       t->reass_id = ~0;
359       t->op_id = 0;
360       t->trace_range.first_bi = 0;
361       t->total_data_len = 0;
362     }
363   t->action = action;
364   ip4_full_reass_trace_details (vm, bi, &t->trace_range);
365   t->size_diff = size_diff;
366   t->thread_id = vm->thread_index;
367   t->thread_id_to = thread_id_to;
368   t->fragment_first = vnb->ip.reass.fragment_first;
369   t->fragment_last = vnb->ip.reass.fragment_last;
370 #if 0
371   static u8 *s = NULL;
372   s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
373   printf ("%.*s\n", vec_len (s), s);
374   fflush (stdout);
375   vec_reset_length (s);
376 #endif
377 }
378
379 always_inline void
380 ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
381                          ip4_full_reass_t * reass)
382 {
383   pool_put (rt->pool, reass);
384   --rt->reass_n;
385 }
386
387 always_inline void
388 ip4_full_reass_free (ip4_full_reass_main_t * rm,
389                      ip4_full_reass_per_thread_t * rt,
390                      ip4_full_reass_t * reass)
391 {
392   clib_bihash_kv_16_8_t kv;
393   kv.key[0] = reass->key.as_u64[0];
394   kv.key[1] = reass->key.as_u64[1];
395   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
396   return ip4_full_reass_free_ctx (rt, reass);
397 }
398
399 always_inline void
400 ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
401                          ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
402 {
403   u32 range_bi = reass->first_bi;
404   vlib_buffer_t *range_b;
405   vnet_buffer_opaque_t *range_vnb;
406   u32 *to_free = NULL;
407   while (~0 != range_bi)
408     {
409       range_b = vlib_get_buffer (vm, range_bi);
410       range_vnb = vnet_buffer (range_b);
411       u32 bi = range_bi;
412       while (~0 != bi)
413         {
414           vec_add1 (to_free, bi);
415           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
416           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
417             {
418               bi = b->next_buffer;
419               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
420             }
421           else
422             {
423               bi = ~0;
424             }
425         }
426       range_bi = range_vnb->ip.reass.next_range_bi;
427     }
428   /* send to next_error_index */
429   if (~0 != reass->error_next_index)
430     {
431       u32 n_left_to_next, *to_next, next_index;
432
433       next_index = reass->error_next_index;
434       u32 bi = ~0;
435
436       while (vec_len (to_free) > 0)
437         {
438           vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
439
440           while (vec_len (to_free) > 0 && n_left_to_next > 0)
441             {
442               bi = vec_pop (to_free);
443
444               if (~0 != bi)
445                 {
446                   to_next[0] = bi;
447                   to_next += 1;
448                   n_left_to_next -= 1;
449                 }
450             }
451           vlib_put_next_frame (vm, node, next_index, n_left_to_next);
452         }
453     }
454   else
455     {
456       vlib_buffer_free (vm, to_free, vec_len (to_free));
457     }
458 }
459
460 always_inline void
461 ip4_full_reass_init (ip4_full_reass_t * reass)
462 {
463   reass->first_bi = ~0;
464   reass->last_packet_octet = ~0;
465   reass->data_len = 0;
466   reass->next_index = ~0;
467   reass->error_next_index = ~0;
468 }
469
470 always_inline ip4_full_reass_t *
471 ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
472                                ip4_full_reass_main_t * rm,
473                                ip4_full_reass_per_thread_t * rt,
474                                ip4_full_reass_kv_t * kv, u8 * do_handoff)
475 {
476   ip4_full_reass_t *reass;
477   f64 now;
478
479 again:
480
481   reass = NULL;
482   now = vlib_time_now (vm);
483   if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
484     {
485       reass =
486         pool_elt_at_index (rm->per_thread_data
487                            [kv->v.memory_owner_thread_index].pool,
488                            kv->v.reass_index);
489       if (vm->thread_index != reass->memory_owner_thread_index)
490         {
491           *do_handoff = 1;
492           return reass;
493         }
494
495       if (now > reass->last_heard + rm->timeout)
496         {
497           ip4_full_reass_drop_all (vm, node, rm, reass);
498           ip4_full_reass_free (rm, rt, reass);
499           reass = NULL;
500         }
501     }
502
503   if (reass)
504     {
505       reass->last_heard = now;
506       return reass;
507     }
508
509   if (rt->reass_n >= rm->max_reass_n)
510     {
511       reass = NULL;
512       return reass;
513     }
514   else
515     {
516       pool_get (rt->pool, reass);
517       clib_memset (reass, 0, sizeof (*reass));
518       reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
519       reass->memory_owner_thread_index = vm->thread_index;
520       ++rt->id_counter;
521       ip4_full_reass_init (reass);
522       ++rt->reass_n;
523     }
524
525   reass->key.as_u64[0] = kv->kv.key[0];
526   reass->key.as_u64[1] = kv->kv.key[1];
527   kv->v.reass_index = (reass - rt->pool);
528   kv->v.memory_owner_thread_index = vm->thread_index;
529   reass->last_heard = now;
530
531   int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
532   if (rv)
533     {
534       ip4_full_reass_free_ctx (rt, reass);
535       reass = NULL;
536       // if other worker created a context already work with the other copy
537       if (-2 == rv)
538         goto again;
539     }
540
541   return reass;
542 }
543
/**
 * @brief Turn the collected ranges of a reassembly into one packet.
 *
 * Walks the range list starting at reass->first_bi. For every range the
 * buffer chain is trimmed so that only the octets belonging to the range
 * remain (the first range also keeps its IPv4 header), surplus buffers are
 * freed, and the kept buffers are linked into a single chain. The IPv4
 * header of the first buffer is then rewritten (fragment bits cleared,
 * length and checksum recomputed) and the chain is linearized.
 *
 * @param vm            vlib main of the calling thread.
 * @param node          node runtime, used for tracing.
 * @param rm            reassembly main.
 * @param rt            per-thread data owning the context.
 * @param reass         context to finalize; released on success only.
 * @param bi0           out: index of the first buffer of the result.
 * @param next0         out: IP4_FULL_REASS_NEXT_INPUT, or reass->next_index
 *                      when is_custom_app is set.
 * @param error0        out: set to IP4_ERROR_NONE.
 * @param is_custom_app selects which next index is reported.
 * @return IP4_REASS_RC_OK on success, IP4_REASS_RC_INTERNAL_ERROR when a
 *         consistency check fails, IP4_REASS_RC_NO_BUF when linearization
 *         fails. On the error paths the outputs are not written and the
 *         context is not released here.
 */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
                         ip4_full_reass_main_t * rm,
                         ip4_full_reass_per_thread_t * rt,
                         ip4_full_reass_t * reass, u32 * bi0,
                         u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  // tail of the output chain built so far
  vlib_buffer_t *last_b = NULL;
  // head buffer of the range currently being processed
  u32 sub_chain_bi = reass->first_bi;
  // sum of current_length over all kept buffers
  u32 total_length = 0;
  // incremented per inner-loop iteration; not read anywhere in this function
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      // NOTE(review): the two conditions are joined with &&, so this only
      // fails when BOTH are violated - confirm that this is intended
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
          !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
        {
          return IP4_REASS_RC_INTERNAL_ERROR;
        }

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      // octets to remove in front of the range: IPv4 header + data offset
      u32 trim_front =
        ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      // octets to remove behind the range
      u32 trim_end =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
        {
          /* first buffer - keep ip4 header */
          if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
          trim_front = 0;
          trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
            ip4_header_bytes (ip);
          if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
            {
              return IP4_REASS_RC_INTERNAL_ERROR;
            }
        }
      // octets of this range's chain that go into the output
      u32 keep_data =
        vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      // walk the buffers of this range: trim front, keep data, free the rest
      while (1)
        {
          ++buf_cnt;
          if (trim_front)
            {
              if (trim_front > tmp->current_length)
                {
                  /* drop whole buffer */
                  u32 to_be_freed_bi = tmp_bi;
                  trim_front -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                  // detach before freeing so only this buffer is freed
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  continue;
                }
              else
                {
                  vlib_buffer_advance (tmp, trim_front);
                  trim_front = 0;
                }
            }
          if (keep_data)
            {
              // append this buffer to the output chain
              if (last_b)
                {
                  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  last_b->next_buffer = tmp_bi;
                }
              last_b = tmp;
              if (keep_data <= tmp->current_length)
                {
                  // range ends inside this buffer - cut off the tail
                  tmp->current_length = keep_data;
                  keep_data = 0;
                }
              else
                {
                  keep_data -= tmp->current_length;
                  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
                    {
                      return IP4_REASS_RC_INTERNAL_ERROR;
                    }
                }
              total_length += tmp->current_length;
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp_bi = tmp->next_buffer;
                  tmp = vlib_get_buffer (vm, tmp->next_buffer);
                }
              else
                {
                  break;
                }
            }
          else
            {
              // nothing left to keep in this range - free remaining buffers
              u32 to_be_freed_bi = tmp_bi;
              if (reass->first_bi == tmp_bi)
                {
                  return IP4_REASS_RC_INTERNAL_ERROR;
                }
              if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
                  tmp_bi = tmp->next_buffer;
                  tmp->next_buffer = 0;
                  tmp = vlib_get_buffer (vm, tmp_bi);
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                }
              else
                {
                  tmp->next_buffer = 0;
                  vlib_buffer_free_one (vm, to_be_freed_bi);
                  break;
                }
            }
        }
      // advance to the next range
      sub_chain_bi =
        vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
        reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  // terminate the output chain
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  // from here on total_length excludes the first buffer
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the header: no longer a fragment, new length and checksum
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
                                FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
        {
          u32 bi = reass->first_bi;
          u8 *s = NULL;
          while (~0 != bi)
            {
              vlib_buffer_t *b = vlib_get_buffer (vm, bi);
              s = format (s, "%u: %U\n", bi, format_hexdump,
                          vlib_buffer_get_current (b), b->current_length);
              if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
                {
                  bi = b->next_buffer;
                }
              else
                {
                  break;
                }
            }
          printf ("%.*s\n", vec_len (s), s);
          fflush (stdout);
          vec_free (s);
        }
      while (0);
#endif
    }
  // report the result to the caller
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  // the context is done - remove it from the hash and the pool
  ip4_full_reass_free (rm, rt, reass);
  // only clears the local parameter copy
  reass = NULL;
  return IP4_REASS_RC_OK;
}
747
748 always_inline ip4_full_reass_rc_t
749 ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
750                                       ip4_full_reass_main_t * rm,
751                                       ip4_full_reass_per_thread_t * rt,
752                                       ip4_full_reass_t * reass,
753                                       u32 prev_range_bi, u32 new_next_bi)
754 {
755   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
756   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
757   if (~0 != prev_range_bi)
758     {
759       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
760       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
761       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
762       prev_vnb->ip.reass.next_range_bi = new_next_bi;
763     }
764   else
765     {
766       if (~0 != reass->first_bi)
767         {
768           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
769         }
770       reass->first_bi = new_next_bi;
771     }
772   vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
773   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
774       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
775     {
776       return IP4_REASS_RC_INTERNAL_ERROR;
777     }
778   reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
779   return IP4_REASS_RC_OK;
780 }
781
782 always_inline ip4_full_reass_rc_t
783 ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
784                                         vlib_node_runtime_t * node,
785                                         ip4_full_reass_main_t * rm,
786                                         ip4_full_reass_t * reass,
787                                         u32 prev_range_bi, u32 discard_bi)
788 {
789   vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
790   vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
791   if (~0 != prev_range_bi)
792     {
793       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
794       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
795       if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
796         {
797           return IP4_REASS_RC_INTERNAL_ERROR;
798         }
799       prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
800     }
801   else
802     {
803       reass->first_bi = discard_vnb->ip.reass.next_range_bi;
804     }
805   vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
806   if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
807       !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
808     {
809       return IP4_REASS_RC_INTERNAL_ERROR;
810     }
811   reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
812   while (1)
813     {
814       u32 to_be_freed_bi = discard_bi;
815       if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
816         {
817           ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
818                                     RANGE_DISCARD, 0, ~0);
819         }
820       if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
821         {
822           discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
823           discard_bi = discard_b->next_buffer;
824           discard_b->next_buffer = 0;
825           discard_b = vlib_get_buffer (vm, discard_bi);
826           vlib_buffer_free_one (vm, to_be_freed_bi);
827         }
828       else
829         {
830           discard_b->next_buffer = 0;
831           vlib_buffer_free_one (vm, to_be_freed_bi);
832           break;
833         }
834     }
835   return IP4_REASS_RC_OK;
836 }
837
838 always_inline ip4_full_reass_rc_t
839 ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
840                        ip4_full_reass_main_t * rm,
841                        ip4_full_reass_per_thread_t * rt,
842                        ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
843                        u32 * error0, bool is_custom_app,
844                        u32 * handoff_thread_idx)
845 {
846   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
847   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
848   if (is_custom_app)
849     {
850       // store (error_)next_index before it's overwritten
851       reass->next_index = fvnb->ip.reass.next_index;
852       reass->error_next_index = fvnb->ip.reass.error_next_index;
853     }
854   ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
855   int consumed = 0;
856   ip4_header_t *fip = vlib_buffer_get_current (fb);
857   const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
858   const u32 fragment_length =
859     clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
860   const u32 fragment_last = fragment_first + fragment_length - 1;
861   fvnb->ip.reass.fragment_first = fragment_first;
862   fvnb->ip.reass.fragment_last = fragment_last;
863   int more_fragments = ip4_get_fragment_more (fip);
864   u32 candidate_range_bi = reass->first_bi;
865   u32 prev_range_bi = ~0;
866   fvnb->ip.reass.range_first = fragment_first;
867   fvnb->ip.reass.range_last = fragment_last;
868   fvnb->ip.reass.next_range_bi = ~0;
869   if (!more_fragments)
870     {
871       reass->last_packet_octet = fragment_last;
872     }
873   if (~0 == reass->first_bi)
874     {
875       // starting a new reassembly
876       rc =
877         ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
878                                               prev_range_bi, *bi0);
879       if (IP4_REASS_RC_OK != rc)
880         {
881           return rc;
882         }
883       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
884         {
885           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
886                                     ~0);
887         }
888       *bi0 = ~0;
889       reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
890       reass->fragments_n = 1;
891       return IP4_REASS_RC_OK;
892     }
893   reass->min_fragment_length =
894     clib_min (clib_net_to_host_u16 (fip->length),
895               fvnb->ip.reass.estimated_mtu);
896   while (~0 != candidate_range_bi)
897     {
898       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
899       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
900       if (fragment_first > candidate_vnb->ip.reass.range_last)
901         {
902           // this fragments starts after candidate range
903           prev_range_bi = candidate_range_bi;
904           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
905           if (candidate_vnb->ip.reass.range_last < fragment_last &&
906               ~0 == candidate_range_bi)
907             {
908               // special case - this fragment falls beyond all known ranges
909               rc =
910                 ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
911                                                       prev_range_bi, *bi0);
912               if (IP4_REASS_RC_OK != rc)
913                 {
914                   return rc;
915                 }
916               consumed = 1;
917               break;
918             }
919           continue;
920         }
921       if (fragment_last < candidate_vnb->ip.reass.range_first)
922         {
923           // this fragment ends before candidate range without any overlap
924           rc =
925             ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
926                                                   prev_range_bi, *bi0);
927           if (IP4_REASS_RC_OK != rc)
928             {
929               return rc;
930             }
931           consumed = 1;
932         }
933       else
934         {
935           if (fragment_first >= candidate_vnb->ip.reass.range_first &&
936               fragment_last <= candidate_vnb->ip.reass.range_last)
937             {
938               // this fragment is a (sub)part of existing range, ignore it
939               if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
940                 {
941                   ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
942                                             RANGE_OVERLAP, 0, ~0);
943                 }
944               break;
945             }
946           int discard_candidate = 0;
947           if (fragment_first < candidate_vnb->ip.reass.range_first)
948             {
949               u32 overlap =
950                 fragment_last - candidate_vnb->ip.reass.range_first + 1;
951               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
952                 {
953                   candidate_vnb->ip.reass.range_first += overlap;
954                   if (reass->data_len < overlap)
955                     {
956                       return IP4_REASS_RC_INTERNAL_ERROR;
957                     }
958                   reass->data_len -= overlap;
959                   if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
960                     {
961                       ip4_full_reass_add_trace (vm, node, rm, reass,
962                                                 candidate_range_bi,
963                                                 RANGE_SHRINK, 0, ~0);
964                     }
965                   rc =
966                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
967                                                           prev_range_bi,
968                                                           *bi0);
969                   if (IP4_REASS_RC_OK != rc)
970                     {
971                       return rc;
972                     }
973                   consumed = 1;
974                 }
975               else
976                 {
977                   discard_candidate = 1;
978                 }
979             }
980           else if (fragment_last > candidate_vnb->ip.reass.range_last)
981             {
982               u32 overlap =
983                 candidate_vnb->ip.reass.range_last - fragment_first + 1;
984               if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
985                 {
986                   fvnb->ip.reass.range_first += overlap;
987                   if (~0 != candidate_vnb->ip.reass.next_range_bi)
988                     {
989                       prev_range_bi = candidate_range_bi;
990                       candidate_range_bi =
991                         candidate_vnb->ip.reass.next_range_bi;
992                       continue;
993                     }
994                   else
995                     {
996                       // special case - last range discarded
997                       rc =
998                         ip4_full_reass_insert_range_in_chain (vm, rm, rt,
999                                                               reass,
1000                                                               candidate_range_bi,
1001                                                               *bi0);
1002                       if (IP4_REASS_RC_OK != rc)
1003                         {
1004                           return rc;
1005                         }
1006                       consumed = 1;
1007                     }
1008                 }
1009               else
1010                 {
1011                   discard_candidate = 1;
1012                 }
1013             }
1014           else
1015             {
1016               discard_candidate = 1;
1017             }
1018           if (discard_candidate)
1019             {
1020               u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
1021               // discard candidate range, probe next range
1022               rc =
1023                 ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
1024                                                         prev_range_bi,
1025                                                         candidate_range_bi);
1026               if (IP4_REASS_RC_OK != rc)
1027                 {
1028                   return rc;
1029                 }
1030               if (~0 != next_range_bi)
1031                 {
1032                   candidate_range_bi = next_range_bi;
1033                   continue;
1034                 }
1035               else
1036                 {
1037                   // special case - last range discarded
1038                   rc =
1039                     ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
1040                                                           prev_range_bi,
1041                                                           *bi0);
1042                   if (IP4_REASS_RC_OK != rc)
1043                     {
1044                       return rc;
1045                     }
1046                   consumed = 1;
1047                 }
1048             }
1049         }
1050       break;
1051     }
1052   ++reass->fragments_n;
1053   if (consumed)
1054     {
1055       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
1056         {
1057           ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
1058                                     ~0);
1059         }
1060     }
1061   if (~0 != reass->last_packet_octet &&
1062       reass->data_len == reass->last_packet_octet + 1)
1063     {
1064       *handoff_thread_idx = reass->sendout_thread_index;
1065       int handoff =
1066         reass->memory_owner_thread_index != reass->sendout_thread_index;
1067       rc =
1068         ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
1069                                  is_custom_app);
1070       if (IP4_REASS_RC_OK == rc && handoff)
1071         {
1072           rc = IP4_REASS_RC_HANDOFF;
1073         }
1074     }
1075   else
1076     {
1077       if (consumed)
1078         {
1079           *bi0 = ~0;
1080           if (reass->fragments_n > rm->max_reass_len)
1081             {
1082               rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
1083             }
1084         }
1085       else
1086         {
1087           *next0 = IP4_FULL_REASS_NEXT_DROP;
1088           *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
1089         }
1090     }
1091   return rc;
1092 }
1093
1094 always_inline uword
1095 ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
1096                        vlib_frame_t * frame, bool is_feature,
1097                        bool is_custom_app)
1098 {
1099   u32 *from = vlib_frame_vector_args (frame);
1100   u32 n_left_from, n_left_to_next, *to_next, next_index;
1101   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1102   ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
1103   clib_spinlock_lock (&rt->lock);
1104
1105   n_left_from = frame->n_vectors;
1106   next_index = node->cached_next_index;
1107   while (n_left_from > 0)
1108     {
1109       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1110
1111       while (n_left_from > 0 && n_left_to_next > 0)
1112         {
1113           u32 bi0;
1114           vlib_buffer_t *b0;
1115           u32 next0;
1116           u32 error0 = IP4_ERROR_NONE;
1117
1118           bi0 = from[0];
1119           b0 = vlib_get_buffer (vm, bi0);
1120
1121           ip4_header_t *ip0 = vlib_buffer_get_current (b0);
1122           if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
1123             {
1124               // this is a whole packet - no fragmentation
1125               if (!is_custom_app)
1126                 {
1127                   next0 = IP4_FULL_REASS_NEXT_INPUT;
1128                 }
1129               else
1130                 {
1131                   next0 = vnet_buffer (b0)->ip.reass.next_index;
1132                 }
1133               goto packet_enqueue;
1134             }
1135           const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
1136           const u32 fragment_length =
1137             clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
1138           const u32 fragment_last = fragment_first + fragment_length - 1;
1139           if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
1140             {
1141               next0 = IP4_FULL_REASS_NEXT_DROP;
1142               error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
1143               goto packet_enqueue;
1144             }
1145           ip4_full_reass_kv_t kv;
1146           u8 do_handoff = 0;
1147
1148           kv.k.as_u64[0] =
1149             (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
1150                            vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
1151             (u64) ip0->src_address.as_u32 << 32;
1152           kv.k.as_u64[1] =
1153             (u64) ip0->dst_address.
1154             as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
1155
1156           ip4_full_reass_t *reass =
1157             ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
1158                                            &do_handoff);
1159
1160           if (reass)
1161             {
1162               const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
1163               if (0 == fragment_first)
1164                 {
1165                   reass->sendout_thread_index = vm->thread_index;
1166                 }
1167             }
1168
1169           if (PREDICT_FALSE (do_handoff))
1170             {
1171               next0 = IP4_FULL_REASS_NEXT_HANDOFF;
1172               vnet_buffer (b0)->ip.reass.owner_thread_index =
1173                 kv.v.memory_owner_thread_index;
1174             }
1175           else if (reass)
1176             {
1177               u32 handoff_thread_idx;
1178               switch (ip4_full_reass_update
1179                       (vm, node, rm, rt, reass, &bi0, &next0,
1180                        &error0, is_custom_app, &handoff_thread_idx))
1181                 {
1182                 case IP4_REASS_RC_OK:
1183                   /* nothing to do here */
1184                   break;
1185                 case IP4_REASS_RC_HANDOFF:
1186                   next0 = IP4_FULL_REASS_NEXT_HANDOFF;
1187                   b0 = vlib_get_buffer (vm, bi0);
1188                   vnet_buffer (b0)->ip.reass.owner_thread_index =
1189                     handoff_thread_idx;
1190                   break;
1191                 case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
1192                   vlib_node_increment_counter (vm, node->node_index,
1193                                                IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
1194                                                1);
1195                   ip4_full_reass_drop_all (vm, node, rm, reass);
1196                   ip4_full_reass_free (rm, rt, reass);
1197                   goto next_packet;
1198                   break;
1199                 case IP4_REASS_RC_NO_BUF:
1200                   vlib_node_increment_counter (vm, node->node_index,
1201                                                IP4_ERROR_REASS_NO_BUF, 1);
1202                   ip4_full_reass_drop_all (vm, node, rm, reass);
1203                   ip4_full_reass_free (rm, rt, reass);
1204                   goto next_packet;
1205                   break;
1206                 case IP4_REASS_RC_INTERNAL_ERROR:
1207                   /* drop everything and start with a clean slate */
1208                   vlib_node_increment_counter (vm, node->node_index,
1209                                                IP4_ERROR_REASS_INTERNAL_ERROR,
1210                                                1);
1211                   ip4_full_reass_drop_all (vm, node, rm, reass);
1212                   ip4_full_reass_free (rm, rt, reass);
1213                   goto next_packet;
1214                   break;
1215                 }
1216             }
1217           else
1218             {
1219               next0 = IP4_FULL_REASS_NEXT_DROP;
1220               error0 = IP4_ERROR_REASS_LIMIT_REACHED;
1221             }
1222
1223
1224         packet_enqueue:
1225
1226           if (bi0 != ~0)
1227             {
1228               to_next[0] = bi0;
1229               to_next += 1;
1230               n_left_to_next -= 1;
1231
1232               /* bi0 might have been updated by reass_finalize, reload */
1233               b0 = vlib_get_buffer (vm, bi0);
1234               if (IP4_ERROR_NONE != error0)
1235                 {
1236                   b0->error = node->errors[error0];
1237                 }
1238
1239               if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
1240                 {
1241                   if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1242                     {
1243                       ip4_full_reass_add_trace (vm, node, rm, NULL, bi0,
1244                                                 HANDOFF, 0,
1245                                                 vnet_buffer (b0)->ip.
1246                                                 reass.owner_thread_index);
1247                     }
1248                 }
1249               else if (is_feature && IP4_ERROR_NONE == error0)
1250                 {
1251                   vnet_feature_next (&next0, b0);
1252                 }
1253               vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
1254                                                to_next, n_left_to_next,
1255                                                bi0, next0);
1256               IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
1257             }
1258
1259         next_packet:
1260           from += 1;
1261           n_left_from -= 1;
1262         }
1263
1264       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1265     }
1266
1267   clib_spinlock_unlock (&rt->lock);
1268   return frame->n_vectors;
1269 }
1270
/*
 * Error strings used by the reassembly node registrations in this file:
 * the string half of every (sym, string) entry of foreach_ip4_error, in
 * list order.
 */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1276
/*
 * Node function of ip4_full_reass_node: runs the shared inline worker with
 * both is_feature and is_custom_app set to false and returns its result.
 */
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
                                    vlib_node_runtime_t * node,
                                    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
                                false /* is_custom_app */ );
}
1284
/*
 * Registration of the "ip4-full-reassembly" node. Its next nodes are
 * ip4-input, ip4-drop and the matching handoff node; traces are formatted
 * by format_ip4_full_reass_trace.
 */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1302
/*
 * Node function of ip4_full_reass_node_feature: runs the shared inline
 * worker with is_feature set to true and is_custom_app set to false.
 */
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
                                            vlib_node_runtime_t * node,
                                            vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
                                false /* is_custom_app */ );
}
1310
/*
 * Registration of the "ip4-full-reassembly-feature" node. It differs from
 * the "ip4-full-reassembly" registration only in its name and in the
 * handoff next node ("ip4-full-reass-feature-hoff").
 */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1327
/*
 * Places "ip4-full-reassembly-feature" on the "ip4-unicast" feature arc,
 * ordered before "ip4-lookup" and "ipsec4-input-feature".
 */
/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1337
1338 #ifndef CLIB_MARCH_VARIANT
1339 always_inline u32
1340 ip4_full_reass_get_nbuckets ()
1341 {
1342   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1343   u32 nbuckets;
1344   u8 i;
1345
1346   nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1347
1348   for (i = 0; i < 31; i++)
1349     if ((1 << i) >= nbuckets)
1350       break;
1351   nbuckets = 1 << i;
1352
1353   return nbuckets;
1354 }
1355 #endif /* CLIB_MARCH_VARIANT */
1356
/*
 * Event types handled by the expire-walk process node.
 * IP4_EVENT_CONFIG_CHANGED is signalled by ip4_full_reass_set() after the
 * parameters have been updated.
 */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;
1361
/* Context handed to ip4_rehash_cb() while copying entries to a new hash. */
typedef struct
{
  /* set to 1 by the callback when an insert into new_hash fails */
  int failure;
  /* hash table receiving the copied entries */
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;
1367
1368 #ifndef CLIB_MARCH_VARIANT
1369 static int
1370 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1371 {
1372   ip4_rehash_cb_ctx *ctx = _ctx;
1373   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1374     {
1375       ctx->failure = 1;
1376     }
1377   return (BIHASH_WALK_CONTINUE);
1378 }
1379
1380 static void
1381 ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1382                            u32 max_reassembly_length,
1383                            u32 expire_walk_interval_ms)
1384 {
1385   ip4_full_reass_main.timeout_ms = timeout_ms;
1386   ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1387   ip4_full_reass_main.max_reass_n = max_reassemblies;
1388   ip4_full_reass_main.max_reass_len = max_reassembly_length;
1389   ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1390 }
1391
1392 vnet_api_error_t
1393 ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
1394                     u32 max_reassembly_length, u32 expire_walk_interval_ms)
1395 {
1396   u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
1397   ip4_full_reass_set_params (timeout_ms, max_reassemblies,
1398                              max_reassembly_length, expire_walk_interval_ms);
1399   vlib_process_signal_event (ip4_full_reass_main.vlib_main,
1400                              ip4_full_reass_main.ip4_full_reass_expire_node_idx,
1401                              IP4_EVENT_CONFIG_CHANGED, 0);
1402   u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
1403   if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
1404     {
1405       clib_bihash_16_8_t new_hash;
1406       clib_memset (&new_hash, 0, sizeof (new_hash));
1407       ip4_rehash_cb_ctx ctx;
1408       ctx.failure = 0;
1409       ctx.new_hash = &new_hash;
1410       clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
1411                              new_nbuckets * 1024);
1412       clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
1413                                                ip4_rehash_cb, &ctx);
1414       if (ctx.failure)
1415         {
1416           clib_bihash_free_16_8 (&new_hash);
1417           return -1;
1418         }
1419       else
1420         {
1421           clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
1422           clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
1423                             sizeof (ip4_full_reass_main.hash));
1424           clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
1425         }
1426     }
1427   return 0;
1428 }
1429
1430 vnet_api_error_t
1431 ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1432                     u32 * max_reassembly_length,
1433                     u32 * expire_walk_interval_ms)
1434 {
1435   *timeout_ms = ip4_full_reass_main.timeout_ms;
1436   *max_reassemblies = ip4_full_reass_main.max_reass_n;
1437   *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1438   *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1439   return 0;
1440 }
1441
1442 static clib_error_t *
1443 ip4_full_reass_init_function (vlib_main_t * vm)
1444 {
1445   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1446   clib_error_t *error = 0;
1447   u32 nbuckets;
1448   vlib_node_t *node;
1449
1450   rm->vlib_main = vm;
1451
1452   vec_validate (rm->per_thread_data, vlib_num_workers ());
1453   ip4_full_reass_per_thread_t *rt;
1454   vec_foreach (rt, rm->per_thread_data)
1455   {
1456     clib_spinlock_init (&rt->lock);
1457     pool_alloc (rt->pool, rm->max_reass_n);
1458   }
1459
1460   node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
1461   ASSERT (node);
1462   rm->ip4_full_reass_expire_node_idx = node->index;
1463
1464   ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
1465                              IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
1466                              IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
1467                              IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1468
1469   nbuckets = ip4_full_reass_get_nbuckets ();
1470   clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);
1471
1472   node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
1473   ASSERT (node);
1474   rm->ip4_drop_idx = node->index;
1475
1476   rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
1477   rm->fq_feature_index =
1478     vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);
1479
1480   rm->feature_use_refcount_per_intf = NULL;
1481   return error;
1482 }
1483
1484 VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1485 #endif /* CLIB_MARCH_VARIANT */
1486
1487 static uword
1488 ip4_full_reass_walk_expired (vlib_main_t * vm,
1489                              vlib_node_runtime_t * node, vlib_frame_t * f)
1490 {
1491   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1492   uword event_type, *event_data = 0;
1493
1494   while (true)
1495     {
1496       vlib_process_wait_for_event_or_clock (vm,
1497                                             (f64)
1498                                             rm->expire_walk_interval_ms /
1499                                             (f64) MSEC_PER_SEC);
1500       event_type = vlib_process_get_events (vm, &event_data);
1501
1502       switch (event_type)
1503         {
1504         case ~0:                /* no events => timeout */
1505           /* nothing to do here */
1506           break;
1507         case IP4_EVENT_CONFIG_CHANGED:
1508           break;
1509         default:
1510           clib_warning ("BUG: event type 0x%wx", event_type);
1511           break;
1512         }
1513       f64 now = vlib_time_now (vm);
1514
1515       ip4_full_reass_t *reass;
1516       int *pool_indexes_to_free = NULL;
1517
1518       uword thread_index = 0;
1519       int index;
1520       const uword nthreads = vlib_num_workers () + 1;
1521       for (thread_index = 0; thread_index < nthreads; ++thread_index)
1522         {
1523           ip4_full_reass_per_thread_t *rt =
1524             &rm->per_thread_data[thread_index];
1525           clib_spinlock_lock (&rt->lock);
1526
1527           vec_reset_length (pool_indexes_to_free);
1528           /* *INDENT-OFF* */
1529           pool_foreach_index (index, rt->pool, ({
1530                                 reass = pool_elt_at_index (rt->pool, index);
1531                                 if (now > reass->last_heard + rm->timeout)
1532                                   {
1533                                     vec_add1 (pool_indexes_to_free, index);
1534                                   }
1535                               }));
1536           /* *INDENT-ON* */
1537           int *i;
1538           /* *INDENT-OFF* */
1539           vec_foreach (i, pool_indexes_to_free)
1540           {
1541             ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
1542             ip4_full_reass_drop_all (vm, node, rm, reass);
1543             ip4_full_reass_free (rm, rt, reass);
1544           }
1545           /* *INDENT-ON* */
1546
1547           clib_spinlock_unlock (&rt->lock);
1548         }
1549
1550       vec_free (pool_indexes_to_free);
1551       if (event_data)
1552         {
1553           _vec_len (event_data) = 0;
1554         }
1555     }
1556
1557   return 0;
1558 }
1559
/*
 * Registration of the "ip4-full-reassembly-expire-walk" process node, which
 * runs ip4_full_reass_walk_expired(). The name is looked up by
 * ip4_full_reass_init_function().
 */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1571
1572 static u8 *
1573 format_ip4_full_reass_key (u8 * s, va_list * args)
1574 {
1575   ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1576   s =
1577     format (s,
1578             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1579             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1580             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1581   return s;
1582 }
1583
1584 static u8 *
1585 format_ip4_reass (u8 * s, va_list * args)
1586 {
1587   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1588   ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1589
1590   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1591               "last_packet_octet: %u, trace_op_counter: %u\n",
1592               reass->id, format_ip4_full_reass_key, &reass->key,
1593               reass->first_bi, reass->data_len,
1594               reass->last_packet_octet, reass->trace_op_counter);
1595
1596   u32 bi = reass->first_bi;
1597   u32 counter = 0;
1598   while (~0 != bi)
1599     {
1600       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1601       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1602       s =
1603         format (s,
1604                 "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1605                 "fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1606                 vnb->ip.reass.range_last, bi,
1607                 ip4_full_reass_buffer_get_data_offset (b),
1608                 ip4_full_reass_buffer_get_data_len (b),
1609                 vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1610       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1611         {
1612           bi = b->next_buffer;
1613         }
1614       else
1615         {
1616           bi = ~0;
1617         }
1618     }
1619   return s;
1620 }
1621
1622 static clib_error_t *
1623 show_ip4_reass (vlib_main_t * vm,
1624                 unformat_input_t * input,
1625                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1626 {
1627   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1628
1629   vlib_cli_output (vm, "---------------------");
1630   vlib_cli_output (vm, "IP4 reassembly status");
1631   vlib_cli_output (vm, "---------------------");
1632   bool details = false;
1633   if (unformat (input, "details"))
1634     {
1635       details = true;
1636     }
1637
1638   u32 sum_reass_n = 0;
1639   ip4_full_reass_t *reass;
1640   uword thread_index;
1641   const uword nthreads = vlib_num_workers () + 1;
1642   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1643     {
1644       ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1645       clib_spinlock_lock (&rt->lock);
1646       if (details)
1647         {
1648           /* *INDENT-OFF* */
1649           pool_foreach (reass, rt->pool, {
1650             vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
1651           });
1652           /* *INDENT-ON* */
1653         }
1654       sum_reass_n += rt->reass_n;
1655       clib_spinlock_unlock (&rt->lock);
1656     }
1657   vlib_cli_output (vm, "---------------------");
1658   vlib_cli_output (vm, "Current full IP4 reassemblies count: %lu\n",
1659                    (long unsigned) sum_reass_n);
1660   vlib_cli_output (vm,
1661                    "Maximum configured concurrent full IP4 reassemblies per worker-thread: %lu\n",
1662                    (long unsigned) rm->max_reass_n);
1663   vlib_cli_output (vm,
1664                    "Maximum configured full IP4 reassembly timeout: %lums\n",
1665                    (long unsigned) rm->timeout_ms);
1666   vlib_cli_output (vm,
1667                    "Maximum configured full IP4 reassembly expire walk interval: %lums\n",
1668                    (long unsigned) rm->expire_walk_interval_ms);
1669   return 0;
1670 }
1671
/* Binds the CLI path "show ip4-full-reassembly" to show_ip4_reass(). */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1679
#ifndef CLIB_MARCH_VARIANT
/*
 * Forwards sw_if_index and enable_disable to vnet_feature_enable_disable()
 * for "ip4-full-reassembly-feature" on the "ip4-unicast" arc and returns
 * its result unchanged. The two trailing arguments are passed as 0.
 */
vnet_api_error_t
ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
                                      "ip4-full-reassembly-feature",
                                      sw_if_index, enable_disable, 0, 0);
}
#endif /* CLIB_MARCH_VARIANT */
1689
1690
/*
 * X-macro list of handoff node errors as (symbol, description) pairs. It is
 * expanded below into both the error enum and the error string table.
 */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")
1693
1694
/*
 * Handoff error codes: one IP4_FULL_REASS_HANDOFF_ERROR_<sym> per entry of
 * foreach_ip4_full_reass_handoff_error, followed by the entry count.
 */
typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;
1702
/*
 * Description strings for the handoff errors, in the same order as the
 * enum generated from foreach_ip4_full_reass_handoff_error.
 */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1708
/* Trace record written by the handoff node for a traced buffer. */
typedef struct
{
  /* thread index the buffer is being handed to */
  u32 next_worker_index;
} ip4_full_reass_handoff_trace_t;
1713
1714 static u8 *
1715 format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1716 {
1717   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1718   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1719   ip4_full_reass_handoff_trace_t *t =
1720     va_arg (*args, ip4_full_reass_handoff_trace_t *);
1721
1722   s =
1723     format (s, "ip4-full-reassembly-handoff: next-worker %d",
1724             t->next_worker_index);
1725
1726   return s;
1727 }
1728
1729 always_inline uword
1730 ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
1731                                     vlib_node_runtime_t * node,
1732                                     vlib_frame_t * frame, bool is_feature)
1733 {
1734   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1735
1736   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1737   u32 n_enq, n_left_from, *from;
1738   u16 thread_indices[VLIB_FRAME_SIZE], *ti;
1739   u32 fq_index;
1740
1741   from = vlib_frame_vector_args (frame);
1742   n_left_from = frame->n_vectors;
1743   vlib_get_buffers (vm, from, bufs, n_left_from);
1744
1745   b = bufs;
1746   ti = thread_indices;
1747
1748   fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
1749
1750   while (n_left_from > 0)
1751     {
1752       ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
1753
1754       if (PREDICT_FALSE
1755           ((node->flags & VLIB_NODE_FLAG_TRACE)
1756            && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
1757         {
1758           ip4_full_reass_handoff_trace_t *t =
1759             vlib_add_trace (vm, node, b[0], sizeof (*t));
1760           t->next_worker_index = ti[0];
1761         }
1762
1763       n_left_from -= 1;
1764       ti += 1;
1765       b += 1;
1766     }
1767   n_enq =
1768     vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
1769                                    frame->n_vectors, 1);
1770
1771   if (n_enq < frame->n_vectors)
1772     vlib_node_increment_counter (vm, node->node_index,
1773                                  IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
1774                                  frame->n_vectors - n_enq);
1775   return frame->n_vectors;
1776 }
1777
1778 VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
1779                                             vlib_node_runtime_t * node,
1780                                             vlib_frame_t * frame)
1781 {
1782   return ip4_full_reass_handoff_node_inline (vm, node, frame,
1783                                              false /* is_feature */ );
1784 }
1785
1786
1787 /* *INDENT-OFF* */
1788 VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
1789   .name = "ip4-full-reassembly-handoff",
1790   .vector_size = sizeof (u32),
1791   .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
1792   .error_strings = ip4_full_reass_handoff_error_strings,
1793   .format_trace = format_ip4_full_reass_handoff_trace,
1794
1795   .n_next_nodes = 1,
1796
1797   .next_nodes = {
1798     [0] = "error-drop",
1799   },
1800 };
1801 /* *INDENT-ON* */
1802
1803
1804 /* *INDENT-OFF* */
1805 VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
1806                                                     vlib_node_runtime_t *
1807                                                     node,
1808                                                     vlib_frame_t * frame)
1809 {
1810   return ip4_full_reass_handoff_node_inline (vm, node, frame,
1811                                              true /* is_feature */ );
1812 }
1813 /* *INDENT-ON* */
1814
1815
1816 /* *INDENT-OFF* */
1817 VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
1818   .name = "ip4-full-reass-feature-hoff",
1819   .vector_size = sizeof (u32),
1820   .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
1821   .error_strings = ip4_full_reass_handoff_error_strings,
1822   .format_trace = format_ip4_full_reass_handoff_trace,
1823
1824   .n_next_nodes = 1,
1825
1826   .next_nodes = {
1827     [0] = "error-drop",
1828   },
1829 };
1830 /* *INDENT-ON* */
1831
1832 #ifndef CLIB_MARCH_VARIANT
1833 int
1834 ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1835 {
1836   ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1837   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1838   if (is_enable)
1839     {
1840       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1841         {
1842           ++rm->feature_use_refcount_per_intf[sw_if_index];
1843           return vnet_feature_enable_disable ("ip4-unicast",
1844                                               "ip4-full-reassembly-feature",
1845                                               sw_if_index, 1, 0, 0);
1846         }
1847       ++rm->feature_use_refcount_per_intf[sw_if_index];
1848     }
1849   else
1850     {
1851       --rm->feature_use_refcount_per_intf[sw_if_index];
1852       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1853         return vnet_feature_enable_disable ("ip4-unicast",
1854                                             "ip4-full-reassembly-feature",
1855                                             sw_if_index, 0, 0, 0);
1856     }
1857   return -1;
1858 }
1859 #endif
1860
1861 /*
1862  * fd.io coding-style-patch-verification: ON
1863  *
1864  * Local Variables:
1865  * eval: (c-set-style "gnu")
1866  * End:
1867  */