/*
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @file
 * @brief IPv4 Shallow Virtual Reassembly.
 *
 * This file contains the source code for IPv4 shallow virtual reassembly:
 * fragments are never actually reassembled into a single packet. Instead,
 * the L4 information from the first fragment (protocol, ports, TCP flags
 * and seq/ack numbers) is stamped into the buffer metadata of every
 * fragment, and each fragment is then forwarded on its own.
 */

#include <vppinfra/vec.h>
#include <vnet/vnet.h>
#include <vnet/ip/ip.h>
#include <vnet/ip/ip4_to_ip6.h>
#include <vppinfra/fifo.h>
#include <vppinfra/bihash_16_8.h>
#include <vnet/ip/reass/ip4_sv_reass.h>

#define MSEC_PER_SEC 1000
#define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds
#define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
#define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
#define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)

typedef enum
{
  IP4_SV_REASS_RC_OK,
  IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS,
  IP4_SV_REASS_RC_UNSUPP_IP_PROTO,
} ip4_sv_reass_rc_t;

typedef struct
{
  union
  {
    struct
    {
      u32 fib_index;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_sv_reass_key_t;
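
/*
 * Note: the anonymous struct above must stay exactly 16 bytes so that it
 * overlays as_u64[2]; the slow path below packs the key as
 *   as_u64[0] = fib_index (or custom context) | src << 32
 *   as_u64[1] = dst | frag_id << 32 | proto << 48
 * which is what makes the clib_bihash_16_8 lookups work.
 */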

typedef union
{
  struct
  {
    u32 reass_index;
    u32 thread_index;
  };
  u64 as_u64;
} ip4_sv_reass_val_t;

typedef union
{
  struct
  {
    ip4_sv_reass_key_t k;
    ip4_sv_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_sv_reass_kv_t;

typedef struct
{
  // hash table key
  ip4_sv_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // trace operation counter
  u32 trace_op_counter;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // buffer indexes of buffers in this reassembly in chronological order -
  // including overlaps and duplicate fragments
  u32 *cached_buffers;
  // set to true when this reassembly is completed
  bool is_complete;
  // ip protocol
  u8 ip_proto;
  u8 icmp_type_or_tcp_flags;
  u32 tcp_ack_number;
  u32 tcp_seq_number;
  // l4 src port
  u16 l4_src_port;
  // l4 dst port
  u16 l4_dst_port;
  u32 next_index;
  // lru indexes
  u32 lru_prev;
  u32 lru_next;
} ip4_sv_reass_t;

typedef struct
{
  ip4_sv_reass_t *pool;
  u32 reass_n;
  u32 id_counter;
  clib_spinlock_t lock;
  // lru indexes
  u32 lru_first;
  u32 lru_last;
} ip4_sv_reass_per_thread_t;

typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_sv_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_sv_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;
  u32 fq_custom_context_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // reference count for enabling/disabling output feature - per interface
  u32 *output_feature_use_refcount_per_intf;

} ip4_sv_reass_main_t;

extern ip4_sv_reass_main_t ip4_sv_reass_main;

#ifndef CLIB_MARCH_VARIANT
ip4_sv_reass_main_t ip4_sv_reass_main;
#endif /* CLIB_MARCH_VARIANT */

typedef enum
{
  IP4_SV_REASSEMBLY_NEXT_INPUT,
  IP4_SV_REASSEMBLY_NEXT_DROP,
  IP4_SV_REASSEMBLY_NEXT_HANDOFF,
  IP4_SV_REASSEMBLY_N_NEXT,
} ip4_sv_reass_next_t;

typedef enum
{
  REASS_FRAGMENT_CACHE,
  REASS_FINISH,
  REASS_FRAGMENT_FORWARD,
  REASS_PASSTHROUGH,
} ip4_sv_reass_trace_operation_e;

typedef struct
{
  ip4_sv_reass_trace_operation_e action;
  u32 reass_id;
  u32 op_id;
  u8 ip_proto;
  u16 l4_src_port;
  u16 l4_dst_port;
  int l4_layer_truncated;
} ip4_sv_reass_trace_t;

extern vlib_node_registration_t ip4_sv_reass_node;
extern vlib_node_registration_t ip4_sv_reass_node_feature;

static u8 *
format_ip4_sv_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
  if (REASS_PASSTHROUGH != t->action)
    {
      s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
    }
  switch (t->action)
    {
    case REASS_FRAGMENT_CACHE:
      s = format (s, "[cached]");
      break;
    case REASS_FINISH:
      s =
        format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
                t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
                clib_net_to_host_u16 (t->l4_dst_port));
      break;
    case REASS_FRAGMENT_FORWARD:
      s =
        format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
                t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
                clib_net_to_host_u16 (t->l4_dst_port));
      break;
    case REASS_PASSTHROUGH:
      s = format (s, "[not-fragmented]");
      break;
    }
  if (t->l4_layer_truncated)
    {
      s = format (s, " [l4-layer-truncated]");
    }
  return s;
}

static void
ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
                        ip4_sv_reass_t *reass, u32 bi,
                        ip4_sv_reass_trace_operation_e action, u32 ip_proto,
                        u16 l4_src_port, u16 l4_dst_port,
                        int l4_layer_truncated)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  if (pool_is_free_index
      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_sv_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      ++reass->trace_op_counter;
    }
  t->action = action;
  t->ip_proto = ip_proto;
  t->l4_src_port = l4_src_port;
  t->l4_dst_port = l4_dst_port;
  t->l4_layer_truncated = l4_layer_truncated;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}

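/*
 * Drop a reassembly context: delete its bihash entry, free any cached
 * buffers, unlink it from the per-thread LRU list and return it to the
 * pool. All callers hold the per-thread lock.
 */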
always_inline void
ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                   ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
{
  clib_bihash_kv_16_8_t kv;
  kv.key[0] = reass->key.as_u64[0];
  kv.key[1] = reass->key.as_u64[1];
  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
  vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
  vec_free (reass->cached_buffers);
  reass->cached_buffers = NULL;
  if (~0 != reass->lru_prev)
    {
      ip4_sv_reass_t *lru_prev =
        pool_elt_at_index (rt->pool, reass->lru_prev);
      lru_prev->lru_next = reass->lru_next;
    }
  if (~0 != reass->lru_next)
    {
      ip4_sv_reass_t *lru_next =
        pool_elt_at_index (rt->pool, reass->lru_next);
      lru_next->lru_prev = reass->lru_prev;
    }
  if (rt->lru_first == reass - rt->pool)
    {
      rt->lru_first = reass->lru_next;
    }
  if (rt->lru_last == reass - rt->pool)
    {
      rt->lru_last = reass->lru_prev;
    }
  pool_put (rt->pool, reass);
  --rt->reass_n;
}

always_inline void
ip4_sv_reass_init (ip4_sv_reass_t * reass)
{
  reass->cached_buffers = NULL;
  reass->is_complete = false;
}

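/*
 * Look up the reassembly matching *kv, or create a new one. Returns NULL
 * and sets *do_handoff when the reassembly is owned by another thread;
 * when at capacity, the least recently used reassembly is evicted to make
 * room for the new one.
 */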
always_inline ip4_sv_reass_t *
ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                             ip4_sv_reass_per_thread_t * rt,
                             ip4_sv_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_sv_reass_t *reass = NULL;
  f64 now = vlib_time_now (vm);

again:

  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      if (now > reass->last_heard + rm->timeout)
        {
          ip4_sv_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
    {
      reass = pool_elt_at_index (rt->pool, rt->lru_first);
      ip4_sv_reass_free (vm, rm, rt, reass);
    }

  pool_get (rt->pool, reass);
  clib_memset (reass, 0, sizeof (*reass));
  reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
  ++rt->id_counter;
  ip4_sv_reass_init (reass);
  ++rt->reass_n;
  reass->lru_prev = reass->lru_next = ~0;

  if (~0 != rt->lru_last)
    {
      ip4_sv_reass_t *lru_last = pool_elt_at_index (rt->pool, rt->lru_last);
      reass->lru_prev = rt->lru_last;
      lru_last->lru_next = rt->lru_last = reass - rt->pool;
    }

  if (~0 == rt->lru_first)
    {
      rt->lru_first = rt->lru_last = reass - rt->pool;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
  if (rv)
    {
      ip4_sv_reass_free (vm, rm, rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}

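/*
 * Feed one fragment into the reassembly. The first fragment (offset 0)
 * carries the L4 header, so seeing it completes the shallow reassembly;
 * until then every fragment is cached on the reassembly context.
 */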
always_inline ip4_sv_reass_rc_t
ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
                     ip4_sv_reass_main_t *rm, ip4_header_t *ip0,
                     ip4_sv_reass_t *reass, u32 bi0)
{
  vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
  ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
  const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
  if (0 == fragment_first)
    {
      reass->ip_proto = ip0->protocol;
      reass->l4_src_port = ip4_get_port (ip0, 1);
      reass->l4_dst_port = ip4_get_port (ip0, 0);
      if (!reass->l4_src_port || !reass->l4_dst_port)
        return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
      if (IP_PROTOCOL_TCP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags = ((tcp_header_t *) (ip0 + 1))->flags;
          reass->tcp_ack_number = ((tcp_header_t *) (ip0 + 1))->ack_number;
          reass->tcp_seq_number = ((tcp_header_t *) (ip0 + 1))->seq_number;
        }
      else if (IP_PROTOCOL_ICMP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags =
            ((icmp46_header_t *) (ip0 + 1))->type;
        }
      reass->is_complete = true;
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FINISH, reass->ip_proto,
            reass->l4_src_port, reass->l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
    }
  vec_add1 (reass->cached_buffers, bi0);
  if (!reass->is_complete)
    {
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FRAGMENT_CACHE, ~0, ~0, ~0,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
      if (vec_len (reass->cached_buffers) > rm->max_reass_len)
        {
          rc = IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS;
        }
    }
  return rc;
}

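/*
 * Check whether the first buffer actually contains a complete L4 header
 * for the protocols we parse; if it does not, the port/flag fields cannot
 * be trusted and are zeroed by the callers below.
 */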
always_inline int
l4_layer_truncated (ip4_header_t *ip)
{
  static const int l4_layer_length[256] = {
    [IP_PROTOCOL_TCP] = sizeof (tcp_header_t),
    [IP_PROTOCOL_UDP] = sizeof (udp_header_t),
    [IP_PROTOCOL_ICMP] = sizeof (icmp46_header_t),
  };

  return ((u8 *) ip + ip4_header_bytes (ip) + l4_layer_length[ip->protocol] >
          (u8 *) ip + clib_net_to_host_u16 (ip->length));
}

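/*
 * Main node body, specialized at compile time by the bool flags: feature
 * arc vs. standalone node, input vs. output direction, caller-supplied
 * next node, and an optional 32-bit custom context carried in the frame
 * aux data. Non-fragmented packets take the fast path two at a time; the
 * first fragment seen drops the frame into the slow path below.
 */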
always_inline uword
ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                     vlib_frame_t *frame, bool is_feature,
                     bool is_output_feature, bool is_custom,
                     bool with_custom_context)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, *to_next_aux, next_index;
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  u32 *context;
  if (with_custom_context)
    context = vlib_frame_aux_args (frame);

  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  vlib_get_buffers (vm, from, bufs, n_left_from);
  u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
  b = bufs;

  /* optimistic case first - no fragments */
  while (n_left_from >= 2)
    {
      vlib_buffer_t *b0, *b1;
      u32 next0, next1;
      b0 = *b;
      b++;
      b1 = *b;
      b++;

      /* Prefetch next iteration. */
      if (PREDICT_TRUE (n_left_from >= 4))
        {
          vlib_buffer_t *p2, *p3;

          p2 = *b;
          p3 = *(b + 1);

          vlib_prefetch_buffer_header (p2, LOAD);
          vlib_prefetch_buffer_header (p3, LOAD);

          clib_prefetch_load (p2->data);
          clib_prefetch_load (p3->data);
        }

      ip4_header_t *ip0 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b0)->
                                     ip.save_rewrite_length);
      ip4_header_t *ip1 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b1),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b1)->
                                     ip.save_rewrite_length);

      if (PREDICT_FALSE
          (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
          || (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
        {
          // fragment found, go slow path
          b -= 2;
          if (b - bufs > 0)
            {
              vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                                           b - bufs);
            }
          goto slow_path;
        }
      if (is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
      else
        {
          next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
      if (l4_layer_truncated (ip0))
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
          vnet_buffer (b0)->ip.reass.l4_src_port = 0;
          vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
        }
      else
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
          if (IP_PROTOCOL_TCP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((tcp_header_t *) (ip0 + 1))->flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
                ((tcp_header_t *) (ip0 + 1))->ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
                ((tcp_header_t *) (ip0 + 1))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((icmp46_header_t *) (ip0 + 1))->type;
            }
          vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
          vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
        }
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, NULL, from[(b - 2) - bufs], REASS_PASSTHROUGH,
            vnet_buffer (b0)->ip.reass.ip_proto,
            vnet_buffer (b0)->ip.reass.l4_src_port,
            vnet_buffer (b0)->ip.reass.l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
      if (is_feature)
        {
          vnet_feature_next (&next1, b1);
        }
      else
        {
          next1 = is_custom ? vnet_buffer (b1)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
      if (l4_layer_truncated (ip1))
        {
          vnet_buffer (b1)->ip.reass.l4_layer_truncated = 1;
          vnet_buffer (b1)->ip.reass.l4_src_port = 0;
          vnet_buffer (b1)->ip.reass.l4_dst_port = 0;
        }
      else
        {
          vnet_buffer (b1)->ip.reass.l4_layer_truncated = 0;
          if (IP_PROTOCOL_TCP == ip1->protocol)
            {
              vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
                ((tcp_header_t *) (ip1 + 1))->flags;
              vnet_buffer (b1)->ip.reass.tcp_ack_number =
                ((tcp_header_t *) (ip1 + 1))->ack_number;
              vnet_buffer (b1)->ip.reass.tcp_seq_number =
                ((tcp_header_t *) (ip1 + 1))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip1->protocol)
            {
              vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
                ((icmp46_header_t *) (ip1 + 1))->type;
            }
          vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
          vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
        }
      if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
            vnet_buffer (b1)->ip.reass.ip_proto,
            vnet_buffer (b1)->ip.reass.l4_src_port,
            vnet_buffer (b1)->ip.reass.l4_dst_port,
            vnet_buffer (b1)->ip.reass.l4_layer_truncated);
        }

      n_left_from -= 2;
      next[0] = next0;
      next[1] = next1;
      next += 2;
      if (with_custom_context)
        context += 2;
    }

  while (n_left_from > 0)
    {
      vlib_buffer_t *b0;
      u32 next0;
      b0 = *b;
      b++;

      ip4_header_t *ip0 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b0)->
                                     ip.save_rewrite_length);
      if (PREDICT_FALSE
          (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0)))
        {
          // fragment found, go slow path
          b -= 1;
          if (b - bufs > 0)
            {
              vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                                           b - bufs);
            }
          goto slow_path;
        }
      if (is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
      else
        {
          next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
      if (l4_layer_truncated (ip0))
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
          vnet_buffer (b0)->ip.reass.l4_src_port = 0;
          vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
        }
      else
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
          if (IP_PROTOCOL_TCP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((tcp_header_t *) (ip0 + 1))->flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
                ((tcp_header_t *) (ip0 + 1))->ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
                ((tcp_header_t *) (ip0 + 1))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((icmp46_header_t *) (ip0 + 1))->type;
            }
          vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
          vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
        }
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
            vnet_buffer (b0)->ip.reass.ip_proto,
            vnet_buffer (b0)->ip.reass.l4_src_port,
            vnet_buffer (b0)->ip.reass.l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }

      n_left_from -= 1;
      next[0] = next0;
      next += 1;
      if (with_custom_context)
        context += 1;
    }

  vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                               frame->n_vectors);

  goto done;

slow_path:

  from += b - bufs;

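  /*
   * from now points at the first buffer the fast path did not enqueue;
   * everything before it has already been handed to the next nodes.
   */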
  while (n_left_from > 0)
    {
      if (with_custom_context)
        vlib_get_next_frame_with_aux_safe (vm, node, next_index, to_next,
                                           to_next_aux, n_left_to_next);
      else
        vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;
          u8 forward_context = 0;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 =
            (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                         (is_output_feature ? 1 : 0) *
                                         vnet_buffer (b0)->
                                         ip.save_rewrite_length);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a regular packet - no fragmentation
              if (is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              else
                {
                  next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
              vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
              if (l4_layer_truncated (ip0))
                {
                  vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
                  vnet_buffer (b0)->ip.reass.l4_src_port = 0;
                  vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
                }
              else
                {
                  vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
                  if (IP_PROTOCOL_TCP == ip0->protocol)
                    {
                      vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                        ((tcp_header_t *) (ip0 + 1))->flags;
                      vnet_buffer (b0)->ip.reass.tcp_ack_number =
                        ((tcp_header_t *) (ip0 + 1))->ack_number;
                      vnet_buffer (b0)->ip.reass.tcp_seq_number =
                        ((tcp_header_t *) (ip0 + 1))->seq_number;
                    }
                  else if (IP_PROTOCOL_ICMP == ip0->protocol)
                    {
                      vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                        ((icmp46_header_t *) (ip0 + 1))->type;
                    }
                  vnet_buffer (b0)->ip.reass.l4_src_port =
                    ip4_get_port (ip0, 1);
                  vnet_buffer (b0)->ip.reass.l4_dst_port =
                    ip4_get_port (ip0, 0);
                }
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (
                    vm, node, NULL, bi0, REASS_PASSTHROUGH,
                    vnet_buffer (b0)->ip.reass.ip_proto,
                    vnet_buffer (b0)->ip.reass.l4_src_port,
                    vnet_buffer (b0)->ip.reass.l4_dst_port,
                    vnet_buffer (b0)->ip.reass.l4_layer_truncated);
                }
              goto packet_enqueue;
            }
          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
          // 8 is minimum fragment length per RFC 791
          if (fragment_first > fragment_last ||
              fragment_first + fragment_length > UINT16_MAX - 20 ||
              (fragment_length < 8 && ip4_get_fragment_more (ip0)))
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              b0->error = node->errors[error0];
              goto packet_enqueue;
            }
          ip4_sv_reass_kv_t kv;
          u8 do_handoff = 0;

          if (with_custom_context)
            kv.k.as_u64[0] =
              (u64) *context | (u64) ip0->src_address.as_u32 << 32;
          else
            kv.k.as_u64[0] =
              (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                             vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
              (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] = (u64) ip0->dst_address.as_u32 |
                           (u64) ip0->fragment_id << 32 |
                           (u64) ip0->protocol << 48;

          ip4_sv_reass_t *reass =
            ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

          if (PREDICT_FALSE (do_handoff))
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.thread_index;
              if (with_custom_context)
                forward_context = 1;
              goto packet_enqueue;
            }

          if (!reass)
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
              b0->error = node->errors[error0];
              goto packet_enqueue;
            }

          if (reass->is_complete)
            {
              if (is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              else
                {
                  next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment =
                ! !fragment_first;
              vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                reass->icmp_type_or_tcp_flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
                reass->tcp_ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
                reass->tcp_seq_number;
              vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
              vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (
                    vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
                    reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
                    vnet_buffer (b0)->ip.reass.l4_layer_truncated);
                }
              goto packet_enqueue;
            }

          ip4_sv_reass_rc_t rc =
            ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0);
          u32 counter = ~0;
          switch (rc)
            {
            case IP4_SV_REASS_RC_OK:
              /* nothing to do here */
              break;
            case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
              counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
              break;
            case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
              counter = IP4_ERROR_REASS_UNSUPP_IP_PROT;
              break;
            }
          if (~0 != counter)
            {
              vlib_node_increment_counter (vm, node->node_index, counter, 1);
              ip4_sv_reass_free (vm, rm, rt, reass);
              goto next_packet;
            }
          if (reass->is_complete)
            {
              u32 idx;
              vec_foreach_index (idx, reass->cached_buffers)
                {
                  u32 bi0 = vec_elt (reass->cached_buffers, idx);
                  vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
                  ip0 =
                    (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                                 (is_output_feature ? 1 : 0) *
                                                 vnet_buffer (b0)->
                                                 ip.save_rewrite_length);
                  u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                  if (is_feature)
                    {
                      vnet_feature_next (&next0, b0);
                    }
                  if (is_custom)
                    {
                      next0 = vnet_buffer (b0)->ip.reass.next_index;
                    }
                  if (0 == n_left_to_next)
                    {
                      vlib_put_next_frame (vm, node, next_index,
                                           n_left_to_next);
                      vlib_get_next_frame (vm, node, next_index, to_next,
                                           n_left_to_next);
                    }
                  to_next[0] = bi0;
                  to_next += 1;
                  n_left_to_next -= 1;
                  vnet_buffer (b0)->ip.reass.is_non_first_fragment =
                    ! !ip4_get_fragment_offset (ip0);
                  vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
                  vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                    reass->icmp_type_or_tcp_flags;
                  vnet_buffer (b0)->ip.reass.tcp_ack_number =
                    reass->tcp_ack_number;
                  vnet_buffer (b0)->ip.reass.tcp_seq_number =
                    reass->tcp_seq_number;
                  vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
                  vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
                  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                    {
                      ip4_sv_reass_add_trace (
                        vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
                        reass->ip_proto, reass->l4_src_port,
                        reass->l4_dst_port,
                        vnet_buffer (b0)->ip.reass.l4_layer_truncated);
                    }
                  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                                   to_next, n_left_to_next,
                                                   bi0, next0);
                }
              vec_set_len (reass->cached_buffers,
                           0); // buffers are owned by frame now
            }
          goto next_packet;

        packet_enqueue:
          to_next[0] = bi0;
          to_next += 1;
          n_left_to_next -= 1;
          if (is_feature && IP4_ERROR_NONE == error0)
            {
              b0 = vlib_get_buffer (vm, bi0);
              vnet_feature_next (&next0, b0);
            }
          if (with_custom_context && forward_context)
            {
              if (to_next_aux)
                {
                  to_next_aux[0] = *context;
                  to_next_aux += 1;
                }
              vlib_validate_buffer_enqueue_with_aux_x1 (
                vm, node, next_index, to_next, to_next_aux, n_left_to_next,
                bi0, *context, next0);
            }
          else
            vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                             n_left_to_next, bi0, next0);

        next_packet:
          from += 1;
          n_left_from -= 1;
          if (with_custom_context)
            context += 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

done:
  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}

VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
                                  vlib_node_runtime_t * node,
                                  vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
    false /* is_custom */, false /* with_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
    .name = "ip4-sv-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",
        },
};

VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, true /* is_feature */, false /* is_output_feature */,
    false /* is_custom */, false /* with_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
    .name = "ip4-sv-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};

VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-sv-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};

VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
                                                 vlib_node_runtime_t * node,
                                                 vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, true /* is_feature */, true /* is_output_feature */,
    false /* is_custom */, false /* with_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
    .name = "ip4-sv-reassembly-output-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};

VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
    .arc_name = "ip4-output",
    .node_name = "ip4-sv-reassembly-output-feature",
    .runs_before = 0,
    .runs_after = 0,
};

VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
    .name = "ip4-sv-reassembly-custom-next",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",
        },
};

VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
                                         vlib_node_runtime_t * node,
                                         vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
    true /* is_custom */, false /* with_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_node) = {
    .name = "ip4-sv-reassembly-custom-context",
    .vector_size = sizeof (u32),
    .aux_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] =
                    "ip4-sv-reassembly-custom-context-handoff",
        },
};

VLIB_NODE_FN (ip4_sv_reass_custom_context_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
    true /* is_custom */, true /* with_custom_context */);
}

#ifndef CLIB_MARCH_VARIANT
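/*
 * Size the bihash so that the configured maximum number of reassemblies
 * stays under the target load factor, rounded up to a power of two.
 */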
always_inline u32
ip4_sv_reass_get_nbuckets ()
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  u32 nbuckets;
  u8 i;

  nbuckets = (u32) (rm->max_reass_n / IP4_SV_REASS_HT_LOAD_FACTOR);

  for (i = 0; i < 31; i++)
    if ((1 << i) >= nbuckets)
      break;
  nbuckets = 1 << i;

  return nbuckets;
}
#endif /* CLIB_MARCH_VARIANT */

typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_sv_reass_event_t;

typedef struct
{
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;

#ifndef CLIB_MARCH_VARIANT
static int
ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
{
  ip4_rehash_cb_ctx *ctx = _ctx;
  if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
    {
      ctx->failure = 1;
    }
  return (BIHASH_WALK_CONTINUE);
}

static void
ip4_sv_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
                         u32 max_reassembly_length,
                         u32 expire_walk_interval_ms)
{
  ip4_sv_reass_main.timeout_ms = timeout_ms;
  ip4_sv_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
  ip4_sv_reass_main.max_reass_n = max_reassemblies;
  ip4_sv_reass_main.max_reass_len = max_reassembly_length;
  ip4_sv_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
}

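/*
 * Apply new runtime parameters. Growing max_reassemblies may require a
 * larger hash table; in that case existing entries are rehashed into a
 * new bihash, and on rehash failure the old table is kept and an error
 * is returned.
 */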
vnet_api_error_t
ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
                  u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
  ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
                           max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
                             ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_sv_reass_get_nbuckets ();
  if (ip4_sv_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_sv_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_sv_reass_main.hash);
          clib_memcpy_fast (&ip4_sv_reass_main.hash, &new_hash,
                            sizeof (ip4_sv_reass_main.hash));
          clib_bihash_copied (&ip4_sv_reass_main.hash, &new_hash);
        }
    }
  return 0;
}

vnet_api_error_t
ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
                  u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
{
  *timeout_ms = ip4_sv_reass_main.timeout_ms;
  *max_reassemblies = ip4_sv_reass_main.max_reass_n;
  *max_reassembly_length = ip4_sv_reass_main.max_reass_len;
  *expire_walk_interval_ms = ip4_sv_reass_main.expire_walk_interval_ms;
  return 0;
}

static clib_error_t *
ip4_sv_reass_init_function (vlib_main_t * vm)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_sv_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
    {
      clib_spinlock_init (&rt->lock);
      pool_alloc (rt->pool, rm->max_reass_n);
      rt->lru_first = rt->lru_last = ~0;
    }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_sv_reass_expire_node_idx = node->index;

  ip4_sv_reass_set_params (IP4_SV_REASS_TIMEOUT_DEFAULT_MS,
                           IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT,
                           IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                           IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_sv_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);
  rm->fq_custom_context_index =
    vlib_frame_queue_main_init (ip4_sv_reass_custom_context_node.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  rm->output_feature_use_refcount_per_intf = NULL;

  return error;
}

VLIB_INIT_FUNCTION (ip4_sv_reass_init_function);
#endif /* CLIB_MARCH_VARIANT */

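/*
 * Process node: wakes up every expire_walk_interval_ms (or on a config
 * change event) and frees every reassembly that has not seen a fragment
 * within the configured timeout.
 */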
static uword
ip4_sv_reass_walk_expired (vlib_main_t *vm,
                           CLIB_UNUSED (vlib_node_runtime_t *node),
                           CLIB_UNUSED (vlib_frame_t *f))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:
          /* no events => timeout */
          /* fallthrough */
        case IP4_EVENT_CONFIG_CHANGED:
          /* nothing to do here */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_sv_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          vec_reset_length (pool_indexes_to_free);
          pool_foreach_index (index, rt->pool)
            {
              reass = pool_elt_at_index (rt->pool, index);
              if (now > reass->last_heard + rm->timeout)
                {
                  vec_add1 (pool_indexes_to_free, index);
                }
            }
          int *i;
          vec_foreach (i, pool_indexes_to_free)
            {
              ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
              ip4_sv_reass_free (vm, rm, rt, reass);
            }

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          vec_set_len (event_data, 0);
        }
    }

  return 0;
}

VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
  .function = ip4_sv_reass_walk_expired,
  .type = VLIB_NODE_TYPE_PROCESS,
  .name = "ip4-sv-reassembly-expire-walk",
  .format_trace = format_ip4_sv_reass_trace,
  .n_errors = IP4_N_ERROR,
  .error_counters = ip4_error_counters,
};

static u8 *
format_ip4_sv_reass_key (u8 * s, va_list * args)
{
  ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
  s =
    format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
            key->fib_index, format_ip4_address, &key->src, format_ip4_address,
            &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}

static u8 *
format_ip4_sv_reass (u8 * s, va_list * args)
{
  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
  ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);

  s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
              reass->id, format_ip4_sv_reass_key, &reass->key,
              reass->trace_op_counter);

  vlib_buffer_t *b;
  u32 *bip;
  u32 counter = 0;
  vec_foreach (bip, reass->cached_buffers)
    {
      u32 bi = *bip;
      do
        {
          b = vlib_get_buffer (vm, bi);
          s = format (s, "  #%03u: bi: %u, ", counter, bi);
          ++counter;
          bi = b->next_buffer;
        }
      while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
    }
  return s;
}

static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_sv_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          pool_foreach (reass, rt->pool)
            {
              vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
            }
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent shallow virtual IP4 "
                   "reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured amount of fragments per shallow "
                   "virtual IP4 reassembly: %lu\n",
                   (long unsigned) rm->max_reass_len);
  vlib_cli_output (vm,
                   "Maximum configured shallow virtual IP4 reassembly "
                   "timeout: %lums\n",
                   (long unsigned) rm->timeout_ms);
  vlib_cli_output (vm,
                   "Maximum configured shallow virtual IP4 reassembly expire "
                   "walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
  return 0;
}

VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
    .path = "show ip4-sv-reassembly",
    .short_help = "show ip4-sv-reassembly [details]",
    .function = show_ip4_reass,
};

#ifndef CLIB_MARCH_VARIANT
vnet_api_error_t
ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index,
                                                  enable_disable);
}
#endif /* CLIB_MARCH_VARIANT */

#define foreach_ip4_sv_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")

typedef enum
{
#define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_sv_reass_handoff_error
#undef _
    IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_sv_reass_handoff_error_t;

static char *ip4_sv_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_sv_reass_handoff_error
#undef _
};

typedef struct
{
  u32 next_worker_index;
} ip4_sv_reass_handoff_trace_t;

static u8 *
format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_sv_reass_handoff_trace_t *t =
    va_arg (*args, ip4_sv_reass_handoff_trace_t *);

  s =
    format (s, "ip4-sv-reassembly-handoff: next-worker %d",
            t->next_worker_index);

  return s;
}

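/*
 * Hand fragments over to the worker thread that owns their reassembly,
 * as recorded in ip.reass.owner_thread_index by the main node above.
 */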
always_inline uword
ip4_sv_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                                  vlib_frame_t *frame, bool is_feature,
                                  bool is_custom_context)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from, *context;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  if (is_custom_context)
    context = vlib_frame_aux_args (frame);

  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index :
                                  (is_custom_context ? rm->fq_custom_context_index :
                                                       rm->fq_index);

  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_sv_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  if (is_custom_context)
    n_enq = vlib_buffer_enqueue_to_thread_with_aux (
      vm, node, fq_index, from, context, thread_indices, frame->n_vectors, 1);
  else
    n_enq = vlib_buffer_enqueue_to_thread (
      vm, node, fq_index, from, thread_indices, frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}

VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (
    vm, node, frame, false /* is_feature */, false /* is_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
  .name = "ip4-sv-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

VLIB_NODE_FN (ip4_sv_reass_custom_context_handoff_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip4_sv_reass_handoff_node_inline (
    vm, node, frame, false /* is_feature */, true /* is_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_handoff_node) = {
  .name = "ip4-sv-reassembly-custom-context-handoff",
  .vector_size = sizeof (u32),
  .aux_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
                                                  vlib_node_runtime_t * node,
                                                  vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (
    vm, node, frame, true /* is_feature */, false /* is_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
  .name = "ip4-sv-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

#ifndef CLIB_MARCH_VARIANT
int
ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
  if (is_enable)
    {
      if (!rm->feature_use_refcount_per_intf[sw_if_index])
        {
          ++rm->feature_use_refcount_per_intf[sw_if_index];
          return vnet_feature_enable_disable ("ip4-unicast",
                                              "ip4-sv-reassembly-feature",
                                              sw_if_index, 1, 0, 0);
        }
      ++rm->feature_use_refcount_per_intf[sw_if_index];
    }
  else
    {
      if (rm->feature_use_refcount_per_intf[sw_if_index])
        --rm->feature_use_refcount_per_intf[sw_if_index];
      if (!rm->feature_use_refcount_per_intf[sw_if_index])
        return vnet_feature_enable_disable ("ip4-unicast",
                                            "ip4-sv-reassembly-feature",
                                            sw_if_index, 0, 0, 0);
    }
  return 0;
}

uword
ip4_sv_reass_custom_register_next_node (uword node_index)
{
  return vlib_node_add_next (vlib_get_main (), ip4_sv_reass_custom_node.index,
                             node_index);
}

uword
ip4_sv_reass_custom_context_register_next_node (uword node_index)
{
  return vlib_node_add_next (
    vlib_get_main (), ip4_sv_reass_custom_context_node.index, node_index);
}

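/*
 * Typical use of the custom node from a plugin (a sketch only; "my-node"
 * and my_node are hypothetical):
 *
 *   u32 next = ip4_sv_reass_custom_register_next_node (my_node.index);
 *   ...
 *   // per packet, before sending to "ip4-sv-reassembly-custom-next":
 *   vnet_buffer (b)->ip.reass.next_index = next;
 *
 * Once the first fragment has been seen, each fragment then arrives at
 * "my-node" with ip.reass.l4_src_port etc. already filled in.
 */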
int
ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
                                                int is_enable)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  vec_validate (rm->output_feature_use_refcount_per_intf, sw_if_index);
  if (is_enable)
    {
      if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
        {
          ++rm->output_feature_use_refcount_per_intf[sw_if_index];
          return vnet_feature_enable_disable ("ip4-output",
                                              "ip4-sv-reassembly-output-feature",
                                              sw_if_index, 1, 0, 0);
        }
      ++rm->output_feature_use_refcount_per_intf[sw_if_index];
    }
  else
    {
      if (rm->output_feature_use_refcount_per_intf[sw_if_index])
        --rm->output_feature_use_refcount_per_intf[sw_if_index];
      if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
        return vnet_feature_enable_disable ("ip4-output",
                                            "ip4-sv-reassembly-output-feature",
                                            sw_if_index, 0, 0, 0);
    }
  return 0;
}
#endif

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */