/*
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @file
 * @brief IPv4 Shallow Virtual Reassembly.
 *
 * This file contains the source code for IPv4 Shallow Virtual reassembly.
 */

#include <vppinfra/vec.h>
#include <vnet/vnet.h>
#include <vnet/ip/ip.h>
#include <vnet/ip/ip4_to_ip6.h>
#include <vppinfra/fifo.h>
#include <vppinfra/bihash_16_8.h>
#include <vnet/ip/reass/ip4_sv_reass.h>

#define MSEC_PER_SEC 1000
#define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000      // 10 seconds default
#define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
#define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
#define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)

typedef enum
{
  IP4_SV_REASS_RC_OK,
  IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS,
  IP4_SV_REASS_RC_UNSUPP_IP_PROTO,
} ip4_sv_reass_rc_t;

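/*
 * Hash key for a fragment group, overlaying a 16-byte bihash key. For the
 * plain nodes the first word packs the RX fib_index with the source
 * address; the custom-context node variants store the caller-supplied
 * 32-bit context in the fib_index slot instead (see the slow path below).
 */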
typedef struct
{
  union
  {
    struct
    {
      u32 fib_index;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_sv_reass_key_t;

typedef union
{
  struct
  {
    u32 reass_index;
    u32 thread_index;
  };
  u64 as_u64;
} ip4_sv_reass_val_t;

typedef union
{
  struct
  {
    ip4_sv_reass_key_t k;
    ip4_sv_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_sv_reass_kv_t;

typedef struct
{
  // hash table key
  ip4_sv_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // trace operation counter
  u32 trace_op_counter;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // buffer indexes of buffers in this reassembly in chronological order -
  // including overlaps and duplicate fragments
  u32 *cached_buffers;
  // set to true when this reassembly is completed
  bool is_complete;
  // ip protocol
  u8 ip_proto;
  u8 icmp_type_or_tcp_flags;
  u32 tcp_ack_number;
  u32 tcp_seq_number;
  // l4 src port
  u16 l4_src_port;
  // l4 dst port
  u16 l4_dst_port;
  u32 next_index;
  // lru indexes
  u32 lru_prev;
  u32 lru_next;
} ip4_sv_reass_t;

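/*
 * Per-thread runtime: each worker owns a spinlock-protected pool of
 * reassemblies threaded onto an LRU list - lru_first is the oldest entry
 * and is the one recycled when the pool hits max_reass_n.
 */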
typedef struct
{
  ip4_sv_reass_t *pool;
  u32 reass_n;
  u32 id_counter;
  clib_spinlock_t lock;
  // lru indexes
  u32 lru_first;
  u32 lru_last;

} ip4_sv_reass_per_thread_t;

typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_sv_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_sv_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;
  u32 fq_custom_context_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // reference count for enabling/disabling output feature - per interface
  u32 *output_feature_use_refcount_per_intf;

} ip4_sv_reass_main_t;

extern ip4_sv_reass_main_t ip4_sv_reass_main;

#ifndef CLIB_MARCH_VARIANT
ip4_sv_reass_main_t ip4_sv_reass_main;
#endif /* CLIB_MARCH_VARIANT */

typedef enum
{
  IP4_SV_REASSEMBLY_NEXT_INPUT,
  IP4_SV_REASSEMBLY_NEXT_DROP,
  IP4_SV_REASSEMBLY_NEXT_HANDOFF,
  IP4_SV_REASSEMBLY_N_NEXT,
} ip4_sv_reass_next_t;

typedef enum
{
  REASS_FRAGMENT_CACHE,
  REASS_FINISH,
  REASS_FRAGMENT_FORWARD,
  REASS_PASSTHROUGH,
} ip4_sv_reass_trace_operation_e;

typedef struct
{
  ip4_sv_reass_trace_operation_e action;
  u32 reass_id;
  u32 op_id;
  u8 ip_proto;
  u16 l4_src_port;
  u16 l4_dst_port;
  int l4_layer_truncated;
} ip4_sv_reass_trace_t;

extern vlib_node_registration_t ip4_sv_reass_node;
extern vlib_node_registration_t ip4_sv_reass_node_feature;

static u8 *
format_ip4_sv_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
  if (REASS_PASSTHROUGH != t->action)
    {
      s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
    }
  switch (t->action)
    {
    case REASS_FRAGMENT_CACHE:
      s = format (s, "[cached]");
      break;
    case REASS_FINISH:
      s =
        format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
                t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
                clib_net_to_host_u16 (t->l4_dst_port));
      break;
    case REASS_FRAGMENT_FORWARD:
      s =
        format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
                t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
                clib_net_to_host_u16 (t->l4_dst_port));
      break;
    case REASS_PASSTHROUGH:
      s = format (s, "[not-fragmented]");
      break;
    }
  if (t->l4_layer_truncated)
    {
      s = format (s, " [l4-layer-truncated]");
    }
  return s;
}

static void
ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
                        ip4_sv_reass_t *reass, u32 bi,
                        ip4_sv_reass_trace_operation_e action, u32 ip_proto,
                        u16 l4_src_port, u16 l4_dst_port,
                        int l4_layer_truncated)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  if (pool_is_free_index
      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_sv_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      ++reass->trace_op_counter;
    }
  t->action = action;
  t->ip_proto = ip_proto;
  t->l4_src_port = l4_src_port;
  t->l4_dst_port = l4_dst_port;
  t->l4_layer_truncated = l4_layer_truncated;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}


always_inline void
ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                   ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
{
  clib_bihash_kv_16_8_t kv;
  kv.key[0] = reass->key.as_u64[0];
  kv.key[1] = reass->key.as_u64[1];
  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
  vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
  vec_free (reass->cached_buffers);
  reass->cached_buffers = NULL;
  if (~0 != reass->lru_prev)
    {
      ip4_sv_reass_t *lru_prev =
        pool_elt_at_index (rt->pool, reass->lru_prev);
      lru_prev->lru_next = reass->lru_next;
    }
  if (~0 != reass->lru_next)
    {
      ip4_sv_reass_t *lru_next =
        pool_elt_at_index (rt->pool, reass->lru_next);
      lru_next->lru_prev = reass->lru_prev;
    }
  if (rt->lru_first == reass - rt->pool)
    {
      rt->lru_first = reass->lru_next;
    }
  if (rt->lru_last == reass - rt->pool)
    {
      rt->lru_last = reass->lru_prev;
    }
  pool_put (rt->pool, reass);
  --rt->reass_n;
}

always_inline void
ip4_sv_reass_init (ip4_sv_reass_t * reass)
{
  reass->cached_buffers = NULL;
  reass->is_complete = false;
}

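/*
 * Find the reassembly matching *kv, or create a new one. Sets *do_handoff
 * when the entry is owned by another thread. Expired entries are freed on
 * lookup and the least recently used entry is evicted when the pool is
 * full. The insert uses is_add == 2 (add, but do not overwrite), so a -2
 * return value means another worker raced us and we retry the lookup.
 */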
always_inline ip4_sv_reass_t *
ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                             ip4_sv_reass_per_thread_t * rt,
                             ip4_sv_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_sv_reass_t *reass = NULL;
  f64 now = vlib_time_now (vm);

again:

  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      if (now > reass->last_heard + rm->timeout)
        {
          ip4_sv_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
    {
      reass = pool_elt_at_index (rt->pool, rt->lru_first);
      ip4_sv_reass_free (vm, rm, rt, reass);
    }

  pool_get (rt->pool, reass);
  clib_memset (reass, 0, sizeof (*reass));
  reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
  ++rt->id_counter;
  ip4_sv_reass_init (reass);
  ++rt->reass_n;
  reass->lru_prev = reass->lru_next = ~0;

  if (~0 != rt->lru_last)
    {
      ip4_sv_reass_t *lru_last = pool_elt_at_index (rt->pool, rt->lru_last);
      reass->lru_prev = rt->lru_last;
      lru_last->lru_next = rt->lru_last = reass - rt->pool;
    }

  if (~0 == rt->lru_first)
    {
      rt->lru_first = rt->lru_last = reass - rt->pool;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
  if (rv)
    {
      ip4_sv_reass_free (vm, rm, rt, reass);
      reass = NULL;
      // if another worker has already created this context, work with that copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}

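/*
 * Record a fragment in the reassembly. Shallow virtual reassembly never
 * rebuilds the payload: the first fragment (offset 0) provides the L4
 * ports/flags for the whole group and completes the reassembly; fragments
 * arriving before it are only cached until then.
 */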
always_inline ip4_sv_reass_rc_t
ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
                     ip4_sv_reass_main_t *rm, ip4_header_t *ip0,
                     ip4_sv_reass_t *reass, u32 bi0)
{
  vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
  ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
  const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
  if (0 == fragment_first)
    {
      reass->ip_proto = ip0->protocol;
      reass->l4_src_port = ip4_get_port (ip0, 1);
      reass->l4_dst_port = ip4_get_port (ip0, 0);
      if (!reass->l4_src_port || !reass->l4_dst_port)
        return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
      if (IP_PROTOCOL_TCP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags = ((tcp_header_t *) (ip0 + 1))->flags;
          reass->tcp_ack_number = ((tcp_header_t *) (ip0 + 1))->ack_number;
          reass->tcp_seq_number = ((tcp_header_t *) (ip0 + 1))->seq_number;
        }
      else if (IP_PROTOCOL_ICMP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags =
            ((icmp46_header_t *) (ip0 + 1))->type;
        }
      reass->is_complete = true;
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FINISH, reass->ip_proto,
            reass->l4_src_port, reass->l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
    }
  vec_add1 (reass->cached_buffers, bi0);
  if (!reass->is_complete)
    {
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FRAGMENT_CACHE, ~0, ~0, ~0,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
      if (vec_len (reass->cached_buffers) > rm->max_reass_len)
        {
          rc = IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS;
        }
    }
  return rc;
}

always_inline int
l4_layer_truncated (ip4_header_t *ip)
{
  static const int l4_layer_length[256] = {
    [IP_PROTOCOL_TCP] = sizeof (tcp_header_t),
    [IP_PROTOCOL_UDP] = sizeof (udp_header_t),
    [IP_PROTOCOL_ICMP] = sizeof (icmp46_header_t),
  };

  return ((u8 *) ip + ip4_header_bytes (ip) + l4_layer_length[ip->protocol] >
          (u8 *) ip + clib_net_to_host_u16 (ip->length));
}

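/*
 * Node body shared by all variants via the bool template parameters. The
 * fast path handles frames without fragments, filling in per-buffer
 * reassembly metadata only; the first fragment seen diverts the remainder
 * of the frame to the slow path. With with_custom_context set, a u32 of
 * aux data accompanies each buffer (vlib_frame_aux_args) and replaces the
 * FIB index in the hash key.
 */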
always_inline uword
ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                     vlib_frame_t *frame, bool is_feature,
                     bool is_output_feature, bool is_custom,
                     bool with_custom_context)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, *to_next_aux, next_index;
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  u32 *context;
  if (with_custom_context)
    context = vlib_frame_aux_args (frame);

  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  vlib_get_buffers (vm, from, bufs, n_left_from);
  u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
  b = bufs;

  /* optimistic case first - no fragments */
  while (n_left_from >= 2)
    {
      vlib_buffer_t *b0, *b1;
      u32 next0, next1;
      b0 = *b;
      b++;
      b1 = *b;
      b++;

      /* Prefetch next iteration. */
      if (PREDICT_TRUE (n_left_from >= 4))
        {
          vlib_buffer_t *p2, *p3;

          p2 = *b;
          p3 = *(b + 1);

          vlib_prefetch_buffer_header (p2, LOAD);
          vlib_prefetch_buffer_header (p3, LOAD);

          clib_prefetch_load (p2->data);
          clib_prefetch_load (p3->data);
        }

      ip4_header_t *ip0 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b0)->
                                     ip.save_rewrite_length);
      ip4_header_t *ip1 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b1),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b1)->
                                     ip.save_rewrite_length);

      if (PREDICT_FALSE
          (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
          || (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
        {
          // fragment found, go slow path
          b -= 2;
          if (b - bufs > 0)
            {
              vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                                           b - bufs);
            }
          goto slow_path;
        }
      if (is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
      else
        {
          next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
      if (l4_layer_truncated (ip0))
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
          vnet_buffer (b0)->ip.reass.l4_src_port = 0;
          vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
        }
      else
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
          if (IP_PROTOCOL_TCP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((tcp_header_t *) (ip0 + 1))->flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
                ((tcp_header_t *) (ip0 + 1))->ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
                ((tcp_header_t *) (ip0 + 1))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((icmp46_header_t *) (ip0 + 1))->type;
            }
          vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
          vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
        }
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, NULL, from[(b - 2) - bufs], REASS_PASSTHROUGH,
            vnet_buffer (b0)->ip.reass.ip_proto,
            vnet_buffer (b0)->ip.reass.l4_src_port,
            vnet_buffer (b0)->ip.reass.l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
      if (is_feature)
        {
          vnet_feature_next (&next1, b1);
        }
      else
        {
          next1 = is_custom ? vnet_buffer (b1)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
      if (l4_layer_truncated (ip1))
        {
          vnet_buffer (b1)->ip.reass.l4_layer_truncated = 1;
          vnet_buffer (b1)->ip.reass.l4_src_port = 0;
          vnet_buffer (b1)->ip.reass.l4_dst_port = 0;
        }
      else
        {
          vnet_buffer (b1)->ip.reass.l4_layer_truncated = 0;
          if (IP_PROTOCOL_TCP == ip1->protocol)
            {
              vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
                ((tcp_header_t *) (ip1 + 1))->flags;
              vnet_buffer (b1)->ip.reass.tcp_ack_number =
                ((tcp_header_t *) (ip1 + 1))->ack_number;
              vnet_buffer (b1)->ip.reass.tcp_seq_number =
                ((tcp_header_t *) (ip1 + 1))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip1->protocol)
            {
              vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
                ((icmp46_header_t *) (ip1 + 1))->type;
            }
          vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
          vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
        }
      if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
            vnet_buffer (b1)->ip.reass.ip_proto,
            vnet_buffer (b1)->ip.reass.l4_src_port,
            vnet_buffer (b1)->ip.reass.l4_dst_port,
            vnet_buffer (b1)->ip.reass.l4_layer_truncated);
        }

      n_left_from -= 2;
      next[0] = next0;
      next[1] = next1;
      next += 2;
      if (with_custom_context)
        context += 2;
    }

  while (n_left_from > 0)
    {
      vlib_buffer_t *b0;
      u32 next0;
      b0 = *b;
      b++;

      ip4_header_t *ip0 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b0)->
                                     ip.save_rewrite_length);
      if (PREDICT_FALSE
          (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0)))
        {
          // fragment found, go slow path
          b -= 1;
          if (b - bufs > 0)
            {
              vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                                           b - bufs);
            }
          goto slow_path;
        }
      if (is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
      else
        {
          next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
      if (l4_layer_truncated (ip0))
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
          vnet_buffer (b0)->ip.reass.l4_src_port = 0;
          vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
        }
      else
        {
          vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
          if (IP_PROTOCOL_TCP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((tcp_header_t *) (ip0 + 1))->flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
                ((tcp_header_t *) (ip0 + 1))->ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
                ((tcp_header_t *) (ip0 + 1))->seq_number;
            }
          else if (IP_PROTOCOL_ICMP == ip0->protocol)
            {
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                ((icmp46_header_t *) (ip0 + 1))->type;
            }
          vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
          vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
        }
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
            vnet_buffer (b0)->ip.reass.ip_proto,
            vnet_buffer (b0)->ip.reass.l4_src_port,
            vnet_buffer (b0)->ip.reass.l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }

      n_left_from -= 1;
      next[0] = next0;
      next += 1;
      if (with_custom_context)
        context += 1;
    }

  vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                               frame->n_vectors);

  goto done;

slow_path:

  from += b - bufs;

  while (n_left_from > 0)
    {
      if (with_custom_context)
        vlib_get_next_frame_with_aux_safe (vm, node, next_index, to_next,
                                           to_next_aux, n_left_to_next);
      else
        vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;
          u8 forward_context = 0;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 =
            (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                         (is_output_feature ? 1 : 0) *
                                         vnet_buffer (b0)->
                                         ip.save_rewrite_length);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a regular packet - no fragmentation
              if (is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              else
                {
                  next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
              vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
              if (l4_layer_truncated (ip0))
                {
                  vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
                  vnet_buffer (b0)->ip.reass.l4_src_port = 0;
                  vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
                }
              else
                {
                  vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
                  if (IP_PROTOCOL_TCP == ip0->protocol)
                    {
                      vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                        ((tcp_header_t *) (ip0 + 1))->flags;
                      vnet_buffer (b0)->ip.reass.tcp_ack_number =
                        ((tcp_header_t *) (ip0 + 1))->ack_number;
                      vnet_buffer (b0)->ip.reass.tcp_seq_number =
                        ((tcp_header_t *) (ip0 + 1))->seq_number;
                    }
                  else if (IP_PROTOCOL_ICMP == ip0->protocol)
                    {
                      vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                        ((icmp46_header_t *) (ip0 + 1))->type;
                    }
                  vnet_buffer (b0)->ip.reass.l4_src_port =
                    ip4_get_port (ip0, 1);
                  vnet_buffer (b0)->ip.reass.l4_dst_port =
                    ip4_get_port (ip0, 0);
                }
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (
                    vm, node, NULL, bi0, REASS_PASSTHROUGH,
                    vnet_buffer (b0)->ip.reass.ip_proto,
                    vnet_buffer (b0)->ip.reass.l4_src_port,
                    vnet_buffer (b0)->ip.reass.l4_dst_port,
                    vnet_buffer (b0)->ip.reass.l4_layer_truncated);
                }
              goto packet_enqueue;
            }
          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
          // 8 is minimum frag length per RFC 791
          if (fragment_first > fragment_last ||
              fragment_first + fragment_length > UINT16_MAX - 20 ||
              (fragment_length < 8 && ip4_get_fragment_more (ip0)))
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              b0->error = node->errors[error0];
              goto packet_enqueue;
            }
          ip4_sv_reass_kv_t kv;
          u8 do_handoff = 0;

          if (with_custom_context)
            kv.k.as_u64[0] =
              (u64) *context | (u64) ip0->src_address.as_u32 << 32;
          else
            kv.k.as_u64[0] =
              (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                             vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
              (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] = (u64) ip0->dst_address.as_u32 |
                           (u64) ip0->fragment_id << 32 |
                           (u64) ip0->protocol << 48;

          ip4_sv_reass_t *reass =
            ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

          if (PREDICT_FALSE (do_handoff))
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.thread_index;
              if (with_custom_context)
                forward_context = 1;
              goto packet_enqueue;
            }

          if (!reass)
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
              b0->error = node->errors[error0];
              goto packet_enqueue;
            }

          if (reass->is_complete)
            {
              if (is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              else
                {
                  next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment =
                ! !fragment_first;
              vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                reass->icmp_type_or_tcp_flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
                reass->tcp_ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
                reass->tcp_seq_number;
              vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
              vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (
                    vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
                    reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
                    vnet_buffer (b0)->ip.reass.l4_layer_truncated);
                }
              goto packet_enqueue;
            }

          ip4_sv_reass_rc_t rc =
            ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0);
          u32 counter = ~0;
          switch (rc)
            {
            case IP4_SV_REASS_RC_OK:
              /* nothing to do here */
              break;
            case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
              counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
              break;
            case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
              counter = IP4_ERROR_REASS_UNSUPP_IP_PROT;
              break;
            }
          if (~0 != counter)
            {
              vlib_node_increment_counter (vm, node->node_index, counter, 1);
              ip4_sv_reass_free (vm, rm, rt, reass);
              goto next_packet;
            }
          if (reass->is_complete)
            {
              u32 idx;
              vec_foreach_index (idx, reass->cached_buffers)
              {
                u32 bi0 = vec_elt (reass->cached_buffers, idx);
                vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
                ip0 =
                  (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                               (is_output_feature ? 1 : 0) *
                                               vnet_buffer (b0)->
                                               ip.save_rewrite_length);
                u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                if (is_feature)
                  {
                    vnet_feature_next (&next0, b0);
                  }
                if (is_custom)
                  {
                    next0 = vnet_buffer (b0)->ip.reass.next_index;
                  }
                if (0 == n_left_to_next)
                  {
                    vlib_put_next_frame (vm, node, next_index,
                                         n_left_to_next);
                    vlib_get_next_frame (vm, node, next_index, to_next,
                                         n_left_to_next);
                  }
                to_next[0] = bi0;
                to_next += 1;
                n_left_to_next -= 1;
                vnet_buffer (b0)->ip.reass.is_non_first_fragment =
                  ! !ip4_get_fragment_offset (ip0);
                vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
                vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                  reass->icmp_type_or_tcp_flags;
                vnet_buffer (b0)->ip.reass.tcp_ack_number =
                  reass->tcp_ack_number;
                vnet_buffer (b0)->ip.reass.tcp_seq_number =
                  reass->tcp_seq_number;
                vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
                vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
                if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                  {
                    ip4_sv_reass_add_trace (
                      vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
                      reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
                      vnet_buffer (b0)->ip.reass.l4_layer_truncated);
                  }
                vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                                 to_next, n_left_to_next, bi0,
                                                 next0);
              }
              vec_set_len (reass->cached_buffers,
                           0); // buffers are owned by frame now
            }
          goto next_packet;

        packet_enqueue:
          to_next[0] = bi0;
          to_next += 1;
          n_left_to_next -= 1;
          if (is_feature && IP4_ERROR_NONE == error0)
            {
              b0 = vlib_get_buffer (vm, bi0);
              vnet_feature_next (&next0, b0);
            }
          if (with_custom_context && forward_context)
            {
              if (to_next_aux)
                {
                  to_next_aux[0] = *context;
                  to_next_aux += 1;
                }
              vlib_validate_buffer_enqueue_with_aux_x1 (
                vm, node, next_index, to_next, to_next_aux, n_left_to_next,
                bi0, *context, next0);
            }
          else
            vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                             n_left_to_next, bi0, next0);

        next_packet:
          from += 1;
          n_left_from -= 1;
          if (with_custom_context)
            context += 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

done:
  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}

VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
                                  vlib_node_runtime_t * node,
                                  vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
    false /* is_custom */, false /* with_custom_context */);
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
    .name = "ip4-sv-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",
        },
};
/* *INDENT-ON* */

VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, true /* is_feature */, false /* is_output_feature */,
    false /* is_custom */, false /* with_custom_context */);
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
    .name = "ip4-sv-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */

/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-sv-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */

VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
                                                 vlib_node_runtime_t * node,
                                                 vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, true /* is_feature */, true /* is_output_feature */,
    false /* is_custom */, false /* with_custom_context */);
}


/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
    .name = "ip4-sv-reassembly-output-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */

/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
    .arc_name = "ip4-output",
    .node_name = "ip4-sv-reassembly-output-feature",
    .runs_before = 0,
    .runs_after = 0,
};
/* *INDENT-ON* */

VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
    .name = "ip4-sv-reassembly-custom-next",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",
        },
};

VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
                                         vlib_node_runtime_t * node,
                                         vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
    true /* is_custom */, false /* with_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_node) = {
    .name = "ip4-sv-reassembly-custom-context",
    .vector_size = sizeof (u32),
    .aux_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-custom-context-handoff",
        },
};

VLIB_NODE_FN (ip4_sv_reass_custom_context_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip4_sv_reass_inline (
    vm, node, frame, false /* is_feature */, false /* is_output_feature */,
    true /* is_custom */, true /* with_custom_context */);
}

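/*
 * The custom-context node differs from the plain custom node only in its
 * frames carrying one u32 of aux data per buffer (.aux_size above), which
 * ip4_sv_reass_inline folds into the reassembly key in place of the FIB
 * index.
 */
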
#ifndef CLIB_MARCH_VARIANT
always_inline u32
ip4_sv_reass_get_nbuckets ()
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  u32 nbuckets;
  u8 i;

  nbuckets = (u32) (rm->max_reass_n / IP4_SV_REASS_HT_LOAD_FACTOR);

  for (i = 0; i < 31; i++)
    if ((1 << i) >= nbuckets)
      break;
  nbuckets = 1 << i;

  return nbuckets;
}
#endif /* CLIB_MARCH_VARIANT */

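/*
 * Worked example: with the default max_reass_n of 1024 and a load factor
 * of 0.75, 1024 / 0.75 = 1365.33, which the loop above rounds up to the
 * next power of two, giving a bihash with 2048 buckets.
 */
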
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_sv_reass_event_t;

typedef struct
{
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;

#ifndef CLIB_MARCH_VARIANT
static int
ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
{
  ip4_rehash_cb_ctx *ctx = _ctx;
  if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
    {
      ctx->failure = 1;
    }
  return (BIHASH_WALK_CONTINUE);
}

static void
ip4_sv_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
                         u32 max_reassembly_length,
                         u32 expire_walk_interval_ms)
{
  ip4_sv_reass_main.timeout_ms = timeout_ms;
  ip4_sv_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
  ip4_sv_reass_main.max_reass_n = max_reassemblies;
  ip4_sv_reass_main.max_reass_len = max_reassembly_length;
  ip4_sv_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
}

vnet_api_error_t
ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
                  u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
  ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
                           max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
                             ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_sv_reass_get_nbuckets ();
  if (ip4_sv_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_sv_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_sv_reass_main.hash);
          clib_memcpy_fast (&ip4_sv_reass_main.hash, &new_hash,
                            sizeof (ip4_sv_reass_main.hash));
          clib_bihash_copied (&ip4_sv_reass_main.hash, &new_hash);
        }
    }
  return 0;
}

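/*
 * Note that ip4_sv_reass_set() only ever grows the hash: when the new
 * bucket count exceeds the old one, a fresh bihash is allocated and all
 * entries are copied over via ip4_rehash_cb(); shrinking the limits keeps
 * the existing table.
 */
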
vnet_api_error_t
ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
                  u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
{
  *timeout_ms = ip4_sv_reass_main.timeout_ms;
  *max_reassemblies = ip4_sv_reass_main.max_reass_n;
  *max_reassembly_length = ip4_sv_reass_main.max_reass_len;
  *expire_walk_interval_ms = ip4_sv_reass_main.expire_walk_interval_ms;
  return 0;
}

static clib_error_t *
ip4_sv_reass_init_function (vlib_main_t * vm)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_sv_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
    rt->lru_first = rt->lru_last = ~0;
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_sv_reass_expire_node_idx = node->index;

  ip4_sv_reass_set_params (IP4_SV_REASS_TIMEOUT_DEFAULT_MS,
                           IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT,
                           IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                           IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_sv_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);
  rm->fq_custom_context_index =
    vlib_frame_queue_main_init (ip4_sv_reass_custom_context_node.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  rm->output_feature_use_refcount_per_intf = NULL;

  return error;
}

VLIB_INIT_FUNCTION (ip4_sv_reass_init_function);
#endif /* CLIB_MARCH_VARIANT */

static uword
ip4_sv_reass_walk_expired (vlib_main_t *vm,
                           CLIB_UNUSED (vlib_node_runtime_t *node),
                           CLIB_UNUSED (vlib_frame_t *f))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:
          /* no events => timeout */
          /* fallthrough */
        case IP4_EVENT_CONFIG_CHANGED:
          /* nothing to do here */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_sv_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool)
          {
            reass = pool_elt_at_index (rt->pool, index);
            if (now > reass->last_heard + rm->timeout)
              {
                vec_add1 (pool_indexes_to_free, index);
              }
          }
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_sv_reass_free (vm, rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          vec_set_len (event_data, 0);
        }
    }

  return 0;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
  .function = ip4_sv_reass_walk_expired,
  .type = VLIB_NODE_TYPE_PROCESS,
  .name = "ip4-sv-reassembly-expire-walk",
  .format_trace = format_ip4_sv_reass_trace,
  .n_errors = IP4_N_ERROR,
  .error_counters = ip4_error_counters,
};
/* *INDENT-ON* */

static u8 *
format_ip4_sv_reass_key (u8 * s, va_list * args)
{
  ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
  s =
    format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
            key->fib_index, format_ip4_address, &key->src, format_ip4_address,
            &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
  return s;
}

static u8 *
format_ip4_sv_reass (u8 * s, va_list * args)
{
  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
  ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);

  s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
              reass->id, format_ip4_sv_reass_key, &reass->key,
              reass->trace_op_counter);

  vlib_buffer_t *b;
  u32 *bip;
  u32 counter = 0;
  vec_foreach (bip, reass->cached_buffers)
  {
    u32 bi = *bip;
    do
      {
        b = vlib_get_buffer (vm, bi);
        s = format (s, "  #%03u: bi: %u, ", counter, bi);
        ++counter;
        bi = b->next_buffer;
      }
    while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
  }
  return s;
}

static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_sv_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool) {
            vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
          }
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent shallow virtual IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured amount of fragments per shallow "
                   "virtual IP4 reassembly: %lu\n",
                   (long unsigned) rm->max_reass_len);
  vlib_cli_output (vm,
                   "Maximum configured shallow virtual IP4 reassembly timeout: %lums\n",
                   (long unsigned) rm->timeout_ms);
  vlib_cli_output (vm,
                   "Maximum configured shallow virtual IP4 reassembly expire walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
  return 0;
}

/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
    .path = "show ip4-sv-reassembly",
    .short_help = "show ip4-sv-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */

#ifndef CLIB_MARCH_VARIANT
vnet_api_error_t
ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index,
                                                  enable_disable);
}
#endif /* CLIB_MARCH_VARIANT */

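/*
 * Worker handoff: a fragment group is owned by the thread that created its
 * reassembly context, so buffers arriving on other threads are shipped to
 * the owner through a frame queue. The custom-context flavour also carries
 * the per-packet u32 context across the handoff as aux data.
 */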
#define foreach_ip4_sv_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_sv_reass_handoff_error
#undef _
    IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_sv_reass_handoff_error_t;

static char *ip4_sv_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_sv_reass_handoff_error
#undef _
};

typedef struct
{
  u32 next_worker_index;
} ip4_sv_reass_handoff_trace_t;

static u8 *
format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_sv_reass_handoff_trace_t *t =
    va_arg (*args, ip4_sv_reass_handoff_trace_t *);

  s =
    format (s, "ip4-sv-reassembly-handoff: next-worker %d",
            t->next_worker_index);

  return s;
}

always_inline uword
ip4_sv_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                                  vlib_frame_t *frame, bool is_feature,
                                  bool is_custom_context)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from, *context;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  if (is_custom_context)
    context = vlib_frame_aux_args (frame);

  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ?
    rm->fq_feature_index :
    (is_custom_context ? rm->fq_custom_context_index : rm->fq_index);

  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_sv_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  if (is_custom_context)
    n_enq = vlib_buffer_enqueue_to_thread_with_aux (
      vm, node, fq_index, from, context, thread_indices, frame->n_vectors, 1);
  else
    n_enq = vlib_buffer_enqueue_to_thread (
      vm, node, fq_index, from, thread_indices, frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}

VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (
    vm, node, frame, false /* is_feature */, false /* is_custom_context */);
}


/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
  .name = "ip4-sv-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */

VLIB_NODE_FN (ip4_sv_reass_custom_context_handoff_node)
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
{
  return ip4_sv_reass_handoff_node_inline (
    vm, node, frame, false /* is_feature */, true /* is_custom_context */);
}

VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_handoff_node) = {
  .name = "ip4-sv-reassembly-custom-context-handoff",
  .vector_size = sizeof (u32),
  .aux_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};

/* *INDENT-OFF* */
VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (
    vm, node, frame, true /* is_feature */, false /* is_custom_context */);
}
/* *INDENT-ON* */


/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
  .name = "ip4-sv-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */

#ifndef CLIB_MARCH_VARIANT
int
ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
  if (is_enable)
    {
      if (!rm->feature_use_refcount_per_intf[sw_if_index])
        {
          ++rm->feature_use_refcount_per_intf[sw_if_index];
          return vnet_feature_enable_disable ("ip4-unicast",
                                              "ip4-sv-reassembly-feature",
                                              sw_if_index, 1, 0, 0);
        }
      ++rm->feature_use_refcount_per_intf[sw_if_index];
    }
  else
    {
      if (rm->feature_use_refcount_per_intf[sw_if_index])
        --rm->feature_use_refcount_per_intf[sw_if_index];
      if (!rm->feature_use_refcount_per_intf[sw_if_index])
        return vnet_feature_enable_disable ("ip4-unicast",
                                            "ip4-sv-reassembly-feature",
                                            sw_if_index, 0, 0, 0);
    }
  return 0;
}
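
/*
 * The refcounted enable/disable helpers (the ip4-unicast one above and the
 * ip4-output one further below) turn the vnet feature on for an
 * interface's first user and off again when its last user goes away, so
 * independent subsystems can share the feature.
 */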

uword
ip4_sv_reass_custom_register_next_node (uword node_index)
{
  return vlib_node_add_next (vlib_get_main (), ip4_sv_reass_custom_node.index,
                             node_index);
}

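/*
 * A minimal usage sketch for the two custom next-node registration
 * helpers (the names my_node / my_next_index are hypothetical, not part
 * of this file):
 *
 *   // at init time, once:
 *   my_next_index = ip4_sv_reass_custom_register_next_node (my_node.index);
 *
 *   // per packet, before enqueueing to "ip4-sv-reassembly-custom-next":
 *   vnet_buffer (b)->ip.reass.next_index = my_next_index;
 *
 * The reassembly node then forwards both fragments and regular packets to
 * my_node with the shallow virtual reassembly metadata filled in.
 */
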
uword
ip4_sv_reass_custom_context_register_next_node (uword node_index)
{
  return vlib_node_add_next (
    vlib_get_main (), ip4_sv_reass_custom_context_node.index, node_index);
}

int
ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
                                                int is_enable)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  vec_validate (rm->output_feature_use_refcount_per_intf, sw_if_index);
  if (is_enable)
    {
      if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
        {
          ++rm->output_feature_use_refcount_per_intf[sw_if_index];
          return vnet_feature_enable_disable ("ip4-output",
                                              "ip4-sv-reassembly-output-feature",
                                              sw_if_index, 1, 0, 0);
        }
      ++rm->output_feature_use_refcount_per_intf[sw_if_index];
    }
  else
    {
      if (rm->output_feature_use_refcount_per_intf[sw_if_index])
        --rm->output_feature_use_refcount_per_intf[sw_if_index];
      if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
        return vnet_feature_enable_disable ("ip4-output",
                                            "ip4-sv-reassembly-output-feature",
                                            sw_if_index, 0, 0, 0);
    }
  return 0;
}
#endif

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */