/*
 * ip: SVR fix race condition
 * [vpp.git] / src / vnet / ip / reass / ip4_sv_reass.c
 */
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Shallow Virtual Reassembly.
19  *
20  * This file contains the source code for IPv4 Shallow Virtual reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vnet/ip/ip4_to_ip6.h>
27 #include <vppinfra/fifo.h>
28 #include <vppinfra/bihash_16_8.h>
29 #include <vnet/ip/reass/ip4_sv_reass.h>
30
#define MSEC_PER_SEC 1000
// how long a reassembly context stays valid without hearing a new fragment
#define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000      // 10 seconds default
// per-thread cap on concurrent reassembly contexts
#define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
// max number of fragments cached per reassembly before giving up
#define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
#define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)
37
// return codes of ip4_sv_reass_update()
typedef enum
{
  IP4_SV_REASS_RC_OK,
  // cached fragment count exceeded rm->max_reass_len
  IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS,
  // ip4_get_port() could not extract L4 ports from the first fragment
  IP4_SV_REASS_RC_UNSUPP_IP_PROTO,
} ip4_sv_reass_rc_t;
44
// 16-byte bihash key identifying one reassembly:
// (fib, src, dst, fragment id, protocol)
typedef struct
{
  union
  {
    struct
    {
      u32 fib_index;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    // raw view used to load/store the bihash key words
    u64 as_u64[2];
  };
} ip4_sv_reass_key_t;
61
// bihash value: which thread owns the context and where it sits in that
// thread's pool
typedef union
{
  struct
  {
    u32 reass_index;
    u32 thread_index;
  };
  u64 as_u64;
} ip4_sv_reass_val_t;
71
// combined key/value pair overlaying the bihash kv type
typedef union
{
  struct
  {
    ip4_sv_reass_key_t k;
    ip4_sv_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_sv_reass_kv_t;
81
// one shallow-virtual reassembly context
typedef struct
{
  // hash table key
  ip4_sv_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // trace operation counter
  u32 trace_op_counter;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // buffer indexes of buffers in this reassembly in chronological order -
  // including overlaps and duplicate fragments
  u32 *cached_buffers;
  // set to true when this reassembly is completed
  bool is_complete;
  // ip protocol
  u8 ip_proto;
  // ICMP type or TCP flags taken from the first fragment, per ip_proto
  u8 icmp_type_or_tcp_flags;
  // TCP ack number from the first fragment (TCP only)
  u32 tcp_ack_number;
  // TCP seq number from the first fragment (TCP only)
  u32 tcp_seq_number;
  // l4 src port
  u16 l4_src_port;
  // l4 dst port
  u16 l4_dst_port;
  // NOTE(review): not written in this chunk - presumably a custom next
  // node index; confirm against the rest of the file
  u32 next_index;
  // lru indexes
  u32 lru_prev;
  u32 lru_next;
} ip4_sv_reass_t;
113
// per-worker reassembly state; contexts never migrate between threads
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_sv_reass_t *pool;
  // number of contexts currently allocated from the pool
  u32 reass_n;
  // monotonically increasing source for ip4_sv_reass_t::id
  u32 id_counter;
  // protects this thread's pool and LRU list
  clib_spinlock_t lock;
  // lru indexes
  u32 lru_first;
  u32 lru_last;

} ip4_sv_reass_per_thread_t;
125
// global state of the IPv4 shallow virtual reassembly feature
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout_ms converted to seconds (f64) for comparisons with vlib time
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  // shared lookup table mapping ip4_sv_reass_key_t -> ip4_sv_reass_val_t
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_sv_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_sv_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // reference count for enabling/disabling output feature - per interface
  u32 *output_feature_use_refcount_per_intf;

} ip4_sv_reass_main_t;
161
extern ip4_sv_reass_main_t ip4_sv_reass_main;

// define the global exactly once - march variants only declare it
#ifndef CLIB_MARCH_VARIANT
ip4_sv_reass_main_t ip4_sv_reass_main;
#endif /* CLIB_MARCH_VARIANT */
167
// next-node indices of the reassembly graph nodes
typedef enum
{
  IP4_SV_REASSEMBLY_NEXT_INPUT,
  IP4_SV_REASSEMBLY_NEXT_DROP,
  // hand packet off to the worker owning the reassembly context
  IP4_SV_REASSEMBLY_NEXT_HANDOFF,
  IP4_SV_REASSEMBLY_N_NEXT,
} ip4_sv_reass_next_t;
175
// what happened to the traced packet
typedef enum
{
  // fragment cached while waiting for the first fragment's L4 info
  REASS_FRAGMENT_CACHE,
  // first fragment seen - L4 info extracted, reassembly complete
  REASS_FINISH,
  // fragment forwarded with L4 info copied from the completed reassembly
  REASS_FRAGMENT_FORWARD,
  // packet was not fragmented at all
  REASS_PASSTHROUGH,
} ip4_sv_reass_trace_operation_e;
183
// per-packet trace record, formatted by format_ip4_sv_reass_trace()
typedef struct
{
  ip4_sv_reass_trace_operation_e action;
  // id/op of the owning reassembly; meaningless for REASS_PASSTHROUGH
  u32 reass_id;
  u32 op_id;
  u8 ip_proto;
  // ports are stored in network byte order
  u16 l4_src_port;
  u16 l4_dst_port;
  int l4_layer_truncated;
} ip4_sv_reass_trace_t;
194
// graph node registrations (defined later in the file)
extern vlib_node_registration_t ip4_sv_reass_node;
extern vlib_node_registration_t ip4_sv_reass_node_feature;
197
198 static u8 *
199 format_ip4_sv_reass_trace (u8 * s, va_list * args)
200 {
201   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
202   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
203   ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
204   if (REASS_PASSTHROUGH != t->action)
205     {
206       s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
207     }
208   switch (t->action)
209     {
210     case REASS_FRAGMENT_CACHE:
211       s = format (s, "[cached]");
212       break;
213     case REASS_FINISH:
214       s =
215         format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
216                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
217                 clib_net_to_host_u16 (t->l4_dst_port));
218       break;
219     case REASS_FRAGMENT_FORWARD:
220       s =
221         format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
222                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
223                 clib_net_to_host_u16 (t->l4_dst_port));
224       break;
225     case REASS_PASSTHROUGH:
226       s = format (s, "[not-fragmented]");
227       break;
228     }
229   if (t->l4_layer_truncated)
230     {
231       s = format (s, " [l4-layer-truncated]");
232     }
233   return s;
234 }
235
/*
 * Add a shallow-virtual-reassembly trace record to buffer bi.
 * reass may be NULL (passthrough packets); in that case reass_id/op_id are
 * not written - format_ip4_sv_reass_trace does not print them for
 * REASS_PASSTHROUGH.
 */
static void
ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
                        ip4_sv_reass_t *reass, u32 bi,
                        ip4_sv_reass_trace_operation_e action, u32 ip_proto,
                        u16 l4_src_port, u16 l4_dst_port,
                        int l4_layer_truncated)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  if (pool_is_free_index
      (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
    {
      // this buffer's trace is gone
      b->flags &= ~VLIB_BUFFER_IS_TRACED;
      return;
    }
  ip4_sv_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      ++reass->trace_op_counter;
    }
  t->action = action;
  t->ip_proto = ip_proto;
  t->l4_src_port = l4_src_port;
  t->l4_dst_port = l4_dst_port;
  t->l4_layer_truncated = l4_layer_truncated;
  // debug aid: print the trace record to stdout as it is recorded
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
271
272
/*
 * Tear down a reassembly context: remove its hash entry, free any cached
 * buffers, unlink it from the per-thread LRU list and return it to the pool.
 * NOTE(review): callers appear to hold rt->lock - confirm for all call sites.
 */
always_inline void
ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                   ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
{
  // delete the bihash entry keyed by this context
  clib_bihash_kv_16_8_t kv;
  kv.key[0] = reass->key.as_u64[0];
  kv.key[1] = reass->key.as_u64[1];
  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
  // drop fragments still cached on this context
  vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
  vec_free (reass->cached_buffers);
  reass->cached_buffers = NULL;
  // unlink from the doubly-linked LRU list (~0 marks a missing neighbor)
  if (~0 != reass->lru_prev)
    {
      ip4_sv_reass_t *lru_prev =
        pool_elt_at_index (rt->pool, reass->lru_prev);
      lru_prev->lru_next = reass->lru_next;
    }
  if (~0 != reass->lru_next)
    {
      ip4_sv_reass_t *lru_next =
        pool_elt_at_index (rt->pool, reass->lru_next);
      lru_next->lru_prev = reass->lru_prev;
    }
  // fix up list head/tail if this context was first/last
  if (rt->lru_first == reass - rt->pool)
    {
      rt->lru_first = reass->lru_next;
    }
  if (rt->lru_last == reass - rt->pool)
    {
      rt->lru_last = reass->lru_prev;
    }
  pool_put (rt->pool, reass);
  --rt->reass_n;
}
308
309 always_inline void
310 ip4_sv_reass_init (ip4_sv_reass_t * reass)
311 {
312   reass->cached_buffers = NULL;
313   reass->is_complete = false;
314 }
315
/*
 * Look up the reassembly context for the key in kv, or create a new one.
 * Returns NULL with *do_handoff set when another thread owns the context;
 * returns NULL without handoff when the hash insert fails.
 * NOTE(review): callers appear to hold rt->lock - confirm.
 */
always_inline ip4_sv_reass_t *
ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                             ip4_sv_reass_per_thread_t * rt,
                             ip4_sv_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_sv_reass_t *reass = NULL;
  f64 now = vlib_time_now (vm);

again:

  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      // existing entry - it may belong to a different worker thread
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      // context timed out - free it and fall through to create a new one
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_sv_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  // at capacity - evict the least recently used context first
  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
    {
      reass = pool_elt_at_index (rt->pool, rt->lru_first);
      ip4_sv_reass_free (vm, rm, rt, reass);
    }

  pool_get (rt->pool, reass);
  clib_memset (reass, 0, sizeof (*reass));
  // ids unique across threads: thread index occupies the high decimal digits
  reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
  ++rt->id_counter;
  ip4_sv_reass_init (reass);
  ++rt->reass_n;
  reass->lru_prev = reass->lru_next = ~0;

  // append the new context at the tail of the LRU list
  if (~0 != rt->lru_last)
    {
      ip4_sv_reass_t *lru_last = pool_elt_at_index (rt->pool, rt->lru_last);
      reass->lru_prev = rt->lru_last;
      lru_last->lru_next = rt->lru_last = reass - rt->pool;
    }

  if (~0 == rt->lru_first)
    {
      rt->lru_first = rt->lru_last = reass - rt->pool;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  // is_add=2 presumably means add-but-fail-if-present, so a concurrent
  // insert by another worker is detected instead of overwritten (race fix)
  int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
  if (rv)
    {
      ip4_sv_reass_free (vm, rm, rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
392
393 always_inline ip4_sv_reass_rc_t
394 ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
395                      ip4_sv_reass_main_t *rm, ip4_header_t *ip0,
396                      ip4_sv_reass_t *reass, u32 bi0)
397 {
398   vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
399   ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
400   const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
401   if (0 == fragment_first)
402     {
403       reass->ip_proto = ip0->protocol;
404       reass->l4_src_port = ip4_get_port (ip0, 1);
405       reass->l4_dst_port = ip4_get_port (ip0, 0);
406       if (!reass->l4_src_port || !reass->l4_dst_port)
407         return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
408       if (IP_PROTOCOL_TCP == reass->ip_proto)
409         {
410           reass->icmp_type_or_tcp_flags = ((tcp_header_t *) (ip0 + 1))->flags;
411           reass->tcp_ack_number = ((tcp_header_t *) (ip0 + 1))->ack_number;
412           reass->tcp_seq_number = ((tcp_header_t *) (ip0 + 1))->seq_number;
413         }
414       else if (IP_PROTOCOL_ICMP == reass->ip_proto)
415         {
416           reass->icmp_type_or_tcp_flags =
417             ((icmp46_header_t *) (ip0 + 1))->type;
418         }
419       reass->is_complete = true;
420       vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
421       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
422         {
423           ip4_sv_reass_add_trace (
424             vm, node, reass, bi0, REASS_FINISH, reass->ip_proto,
425             reass->l4_src_port, reass->l4_dst_port,
426             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
427         }
428     }
429   vec_add1 (reass->cached_buffers, bi0);
430   if (!reass->is_complete)
431     {
432       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
433         {
434           ip4_sv_reass_add_trace (
435             vm, node, reass, bi0, REASS_FRAGMENT_CACHE, ~0, ~0, ~0,
436             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
437         }
438       if (vec_len (reass->cached_buffers) > rm->max_reass_len)
439         {
440           rc = IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS;
441         }
442     }
443   return rc;
444 }
445
446 always_inline int
447 l4_layer_truncated (ip4_header_t *ip)
448 {
449   static const int l4_layer_length[256] = {
450     [IP_PROTOCOL_TCP] = sizeof (tcp_header_t),
451     [IP_PROTOCOL_UDP] = sizeof (udp_header_t),
452     [IP_PROTOCOL_ICMP] = sizeof (icmp46_header_t),
453   };
454
455   return ((u8 *) ip + ip4_header_bytes (ip) + l4_layer_length[ip->protocol] >
456           (u8 *) ip + clib_net_to_host_u16 (ip->length));
457 }
458
459 always_inline uword
460 ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
461                      vlib_frame_t * frame, bool is_feature,
462                      bool is_output_feature, bool is_custom)
463 {
464   u32 *from = vlib_frame_vector_args (frame);
465   u32 n_left_from, n_left_to_next, *to_next, next_index;
466   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
467   ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
468   clib_spinlock_lock (&rt->lock);
469
470   n_left_from = frame->n_vectors;
471   next_index = node->cached_next_index;
472
473   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
474   vlib_get_buffers (vm, from, bufs, n_left_from);
475   u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
476   b = bufs;
477
478   /* optimistic case first - no fragments */
479   while (n_left_from >= 2)
480     {
481       vlib_buffer_t *b0, *b1;
482       u32 next0, next1;
483       b0 = *b;
484       b++;
485       b1 = *b;
486       b++;
487
488       /* Prefetch next iteration. */
489       if (PREDICT_TRUE (n_left_from >= 4))
490         {
491           vlib_buffer_t *p2, *p3;
492
493           p2 = *b;
494           p3 = *(b + 1);
495
496           vlib_prefetch_buffer_header (p2, LOAD);
497           vlib_prefetch_buffer_header (p3, LOAD);
498
499           clib_prefetch_load (p2->data);
500           clib_prefetch_load (p3->data);
501         }
502
503       ip4_header_t *ip0 =
504         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
505                                      (is_output_feature ? 1 : 0) *
506                                      vnet_buffer (b0)->
507                                      ip.save_rewrite_length);
508       ip4_header_t *ip1 =
509         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b1),
510                                      (is_output_feature ? 1 : 0) *
511                                      vnet_buffer (b1)->
512                                      ip.save_rewrite_length);
513
514       if (PREDICT_FALSE
515           (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
516           || (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
517         {
518           // fragment found, go slow path
519           b -= 2;
520           if (b - bufs > 0)
521             {
522               vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
523                                            b - bufs);
524             }
525           goto slow_path;
526         }
527       if (is_feature)
528         {
529           vnet_feature_next (&next0, b0);
530         }
531       else
532         {
533           next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
534             IP4_SV_REASSEMBLY_NEXT_INPUT;
535         }
536       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
537       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
538       if (l4_layer_truncated (ip0))
539         {
540           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
541           vnet_buffer (b0)->ip.reass.l4_src_port = 0;
542           vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
543         }
544       else
545         {
546           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
547           if (IP_PROTOCOL_TCP == ip0->protocol)
548             {
549               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
550                 ((tcp_header_t *) (ip0 + 1))->flags;
551               vnet_buffer (b0)->ip.reass.tcp_ack_number =
552                 ((tcp_header_t *) (ip0 + 1))->ack_number;
553               vnet_buffer (b0)->ip.reass.tcp_seq_number =
554                 ((tcp_header_t *) (ip0 + 1))->seq_number;
555             }
556           else if (IP_PROTOCOL_ICMP == ip0->protocol)
557             {
558               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
559                 ((icmp46_header_t *) (ip0 + 1))->type;
560             }
561           vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
562           vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
563         }
564       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
565         {
566           ip4_sv_reass_add_trace (
567             vm, node, NULL, from[(b - 2) - bufs], REASS_PASSTHROUGH,
568             vnet_buffer (b0)->ip.reass.ip_proto,
569             vnet_buffer (b0)->ip.reass.l4_src_port,
570             vnet_buffer (b0)->ip.reass.l4_dst_port,
571             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
572         }
573       if (is_feature)
574         {
575           vnet_feature_next (&next1, b1);
576         }
577       else
578         {
579           next1 = is_custom ? vnet_buffer (b1)->ip.reass.next_index :
580             IP4_SV_REASSEMBLY_NEXT_INPUT;
581         }
582       vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
583       vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
584       if (l4_layer_truncated (ip1))
585         {
586           vnet_buffer (b1)->ip.reass.l4_layer_truncated = 1;
587           vnet_buffer (b1)->ip.reass.l4_src_port = 0;
588           vnet_buffer (b1)->ip.reass.l4_dst_port = 0;
589         }
590       else
591         {
592           vnet_buffer (b1)->ip.reass.l4_layer_truncated = 0;
593           if (IP_PROTOCOL_TCP == ip1->protocol)
594             {
595               vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
596                 ((tcp_header_t *) (ip1 + 1))->flags;
597               vnet_buffer (b1)->ip.reass.tcp_ack_number =
598                 ((tcp_header_t *) (ip1 + 1))->ack_number;
599               vnet_buffer (b1)->ip.reass.tcp_seq_number =
600                 ((tcp_header_t *) (ip1 + 1))->seq_number;
601             }
602           else if (IP_PROTOCOL_ICMP == ip1->protocol)
603             {
604               vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
605                 ((icmp46_header_t *) (ip1 + 1))->type;
606             }
607           vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
608           vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
609         }
610       if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
611         {
612           ip4_sv_reass_add_trace (
613             vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
614             vnet_buffer (b1)->ip.reass.ip_proto,
615             vnet_buffer (b1)->ip.reass.l4_src_port,
616             vnet_buffer (b1)->ip.reass.l4_dst_port,
617             vnet_buffer (b1)->ip.reass.l4_layer_truncated);
618         }
619
620       n_left_from -= 2;
621       next[0] = next0;
622       next[1] = next1;
623       next += 2;
624     }
625
626   while (n_left_from > 0)
627     {
628       vlib_buffer_t *b0;
629       u32 next0;
630       b0 = *b;
631       b++;
632
633       ip4_header_t *ip0 =
634         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
635                                      (is_output_feature ? 1 : 0) *
636                                      vnet_buffer (b0)->
637                                      ip.save_rewrite_length);
638       if (PREDICT_FALSE
639           (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0)))
640         {
641           // fragment found, go slow path
642           b -= 1;
643           if (b - bufs > 0)
644             {
645               vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
646                                            b - bufs);
647             }
648           goto slow_path;
649         }
650       if (is_feature)
651         {
652           vnet_feature_next (&next0, b0);
653         }
654       else
655         {
656           next0 =
657             is_custom ? vnet_buffer (b0)->ip.
658             reass.next_index : IP4_SV_REASSEMBLY_NEXT_INPUT;
659         }
660       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
661       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
662       if (l4_layer_truncated (ip0))
663         {
664           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
665         }
666       else
667         {
668           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
669           if (IP_PROTOCOL_TCP == ip0->protocol)
670             {
671               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
672                 ((tcp_header_t *) (ip0 + 1))->flags;
673               vnet_buffer (b0)->ip.reass.tcp_ack_number =
674                 ((tcp_header_t *) (ip0 + 1))->ack_number;
675               vnet_buffer (b0)->ip.reass.tcp_seq_number =
676                 ((tcp_header_t *) (ip0 + 1))->seq_number;
677             }
678           else if (IP_PROTOCOL_ICMP == ip0->protocol)
679             {
680               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
681                 ((icmp46_header_t *) (ip0 + 1))->type;
682             }
683           vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
684           vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
685         }
686       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
687         {
688           ip4_sv_reass_add_trace (
689             vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
690             vnet_buffer (b0)->ip.reass.ip_proto,
691             vnet_buffer (b0)->ip.reass.l4_src_port,
692             vnet_buffer (b0)->ip.reass.l4_dst_port,
693             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
694         }
695
696       n_left_from -= 1;
697       next[0] = next0;
698       next += 1;
699     }
700
701   vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
702                                frame->n_vectors);
703
704   goto done;
705
706 slow_path:
707
708   from += b - bufs;
709
710   while (n_left_from > 0)
711     {
712       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
713
714       while (n_left_from > 0 && n_left_to_next > 0)
715         {
716           u32 bi0;
717           vlib_buffer_t *b0;
718           u32 next0;
719           u32 error0 = IP4_ERROR_NONE;
720
721           bi0 = from[0];
722           b0 = vlib_get_buffer (vm, bi0);
723
724           ip4_header_t *ip0 =
725             (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
726                                          (is_output_feature ? 1 : 0) *
727                                          vnet_buffer (b0)->
728                                          ip.save_rewrite_length);
729           if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
730             {
731               // this is a regular packet - no fragmentation
732               if (is_custom)
733                 {
734                   next0 = vnet_buffer (b0)->ip.reass.next_index;
735                 }
736               else
737                 {
738                   next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
739                 }
740               vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
741               vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
742               if (l4_layer_truncated (ip0))
743                 {
744                   vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
745                   vnet_buffer (b0)->ip.reass.l4_src_port = 0;
746                   vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
747                 }
748               else
749                 {
750                   vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
751                   if (IP_PROTOCOL_TCP == ip0->protocol)
752                     {
753                       vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
754                         ((tcp_header_t *) (ip0 + 1))->flags;
755                       vnet_buffer (b0)->ip.reass.tcp_ack_number =
756                         ((tcp_header_t *) (ip0 + 1))->ack_number;
757                       vnet_buffer (b0)->ip.reass.tcp_seq_number =
758                         ((tcp_header_t *) (ip0 + 1))->seq_number;
759                     }
760                   else if (IP_PROTOCOL_ICMP == ip0->protocol)
761                     {
762                       vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
763                         ((icmp46_header_t *) (ip0 + 1))->type;
764                     }
765                   vnet_buffer (b0)->ip.reass.l4_src_port =
766                     ip4_get_port (ip0, 1);
767                   vnet_buffer (b0)->ip.reass.l4_dst_port =
768                     ip4_get_port (ip0, 0);
769                 }
770               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
771                 {
772                   ip4_sv_reass_add_trace (
773                     vm, node, NULL, bi0, REASS_PASSTHROUGH,
774                     vnet_buffer (b0)->ip.reass.ip_proto,
775                     vnet_buffer (b0)->ip.reass.l4_src_port,
776                     vnet_buffer (b0)->ip.reass.l4_dst_port,
777                     vnet_buffer (b0)->ip.reass.l4_layer_truncated);
778                 }
779               goto packet_enqueue;
780             }
781           const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
782           const u32 fragment_length =
783             clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
784           const u32 fragment_last = fragment_first + fragment_length - 1;
785           if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
786             {
787               next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
788               error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
789               b0->error = node->errors[error0];
790               goto packet_enqueue;
791             }
792           ip4_sv_reass_kv_t kv;
793           u8 do_handoff = 0;
794
795           kv.k.as_u64[0] =
796             (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
797                            vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
798             (u64) ip0->src_address.as_u32 << 32;
799           kv.k.as_u64[1] =
800             (u64) ip0->dst_address.
801             as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
802
803           ip4_sv_reass_t *reass =
804             ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);
805
806           if (PREDICT_FALSE (do_handoff))
807             {
808               next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
809               vnet_buffer (b0)->ip.reass.owner_thread_index =
810                 kv.v.thread_index;
811               goto packet_enqueue;
812             }
813
814           if (!reass)
815             {
816               next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
817               error0 = IP4_ERROR_REASS_LIMIT_REACHED;
818               b0->error = node->errors[error0];
819               goto packet_enqueue;
820             }
821
822           if (reass->is_complete)
823             {
824               if (is_custom)
825                 {
826                   next0 = vnet_buffer (b0)->ip.reass.next_index;
827                 }
828               else
829                 {
830                   next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
831                 }
832               vnet_buffer (b0)->ip.reass.is_non_first_fragment =
833                 ! !fragment_first;
834               vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
835               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
836                 reass->icmp_type_or_tcp_flags;
837               vnet_buffer (b0)->ip.reass.tcp_ack_number =
838                 reass->tcp_ack_number;
839               vnet_buffer (b0)->ip.reass.tcp_seq_number =
840                 reass->tcp_seq_number;
841               vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
842               vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
843               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
844                 {
845                   ip4_sv_reass_add_trace (
846                     vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
847                     reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
848                     vnet_buffer (b0)->ip.reass.l4_layer_truncated);
849                 }
850               goto packet_enqueue;
851             }
852
853           ip4_sv_reass_rc_t rc =
854             ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0);
855           u32 counter = ~0;
856           switch (rc)
857             {
858             case IP4_SV_REASS_RC_OK:
859               /* nothing to do here */
860               break;
861             case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
862               counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
863               break;
864             case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
865               counter = IP4_ERROR_REASS_UNSUPP_IP_PROT;
866               break;
867             }
868           if (~0 != counter)
869             {
870               vlib_node_increment_counter (vm, node->node_index, counter, 1);
871               ip4_sv_reass_free (vm, rm, rt, reass);
872               goto next_packet;
873             }
874           if (reass->is_complete)
875             {
876               u32 idx;
877               vec_foreach_index (idx, reass->cached_buffers)
878               {
879                 u32 bi0 = vec_elt (reass->cached_buffers, idx);
880                 vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
881                 ip0 =
882                   (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
883                                                (is_output_feature ? 1 : 0) *
884                                                vnet_buffer (b0)->
885                                                ip.save_rewrite_length);
886                 u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
887                 if (is_feature)
888                   {
889                     vnet_feature_next (&next0, b0);
890                   }
891                 if (is_custom)
892                   {
893                     next0 = vnet_buffer (b0)->ip.reass.next_index;
894                   }
895                 if (0 == n_left_to_next)
896                   {
897                     vlib_put_next_frame (vm, node, next_index,
898                                          n_left_to_next);
899                     vlib_get_next_frame (vm, node, next_index, to_next,
900                                          n_left_to_next);
901                   }
902                 to_next[0] = bi0;
903                 to_next += 1;
904                 n_left_to_next -= 1;
905                 vnet_buffer (b0)->ip.reass.is_non_first_fragment =
906                   ! !ip4_get_fragment_offset (ip0);
907                 vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
908                 vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
909                   reass->icmp_type_or_tcp_flags;
910                 vnet_buffer (b0)->ip.reass.tcp_ack_number =
911                   reass->tcp_ack_number;
912                 vnet_buffer (b0)->ip.reass.tcp_seq_number =
913                   reass->tcp_seq_number;
914                 vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
915                 vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
916                 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
917                   {
918                     ip4_sv_reass_add_trace (
919                       vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
920                       reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
921                       vnet_buffer (b0)->ip.reass.l4_layer_truncated);
922                   }
923                 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
924                                                  to_next, n_left_to_next, bi0,
925                                                  next0);
926               }
927               _vec_len (reass->cached_buffers) = 0;     // buffers are owned by frame now
928             }
929           goto next_packet;
930
931         packet_enqueue:
932           to_next[0] = bi0;
933           to_next += 1;
934           n_left_to_next -= 1;
935           if (is_feature && IP4_ERROR_NONE == error0)
936             {
937               b0 = vlib_get_buffer (vm, bi0);
938               vnet_feature_next (&next0, b0);
939             }
940           vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
941                                            to_next, n_left_to_next,
942                                            bi0, next0);
943
944         next_packet:
945           from += 1;
946           n_left_from -= 1;
947         }
948
949       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
950     }
951
952 done:
953   clib_spinlock_unlock (&rt->lock);
954   return frame->n_vectors;
955 }
956
/* Human-readable error strings, generated from the master ip4 error list so
 * that indices line up with the IP4_ERROR_* counters incremented above. */
static char *ip4_sv_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
962
963 VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
964                                   vlib_node_runtime_t * node,
965                                   vlib_frame_t * frame)
966 {
967   return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
968                               false /* is_output_feature */ ,
969                               false /* is_custom */ );
970 }
971
/* *INDENT-OFF* */
/* Graph node registration for the plain ip4 SVR node. */
VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
    .name = "ip4-sv-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
989
990 VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
991                                           vlib_node_runtime_t * node,
992                                           vlib_frame_t * frame)
993 {
994   return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
995                               false /* is_output_feature */ ,
996                               false /* is_custom */ );
997 }
998
/* *INDENT-OFF* */
/* Graph node registration for the feature-arc SVR node. */
VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
    .name = "ip4-sv-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1015
/* *INDENT-OFF* */
/* Hook the SVR feature node into the ip4-unicast arc, before ip4-lookup. */
VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-sv-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1024
1025 VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
1026                                                  vlib_node_runtime_t * node,
1027                                                  vlib_frame_t * frame)
1028 {
1029   return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
1030                               true /* is_output_feature */ ,
1031                               false /* is_custom */ );
1032 }
1033
1034
/* *INDENT-OFF* */
/* Graph node registration for the output feature-arc SVR node. */
VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
    .name = "ip4-sv-reassembly-output-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1051
/* *INDENT-OFF* */
/* Hook the SVR output-feature node into the ip4-output arc. */
VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
    .arc_name = "ip4-output",
    .node_name = "ip4-sv-reassembly-output-feature",
    .runs_before = 0,
    .runs_after = 0,
};
/* *INDENT-ON* */
1060
/* *INDENT-OFF* */
/* Graph node registration for the custom-next SVR node; callers add their
 * own next nodes via ip4_sv_reass_custom_register_next_node(). */
VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
    .name = "ip4-sv-reassembly-custom-next",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1078
1079 VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
1080                                          vlib_node_runtime_t * node,
1081                                          vlib_frame_t * frame)
1082 {
1083   return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
1084                               false /* is_output_feature */ ,
1085                               true /* is_custom */ );
1086 }
1087
1088 #ifndef CLIB_MARCH_VARIANT
1089 always_inline u32
1090 ip4_sv_reass_get_nbuckets ()
1091 {
1092   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1093   u32 nbuckets;
1094   u8 i;
1095
1096   nbuckets = (u32) (rm->max_reass_n / IP4_SV_REASS_HT_LOAD_FACTOR);
1097
1098   for (i = 0; i < 31; i++)
1099     if ((1 << i) >= nbuckets)
1100       break;
1101   nbuckets = 1 << i;
1102
1103   return nbuckets;
1104 }
1105 #endif /* CLIB_MARCH_VARIANT */
1106
/* Events understood by the expire-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_sv_reass_event_t;
1111
/* Context threaded through the bihash walk used when growing the hash:
 * new_hash receives the copied entries; failure is set on any insert error. */
typedef struct
{
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;
1117
1118 #ifndef CLIB_MARCH_VARIANT
1119 static int
1120 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1121 {
1122   ip4_rehash_cb_ctx *ctx = _ctx;
1123   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1124     {
1125       ctx->failure = 1;
1126     }
1127   return (BIHASH_WALK_CONTINUE);
1128 }
1129
1130 static void
1131 ip4_sv_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1132                          u32 max_reassembly_length,
1133                          u32 expire_walk_interval_ms)
1134 {
1135   ip4_sv_reass_main.timeout_ms = timeout_ms;
1136   ip4_sv_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1137   ip4_sv_reass_main.max_reass_n = max_reassemblies;
1138   ip4_sv_reass_main.max_reass_len = max_reassembly_length;
1139   ip4_sv_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1140 }
1141
/**
 * @brief API handler: update SVR parameters and grow the hash if needed.
 *
 * Applies the new parameters, pokes the expire-walk process so it picks up
 * the new interval, and, if the configured maximum grew enough to need more
 * buckets, rebuilds the bihash by walking the old table into a new one.
 *
 * @return 0 on success, -1 if rehashing failed (old table left intact).
 *
 * NOTE(review): the rehash walks and swaps ip4_sv_reass_main.hash without
 * taking the per-thread locks — presumably callers serialize this against
 * the datapath (barrier sync); verify before relying on it.
 */
vnet_api_error_t
ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
                  u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
  ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
                           max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
                             ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_sv_reass_get_nbuckets ();
  if (ip4_sv_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      /* grow only: copy all existing entries into a bigger table */
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_sv_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          /* keep the old table; discard the partially-filled new one */
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_sv_reass_main.hash);
          clib_memcpy_fast (&ip4_sv_reass_main.hash, &new_hash,
                            sizeof (ip4_sv_reass_main.hash));
          clib_bihash_copied (&ip4_sv_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1179
1180 vnet_api_error_t
1181 ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1182                   u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1183 {
1184   *timeout_ms = ip4_sv_reass_main.timeout_ms;
1185   *max_reassemblies = ip4_sv_reass_main.max_reass_n;
1186   *max_reassembly_length = ip4_sv_reass_main.max_reass_len;
1187   *expire_walk_interval_ms = ip4_sv_reass_main.expire_walk_interval_ms;
1188   return 0;
1189 }
1190
/**
 * @brief One-time plugin init: per-thread state, hash table, frame queues.
 *
 * Runs once at startup; sets up one reassembly pool + spinlock per worker
 * (plus the main thread), resolves the node indices used later, applies the
 * default parameters and sizes the bihash accordingly.
 */
static clib_error_t *
ip4_sv_reass_init_function (vlib_main_t * vm)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  /* one context pool and lock per worker thread (plus main thread) */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_sv_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    /* NOTE(review): max_reass_n is still 0 here (set_params runs below),
     * so this preallocation is a no-op; pools grow on demand anyway. */
    pool_alloc (rt->pool, rm->max_reass_n);
    rt->lru_first = rt->lru_last = ~0;
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_sv_reass_expire_node_idx = node->index;

  /* defaults must be in place before sizing the hash below */
  ip4_sv_reass_set_params (IP4_SV_REASS_TIMEOUT_DEFAULT_MS,
                           IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT,
                           IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                           IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_sv_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes to reach other workers */
  rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  rm->output_feature_use_refcount_per_intf = NULL;

  return error;
}

VLIB_INIT_FUNCTION (ip4_sv_reass_init_function);
1238 #endif /* CLIB_MARCH_VARIANT */
1239
/**
 * @brief Process node: periodically free timed-out reassembly contexts.
 *
 * Wakes every expire_walk_interval_ms (or immediately on an
 * IP4_EVENT_CONFIG_CHANGED event) and, for each thread's pool under that
 * thread's spinlock, frees every reassembly whose last_heard timestamp is
 * older than the configured timeout.
 */
static uword
ip4_sv_reass_walk_expired (vlib_main_t *vm,
                           CLIB_UNUSED (vlib_node_runtime_t *node),
                           CLIB_UNUSED (vlib_frame_t *f))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:
          /* no events => timeout */
          /* fallthrough */
        case IP4_EVENT_CONFIG_CHANGED:
          /* nothing to do here */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_sv_reass_t *reass;
      /* indices are collected first and freed afterwards so the pool is
       * not mutated while pool_foreach_index is walking it */
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          /* each pool is protected by its owning thread's spinlock */
          ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool)  {
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_sv_reass_free (vm, rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          /* reuse the event vector across iterations */
          _vec_len (event_data) = 0;
        }
    }

  return 0;
}
1312
/* *INDENT-OFF* */
/* Registration of the expire-walk process node. */
VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
    .function = ip4_sv_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-sv-reassembly-expire-walk",
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,

};
/* *INDENT-ON* */
1324
1325 static u8 *
1326 format_ip4_sv_reass_key (u8 * s, va_list * args)
1327 {
1328   ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
1329   s =
1330     format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1331             key->fib_index, format_ip4_address, &key->src, format_ip4_address,
1332             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1333   return s;
1334 }
1335
/* Pretty-print one reassembly context: its key plus every cached buffer,
 * following each buffer's next_buffer chain. */
static u8 *
format_ip4_sv_reass (u8 * s, va_list * args)
{
  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
  ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);

  s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
              reass->id, format_ip4_sv_reass_key, &reass->key,
              reass->trace_op_counter);

  vlib_buffer_t *b;
  u32 *bip;
  u32 counter = 0;
  vec_foreach (bip, reass->cached_buffers)
  {
    u32 bi = *bip;
    /* walk the chained-buffer list of this cached fragment */
    do
      {
        b = vlib_get_buffer (vm, bi);
        s = format (s, "  #%03u: bi: %u, ", counter, bi);
        ++counter;
        bi = b->next_buffer;
      }
    while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
  }
  return s;
}
1363
/**
 * @brief CLI handler for "show ip4-sv-reassembly [details]".
 *
 * Prints the per-thread reassembly counts (and, with "details", every
 * active context) plus the configured limits.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
                unformat_input_t * input,
                CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_sv_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      /* hold the per-thread lock while walking that thread's pool */
      ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
        {
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool) {
            vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
          }
          /* *INDENT-ON* */
        }
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
                   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured concurrent shallow virtual IP4 reassemblies per worker-thread: %lu\n",
                   (long unsigned) rm->max_reass_n);
  vlib_cli_output (vm,
                   "Maximum configured amount of fragments per shallow "
                   "virtual IP4 reassembly: %lu\n",
                   (long unsigned) rm->max_reass_len);
  vlib_cli_output (vm,
                   "Maximum configured shallow virtual IP4 reassembly timeout: %lums\n",
                   (long unsigned) rm->timeout_ms);
  vlib_cli_output (vm,
                   "Maximum configured shallow virtual IP4 reassembly expire walk interval: %lums\n",
                   (long unsigned) rm->expire_walk_interval_ms);
  return 0;
}
1417
/* *INDENT-OFF* */
/* CLI command registration: "show ip4-sv-reassembly [details]". */
VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
    .path = "show ip4-sv-reassembly",
    .short_help = "show ip4-sv-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1425
1426 #ifndef CLIB_MARCH_VARIANT
/* Public API wrapper: enable/disable the ip4-unicast SVR feature on an
 * interface; reference counting is handled by the _with_refcnt variant. */
vnet_api_error_t
ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index,
                                                  enable_disable);
}
1433 #endif /* CLIB_MARCH_VARIANT */
1434
1435
/* Errors counted by the handoff nodes. */
#define foreach_ip4_sv_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_sv_reass_handoff_error
#undef _
    IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_sv_reass_handoff_error_t;
1447
/* Error strings for the handoff nodes, generated from the list above. */
static char *ip4_sv_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_sv_reass_handoff_error
#undef _
};
1453
/* Per-packet trace record for the handoff nodes. */
typedef struct
{
  u32 next_worker_index;	/* thread the buffer was handed off to */
} ip4_sv_reass_handoff_trace_t;
1458
1459 static u8 *
1460 format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
1461 {
1462   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1463   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1464   ip4_sv_reass_handoff_trace_t *t =
1465     va_arg (*args, ip4_sv_reass_handoff_trace_t *);
1466
1467   s =
1468     format (s, "ip4-sv-reassembly-handoff: next-worker %d",
1469             t->next_worker_index);
1470
1471   return s;
1472 }
1473
/**
 * @brief Hand buffers off to the worker that owns their reassembly context.
 *
 * Reads the owner thread index stored in vnet_buffer()->ip.reass by the
 * SVR node and enqueues each buffer to that thread's frame queue.  Buffers
 * that cannot be enqueued are counted (and dropped) as congestion.
 *
 * @param is_feature selects the feature-arc frame queue vs. the plain one
 * @return number of vectors in the frame
 */
always_inline uword
ip4_sv_reass_handoff_node_inline (vlib_main_t * vm,
                                  vlib_node_runtime_t * node,
                                  vlib_frame_t * frame, bool is_feature)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* feature and non-feature variants use separate frame queues */
  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
        {
          ip4_sv_reass_handoff_trace_t *t =
            vlib_add_trace (vm, node, b[0], sizeof (*t));
          t->next_worker_index = ti[0];
        }

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
                                         thread_indices, frame->n_vectors, 1);

  /* anything that did not fit on the target queue is a congestion drop */
  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
                                 IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
                                 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1521
1522 VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
1523                                           vlib_node_runtime_t * node,
1524                                           vlib_frame_t * frame)
1525 {
1526   return ip4_sv_reass_handoff_node_inline (vm, node, frame,
1527                                            false /* is_feature */ );
1528 }
1529
1530
/* *INDENT-OFF* */
/* Registration of the plain handoff node. */
VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
  .name = "ip4-sv-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1546
1547
1548 /* *INDENT-OFF* */
1549 VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
1550                                                     vlib_node_runtime_t *
1551                                                     node,
1552                                                     vlib_frame_t * frame)
1553 {
1554   return ip4_sv_reass_handoff_node_inline (vm, node, frame,
1555                                              true /* is_feature */ );
1556 }
1557 /* *INDENT-ON* */
1558
1559
/* *INDENT-OFF* */
/* Registration of the feature-arc handoff node. */
VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
  .name = "ip4-sv-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1575
1576 #ifndef CLIB_MARCH_VARIANT
1577 int
1578 ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1579 {
1580   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1581   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1582   if (is_enable)
1583     {
1584       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1585         {
1586           ++rm->feature_use_refcount_per_intf[sw_if_index];
1587           return vnet_feature_enable_disable ("ip4-unicast",
1588                                               "ip4-sv-reassembly-feature",
1589                                               sw_if_index, 1, 0, 0);
1590         }
1591       ++rm->feature_use_refcount_per_intf[sw_if_index];
1592     }
1593   else
1594     {
1595       if (rm->feature_use_refcount_per_intf[sw_if_index])
1596         --rm->feature_use_refcount_per_intf[sw_if_index];
1597       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1598         return vnet_feature_enable_disable ("ip4-unicast",
1599                                             "ip4-sv-reassembly-feature",
1600                                             sw_if_index, 0, 0, 0);
1601     }
1602   return 0;
1603 }
1604
/* Register a caller-supplied next node on the custom SVR node and return the
 * next-index to store in vnet_buffer()->ip.reass.next_index. */
uword
ip4_sv_reass_custom_register_next_node (uword node_index)
{
  return vlib_node_add_next (vlib_get_main (), ip4_sv_reass_custom_node.index,
                             node_index);
}
1611
1612 int
1613 ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
1614                                                 int is_enable)
1615 {
1616   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1617   vec_validate (rm->output_feature_use_refcount_per_intf, sw_if_index);
1618   if (is_enable)
1619     {
1620       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1621         {
1622           ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1623           return vnet_feature_enable_disable ("ip4-output",
1624                                               "ip4-sv-reassembly-output-feature",
1625                                               sw_if_index, 1, 0, 0);
1626         }
1627       ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1628     }
1629   else
1630     {
1631       if (rm->output_feature_use_refcount_per_intf[sw_if_index])
1632         --rm->output_feature_use_refcount_per_intf[sw_if_index];
1633       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1634         return vnet_feature_enable_disable ("ip4-output",
1635                                             "ip4-sv-reassembly-output-feature",
1636                                             sw_if_index, 0, 0, 0);
1637     }
1638   return 0;
1639 }
1640 #endif
1641
1642 /*
1643  * fd.io coding-style-patch-verification: ON
1644  *
1645  * Local Variables:
1646  * eval: (c-set-style "gnu")
1647  * End:
1648  */