/* ip: Use .api declared error counters
 * (from: src/vnet/ip/reass/ip4_sv_reass.c) */
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Shallow Virtual Reassembly.
19  *
20  * This file contains the source code for IPv4 Shallow Virtual reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vnet/ip/ip4_to_ip6.h>
27 #include <vppinfra/fifo.h>
28 #include <vppinfra/bihash_16_8.h>
29 #include <vnet/ip/reass/ip4_sv_reass.h>
30
31 #define MSEC_PER_SEC 1000
32 #define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
33 #define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000      // 10 seconds default
34 #define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
35 #define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
36 #define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)
37
/** Return codes for ip4_sv_reass_update() — outcome of folding one fragment
 * into a shallow-virtual reassembly context. */
typedef enum
{
  IP4_SV_REASS_RC_OK,                 /**< fragment accepted */
  IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS, /**< cached fragment count exceeds max_reass_len */
  IP4_SV_REASS_RC_UNSUPP_IP_PROTO,    /**< first fragment's L4 ports could not be extracted */
} ip4_sv_reass_rc_t;
44
/** Bihash key identifying one reassembly: the classic IPv4 reassembly tuple
 * (fib, src, dst, fragment id, protocol), packed into exactly 16 bytes so it
 * can double as a clib_bihash_16_8 key via as_u64[]. */
typedef struct
{
  union
  {
    struct
    {
      u32 fib_index;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused; /* padding to keep the key exactly 16 bytes */
    };
    u64 as_u64[2]; /* raw view used when filling the bihash kv */
  };
} ip4_sv_reass_key_t;
61
/** Bihash value: locates the reassembly context — pool index on the owning
 * thread. thread_index is used to decide whether a handoff is required. */
typedef union
{
  struct
  {
    u32 reass_index;  /* index into the owning thread's reass pool */
    u32 thread_index; /* thread that owns this reassembly */
  };
  u64 as_u64; /* raw view for the 8-byte bihash value */
} ip4_sv_reass_val_t;
71
/** Convenience overlay of key+value onto a clib_bihash_kv_16_8_t so lookups
 * and inserts can be done without repacking. */
typedef union
{
  struct
  {
    ip4_sv_reass_key_t k;
    ip4_sv_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_sv_reass_kv_t;
81
/** Per-reassembly context. "Shallow virtual" reassembly never rebuilds the
 * packet; it only extracts L4 info from the first fragment and stamps it onto
 * all fragments of the same datagram. */
typedef struct
{
  // hash table key
  ip4_sv_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // trace operation counter
  u32 trace_op_counter;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // buffer indexes of buffers in this reassembly in chronological order -
  // including overlaps and duplicate fragments
  u32 *cached_buffers;
  // set to true when this reassembly is completed
  // (i.e. the first fragment was seen and L4 info below is valid)
  bool is_complete;
  // ip protocol
  u8 ip_proto;
  // icmp type (for ICMP) or tcp flags (for TCP), taken from first fragment
  u8 icmp_type_or_tcp_flags;
  u32 tcp_ack_number;
  u32 tcp_seq_number;
  // l4 src port
  u16 l4_src_port;
  // l4 dst port
  u16 l4_dst_port;
  // cached next node index (custom apps)
  u32 next_index;
  // lru indexes - doubly-linked list used for oldest-first eviction
  u32 lru_prev;
  u32 lru_next;
} ip4_sv_reass_t;
113
/** Per-worker-thread reassembly state; each thread owns its own pool and LRU
 * list, guarded by a spinlock. */
typedef struct
{
  ip4_sv_reass_t *pool; /* pool of reassembly contexts owned by this thread */
  u32 reass_n;          /* number of active reassemblies in the pool */
  u32 id_counter;       /* monotonically increasing id source */
  clib_spinlock_t lock; /* protects pool and LRU list */
  // lru indexes - head (oldest) and tail (newest) of the LRU list
  u32 lru_first;
  u32 lru_last;

} ip4_sv_reass_per_thread_t;
125
/** Global state for the IPv4 shallow-virtual reassembly feature. */
typedef struct
{
  // IPv4 config
  u32 timeout_ms; /* reassembly timeout in milliseconds */
  f64 timeout;    /* same timeout, pre-converted to seconds */
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash; /* key -> (thread, reass index), shared by all threads */
  // per-thread data
  ip4_sv_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_sv_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;         /* frame queue for the non-feature node */
  u32 fq_feature_index; /* frame queue for the feature node */

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // reference count for enabling/disabling output feature - per interface
  u32 *output_feature_use_refcount_per_intf;

} ip4_sv_reass_main_t;
161
162 extern ip4_sv_reass_main_t ip4_sv_reass_main;
163
164 #ifndef CLIB_MARCH_VARIANT
165 ip4_sv_reass_main_t ip4_sv_reass_main;
166 #endif /* CLIB_MARCH_VARIANT */
167
/** Next-node indices for the reassembly graph nodes. */
typedef enum
{
  IP4_SV_REASSEMBLY_NEXT_INPUT,   /**< continue down ip4-input path */
  IP4_SV_REASSEMBLY_NEXT_DROP,    /**< drop (malformed / limit reached) */
  IP4_SV_REASSEMBLY_NEXT_HANDOFF, /**< hand off to the owning worker thread */
  IP4_SV_REASSEMBLY_N_NEXT,
} ip4_sv_reass_next_t;
175
/** Operations recorded in packet traces. */
typedef enum
{
  REASS_FRAGMENT_CACHE,   /**< fragment cached, waiting for first fragment */
  REASS_FINISH,           /**< first fragment seen, reassembly complete */
  REASS_FRAGMENT_FORWARD, /**< fragment forwarded with stamped L4 info */
  REASS_PASSTHROUGH,      /**< packet was not fragmented at all */
} ip4_sv_reass_trace_operation_e;
183
/** Per-packet trace record; formatted by format_ip4_sv_reass_trace(). */
typedef struct
{
  ip4_sv_reass_trace_operation_e action;
  u32 reass_id; /* reassembly id (meaningless for REASS_PASSTHROUGH) */
  u32 op_id;    /* per-reassembly operation sequence number */
  u8 ip_proto;
  u16 l4_src_port; /* network byte order */
  u16 l4_dst_port; /* network byte order */
  int l4_layer_truncated;
} ip4_sv_reass_trace_t;
194
195 extern vlib_node_registration_t ip4_sv_reass_node;
196 extern vlib_node_registration_t ip4_sv_reass_node_feature;
197
198 static u8 *
199 format_ip4_sv_reass_trace (u8 * s, va_list * args)
200 {
201   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
202   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
203   ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
204   if (REASS_PASSTHROUGH != t->action)
205     {
206       s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
207     }
208   switch (t->action)
209     {
210     case REASS_FRAGMENT_CACHE:
211       s = format (s, "[cached]");
212       break;
213     case REASS_FINISH:
214       s =
215         format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
216                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
217                 clib_net_to_host_u16 (t->l4_dst_port));
218       break;
219     case REASS_FRAGMENT_FORWARD:
220       s =
221         format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
222                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
223                 clib_net_to_host_u16 (t->l4_dst_port));
224       break;
225     case REASS_PASSTHROUGH:
226       s = format (s, "[not-fragmented]");
227       break;
228     }
229   if (t->l4_layer_truncated)
230     {
231       s = format (s, " [l4-layer-truncated]");
232     }
233   return s;
234 }
235
236 static void
237 ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
238                         ip4_sv_reass_t *reass, u32 bi,
239                         ip4_sv_reass_trace_operation_e action, u32 ip_proto,
240                         u16 l4_src_port, u16 l4_dst_port,
241                         int l4_layer_truncated)
242 {
243   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
244   if (pool_is_free_index
245       (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
246     {
247       // this buffer's trace is gone
248       b->flags &= ~VLIB_BUFFER_IS_TRACED;
249       return;
250     }
251   ip4_sv_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
252   if (reass)
253     {
254       t->reass_id = reass->id;
255       t->op_id = reass->trace_op_counter;
256       ++reass->trace_op_counter;
257     }
258   t->action = action;
259   t->ip_proto = ip_proto;
260   t->l4_src_port = l4_src_port;
261   t->l4_dst_port = l4_dst_port;
262   t->l4_layer_truncated = l4_layer_truncated;
263 #if 0
264   static u8 *s = NULL;
265   s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
266   printf ("%.*s\n", vec_len (s), s);
267   fflush (stdout);
268   vec_reset_length (s);
269 #endif
270 }
271
272
/**
 * @brief Free a reassembly context: remove its hash entry, free all cached
 * buffers, unlink it from the per-thread LRU list and return it to the pool.
 *
 * Caller must hold rt->lock.
 */
always_inline void
ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                   ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
{
  /* delete the bihash entry keyed by this reassembly's tuple */
  clib_bihash_kv_16_8_t kv;
  kv.key[0] = reass->key.as_u64[0];
  kv.key[1] = reass->key.as_u64[1];
  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
  /* release every buffer still cached by this reassembly */
  vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
  vec_free (reass->cached_buffers);
  reass->cached_buffers = NULL;
  /* unlink from the doubly-linked LRU list (~0 means "no neighbor") */
  if (~0 != reass->lru_prev)
    {
      ip4_sv_reass_t *lru_prev =
        pool_elt_at_index (rt->pool, reass->lru_prev);
      lru_prev->lru_next = reass->lru_next;
    }
  if (~0 != reass->lru_next)
    {
      ip4_sv_reass_t *lru_next =
        pool_elt_at_index (rt->pool, reass->lru_next);
      lru_next->lru_prev = reass->lru_prev;
    }
  /* fix up list head/tail if this element was first/last */
  if (rt->lru_first == reass - rt->pool)
    {
      rt->lru_first = reass->lru_next;
    }
  if (rt->lru_last == reass - rt->pool)
    {
      rt->lru_last = reass->lru_prev;
    }
  pool_put (rt->pool, reass);
  --rt->reass_n;
}
308
309 always_inline void
310 ip4_sv_reass_init (ip4_sv_reass_t * reass)
311 {
312   reass->cached_buffers = NULL;
313   reass->is_complete = false;
314 }
315
/**
 * @brief Look up the reassembly context for kv's key, creating one if needed.
 *
 * @param kv          in: key filled by caller; out: value (reass/thread index)
 * @param do_handoff  out: set to 1 when the context is owned by another
 *                    thread and the packet must be handed off
 * @return the context, or NULL on handoff or insert failure.
 *
 * Caller must hold rt->lock. An existing-but-expired context is freed and
 * replaced. When the pool is full, the oldest (LRU head) context is evicted.
 */
always_inline ip4_sv_reass_t *
ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                             ip4_sv_reass_per_thread_t * rt,
                             ip4_sv_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_sv_reass_t *reass = NULL;
  f64 now = vlib_time_now (vm);

again:

  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      /* entry exists - if another thread owns it, request handoff */
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      /* treat a timed-out context as non-existent */
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_sv_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  /* at capacity - evict the least recently used context */
  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
    {
      reass = pool_elt_at_index (rt->pool, rt->lru_first);
      ip4_sv_reass_free (vm, rm, rt, reass);
    }

  pool_get (rt->pool, reass);
  clib_memset (reass, 0, sizeof (*reass));
  /* id unique across threads: thread index in the high decimal digits */
  reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
  ++rt->id_counter;
  ip4_sv_reass_init (reass);
  ++rt->reass_n;
  reass->lru_prev = reass->lru_next = ~0;

  /* append to the LRU tail (newest) */
  if (~0 != rt->lru_last)
    {
      ip4_sv_reass_t *lru_last = pool_elt_at_index (rt->pool, rt->lru_last);
      reass->lru_prev = rt->lru_last;
      lru_last->lru_next = rt->lru_last = reass - rt->pool;
    }

  if (~0 == rt->lru_first)
    {
      rt->lru_first = rt->lru_last = reass - rt->pool;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  /* is_add=2: add only if not present (do not overwrite a racing insert) */
  int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
  if (rv)
    {
      ip4_sv_reass_free (vm, rm, rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
        goto again;
    }

  return reass;
}
392
/**
 * @brief Fold one fragment (bi0) into the reassembly.
 *
 * The first fragment (offset 0) carries the L4 header: extract protocol,
 * ports and TCP/ICMP details into the context and mark it complete. Every
 * fragment - first or not - is appended to cached_buffers for later release.
 *
 * @return IP4_SV_REASS_RC_OK on success,
 *         IP4_SV_REASS_RC_UNSUPP_IP_PROTO when ports cannot be extracted,
 *         IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS when the cache limit is hit.
 */
always_inline ip4_sv_reass_rc_t
ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
                     ip4_sv_reass_main_t *rm, ip4_header_t *ip0,
                     ip4_sv_reass_t *reass, u32 bi0)
{
  vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
  ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
  const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
  if (0 == fragment_first)
    {
      /* first fragment - the L4 header lives here */
      reass->ip_proto = ip0->protocol;
      reass->l4_src_port = ip4_get_port (ip0, 1);
      reass->l4_dst_port = ip4_get_port (ip0, 0);
      /* ip4_get_port returns 0 for protocols it cannot parse */
      if (!reass->l4_src_port || !reass->l4_dst_port)
        return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
      if (IP_PROTOCOL_TCP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags = ((tcp_header_t *) (ip0 + 1))->flags;
          reass->tcp_ack_number = ((tcp_header_t *) (ip0 + 1))->ack_number;
          reass->tcp_seq_number = ((tcp_header_t *) (ip0 + 1))->seq_number;
        }
      else if (IP_PROTOCOL_ICMP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags =
            ((icmp46_header_t *) (ip0 + 1))->type;
        }
      reass->is_complete = true;
      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FINISH, reass->ip_proto,
            reass->l4_src_port, reass->l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
    }
  /* cache every fragment (including duplicates/overlaps) for later release */
  vec_add1 (reass->cached_buffers, bi0);
  if (!reass->is_complete)
    {
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FRAGMENT_CACHE, ~0, ~0, ~0,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
      if (vec_len (reass->cached_buffers) > rm->max_reass_len)
        {
          rc = IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS;
        }
    }
  return rc;
}
445
446 always_inline int
447 l4_layer_truncated (ip4_header_t *ip)
448 {
449   static const int l4_layer_length[256] = {
450     [IP_PROTOCOL_TCP] = sizeof (tcp_header_t),
451     [IP_PROTOCOL_UDP] = sizeof (udp_header_t),
452     [IP_PROTOCOL_ICMP] = sizeof (icmp46_header_t),
453   };
454
455   return ((u8 *) ip + ip4_header_bytes (ip) + l4_layer_length[ip->protocol] >
456           (u8 *) ip + clib_net_to_host_u16 (ip->length));
457 }
458
459 always_inline uword
460 ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
461                      vlib_frame_t * frame, bool is_feature,
462                      bool is_output_feature, bool is_custom)
463 {
464   u32 *from = vlib_frame_vector_args (frame);
465   u32 n_left_from, n_left_to_next, *to_next, next_index;
466   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
467   ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
468   clib_spinlock_lock (&rt->lock);
469
470   n_left_from = frame->n_vectors;
471   next_index = node->cached_next_index;
472
473   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
474   vlib_get_buffers (vm, from, bufs, n_left_from);
475   u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
476   b = bufs;
477
478   /* optimistic case first - no fragments */
479   while (n_left_from >= 2)
480     {
481       vlib_buffer_t *b0, *b1;
482       u32 next0, next1;
483       b0 = *b;
484       b++;
485       b1 = *b;
486       b++;
487
488       /* Prefetch next iteration. */
489       if (PREDICT_TRUE (n_left_from >= 4))
490         {
491           vlib_buffer_t *p2, *p3;
492
493           p2 = *b;
494           p3 = *(b + 1);
495
496           vlib_prefetch_buffer_header (p2, LOAD);
497           vlib_prefetch_buffer_header (p3, LOAD);
498
499           clib_prefetch_load (p2->data);
500           clib_prefetch_load (p3->data);
501         }
502
503       ip4_header_t *ip0 =
504         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
505                                      (is_output_feature ? 1 : 0) *
506                                      vnet_buffer (b0)->
507                                      ip.save_rewrite_length);
508       ip4_header_t *ip1 =
509         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b1),
510                                      (is_output_feature ? 1 : 0) *
511                                      vnet_buffer (b1)->
512                                      ip.save_rewrite_length);
513
514       if (PREDICT_FALSE
515           (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
516           || (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
517         {
518           // fragment found, go slow path
519           b -= 2;
520           if (b - bufs > 0)
521             {
522               vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
523                                            b - bufs);
524             }
525           goto slow_path;
526         }
527       if (is_feature)
528         {
529           vnet_feature_next (&next0, b0);
530         }
531       else
532         {
533           next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
534             IP4_SV_REASSEMBLY_NEXT_INPUT;
535         }
536       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
537       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
538       if (l4_layer_truncated (ip0))
539         {
540           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
541           vnet_buffer (b0)->ip.reass.l4_src_port = 0;
542           vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
543         }
544       else
545         {
546           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
547           if (IP_PROTOCOL_TCP == ip0->protocol)
548             {
549               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
550                 ((tcp_header_t *) (ip0 + 1))->flags;
551               vnet_buffer (b0)->ip.reass.tcp_ack_number =
552                 ((tcp_header_t *) (ip0 + 1))->ack_number;
553               vnet_buffer (b0)->ip.reass.tcp_seq_number =
554                 ((tcp_header_t *) (ip0 + 1))->seq_number;
555             }
556           else if (IP_PROTOCOL_ICMP == ip0->protocol)
557             {
558               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
559                 ((icmp46_header_t *) (ip0 + 1))->type;
560             }
561           vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
562           vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
563         }
564       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
565         {
566           ip4_sv_reass_add_trace (
567             vm, node, NULL, from[(b - 2) - bufs], REASS_PASSTHROUGH,
568             vnet_buffer (b0)->ip.reass.ip_proto,
569             vnet_buffer (b0)->ip.reass.l4_src_port,
570             vnet_buffer (b0)->ip.reass.l4_dst_port,
571             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
572         }
573       if (is_feature)
574         {
575           vnet_feature_next (&next1, b1);
576         }
577       else
578         {
579           next1 = is_custom ? vnet_buffer (b1)->ip.reass.next_index :
580             IP4_SV_REASSEMBLY_NEXT_INPUT;
581         }
582       vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
583       vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
584       if (l4_layer_truncated (ip1))
585         {
586           vnet_buffer (b1)->ip.reass.l4_layer_truncated = 1;
587           vnet_buffer (b1)->ip.reass.l4_src_port = 0;
588           vnet_buffer (b1)->ip.reass.l4_dst_port = 0;
589         }
590       else
591         {
592           vnet_buffer (b1)->ip.reass.l4_layer_truncated = 0;
593           if (IP_PROTOCOL_TCP == ip1->protocol)
594             {
595               vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
596                 ((tcp_header_t *) (ip1 + 1))->flags;
597               vnet_buffer (b1)->ip.reass.tcp_ack_number =
598                 ((tcp_header_t *) (ip1 + 1))->ack_number;
599               vnet_buffer (b1)->ip.reass.tcp_seq_number =
600                 ((tcp_header_t *) (ip1 + 1))->seq_number;
601             }
602           else if (IP_PROTOCOL_ICMP == ip1->protocol)
603             {
604               vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
605                 ((icmp46_header_t *) (ip1 + 1))->type;
606             }
607           vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
608           vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
609         }
610       if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
611         {
612           ip4_sv_reass_add_trace (
613             vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
614             vnet_buffer (b1)->ip.reass.ip_proto,
615             vnet_buffer (b1)->ip.reass.l4_src_port,
616             vnet_buffer (b1)->ip.reass.l4_dst_port,
617             vnet_buffer (b1)->ip.reass.l4_layer_truncated);
618         }
619
620       n_left_from -= 2;
621       next[0] = next0;
622       next[1] = next1;
623       next += 2;
624     }
625
626   while (n_left_from > 0)
627     {
628       vlib_buffer_t *b0;
629       u32 next0;
630       b0 = *b;
631       b++;
632
633       ip4_header_t *ip0 =
634         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
635                                      (is_output_feature ? 1 : 0) *
636                                      vnet_buffer (b0)->
637                                      ip.save_rewrite_length);
638       if (PREDICT_FALSE
639           (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0)))
640         {
641           // fragment found, go slow path
642           b -= 1;
643           if (b - bufs > 0)
644             {
645               vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
646                                            b - bufs);
647             }
648           goto slow_path;
649         }
650       if (is_feature)
651         {
652           vnet_feature_next (&next0, b0);
653         }
654       else
655         {
656           next0 =
657             is_custom ? vnet_buffer (b0)->ip.
658             reass.next_index : IP4_SV_REASSEMBLY_NEXT_INPUT;
659         }
660       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
661       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
662       if (l4_layer_truncated (ip0))
663         {
664           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
665         }
666       else
667         {
668           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
669           if (IP_PROTOCOL_TCP == ip0->protocol)
670             {
671               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
672                 ((tcp_header_t *) (ip0 + 1))->flags;
673               vnet_buffer (b0)->ip.reass.tcp_ack_number =
674                 ((tcp_header_t *) (ip0 + 1))->ack_number;
675               vnet_buffer (b0)->ip.reass.tcp_seq_number =
676                 ((tcp_header_t *) (ip0 + 1))->seq_number;
677             }
678           else if (IP_PROTOCOL_ICMP == ip0->protocol)
679             {
680               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
681                 ((icmp46_header_t *) (ip0 + 1))->type;
682             }
683           vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
684           vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
685         }
686       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
687         {
688           ip4_sv_reass_add_trace (
689             vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
690             vnet_buffer (b0)->ip.reass.ip_proto,
691             vnet_buffer (b0)->ip.reass.l4_src_port,
692             vnet_buffer (b0)->ip.reass.l4_dst_port,
693             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
694         }
695
696       n_left_from -= 1;
697       next[0] = next0;
698       next += 1;
699     }
700
701   vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
702                                frame->n_vectors);
703
704   goto done;
705
706 slow_path:
707
708   from += b - bufs;
709
710   while (n_left_from > 0)
711     {
712       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
713
714       while (n_left_from > 0 && n_left_to_next > 0)
715         {
716           u32 bi0;
717           vlib_buffer_t *b0;
718           u32 next0;
719           u32 error0 = IP4_ERROR_NONE;
720
721           bi0 = from[0];
722           b0 = vlib_get_buffer (vm, bi0);
723
724           ip4_header_t *ip0 =
725             (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
726                                          (is_output_feature ? 1 : 0) *
727                                          vnet_buffer (b0)->
728                                          ip.save_rewrite_length);
729           if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
730             {
731               // this is a regular packet - no fragmentation
732               if (is_custom)
733                 {
734                   next0 = vnet_buffer (b0)->ip.reass.next_index;
735                 }
736               else
737                 {
738                   next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
739                 }
740               vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
741               vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
742               if (l4_layer_truncated (ip0))
743                 {
744                   vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
745                   vnet_buffer (b0)->ip.reass.l4_src_port = 0;
746                   vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
747                 }
748               else
749                 {
750                   vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
751                   if (IP_PROTOCOL_TCP == ip0->protocol)
752                     {
753                       vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
754                         ((tcp_header_t *) (ip0 + 1))->flags;
755                       vnet_buffer (b0)->ip.reass.tcp_ack_number =
756                         ((tcp_header_t *) (ip0 + 1))->ack_number;
757                       vnet_buffer (b0)->ip.reass.tcp_seq_number =
758                         ((tcp_header_t *) (ip0 + 1))->seq_number;
759                     }
760                   else if (IP_PROTOCOL_ICMP == ip0->protocol)
761                     {
762                       vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
763                         ((icmp46_header_t *) (ip0 + 1))->type;
764                     }
765                   vnet_buffer (b0)->ip.reass.l4_src_port =
766                     ip4_get_port (ip0, 1);
767                   vnet_buffer (b0)->ip.reass.l4_dst_port =
768                     ip4_get_port (ip0, 0);
769                 }
770               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
771                 {
772                   ip4_sv_reass_add_trace (
773                     vm, node, NULL, bi0, REASS_PASSTHROUGH,
774                     vnet_buffer (b0)->ip.reass.ip_proto,
775                     vnet_buffer (b0)->ip.reass.l4_src_port,
776                     vnet_buffer (b0)->ip.reass.l4_dst_port,
777                     vnet_buffer (b0)->ip.reass.l4_layer_truncated);
778                 }
779               goto packet_enqueue;
780             }
781           const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
782           const u32 fragment_length =
783             clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
784           const u32 fragment_last = fragment_first + fragment_length - 1;
785           if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
786             {
787               next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
788               error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
789               b0->error = node->errors[error0];
790               goto packet_enqueue;
791             }
792           ip4_sv_reass_kv_t kv;
793           u8 do_handoff = 0;
794
795           kv.k.as_u64[0] =
796             (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
797                            vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
798             (u64) ip0->src_address.as_u32 << 32;
799           kv.k.as_u64[1] =
800             (u64) ip0->dst_address.
801             as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
802
803           ip4_sv_reass_t *reass =
804             ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);
805
806           if (PREDICT_FALSE (do_handoff))
807             {
808               next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
809               vnet_buffer (b0)->ip.reass.owner_thread_index =
810                 kv.v.thread_index;
811               goto packet_enqueue;
812             }
813
814           if (!reass)
815             {
816               next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
817               error0 = IP4_ERROR_REASS_LIMIT_REACHED;
818               b0->error = node->errors[error0];
819               goto packet_enqueue;
820             }
821
822           if (reass->is_complete)
823             {
824               if (is_custom)
825                 {
826                   next0 = vnet_buffer (b0)->ip.reass.next_index;
827                 }
828               else
829                 {
830                   next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
831                 }
832               vnet_buffer (b0)->ip.reass.is_non_first_fragment =
833                 ! !fragment_first;
834               vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
835               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
836                 reass->icmp_type_or_tcp_flags;
837               vnet_buffer (b0)->ip.reass.tcp_ack_number =
838                 reass->tcp_ack_number;
839               vnet_buffer (b0)->ip.reass.tcp_seq_number =
840                 reass->tcp_seq_number;
841               vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
842               vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
843               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
844                 {
845                   ip4_sv_reass_add_trace (
846                     vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
847                     reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
848                     vnet_buffer (b0)->ip.reass.l4_layer_truncated);
849                 }
850               goto packet_enqueue;
851             }
852
853           ip4_sv_reass_rc_t rc =
854             ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0);
855           u32 counter = ~0;
856           switch (rc)
857             {
858             case IP4_SV_REASS_RC_OK:
859               /* nothing to do here */
860               break;
861             case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
862               counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
863               break;
864             case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
865               counter = IP4_ERROR_REASS_UNSUPP_IP_PROT;
866               break;
867             }
868           if (~0 != counter)
869             {
870               vlib_node_increment_counter (vm, node->node_index, counter, 1);
871               ip4_sv_reass_free (vm, rm, rt, reass);
872               goto next_packet;
873             }
874           if (reass->is_complete)
875             {
876               u32 idx;
877               vec_foreach_index (idx, reass->cached_buffers)
878               {
879                 u32 bi0 = vec_elt (reass->cached_buffers, idx);
880                 vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
881                 ip0 =
882                   (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
883                                                (is_output_feature ? 1 : 0) *
884                                                vnet_buffer (b0)->
885                                                ip.save_rewrite_length);
886                 u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
887                 if (is_feature)
888                   {
889                     vnet_feature_next (&next0, b0);
890                   }
891                 if (is_custom)
892                   {
893                     next0 = vnet_buffer (b0)->ip.reass.next_index;
894                   }
895                 if (0 == n_left_to_next)
896                   {
897                     vlib_put_next_frame (vm, node, next_index,
898                                          n_left_to_next);
899                     vlib_get_next_frame (vm, node, next_index, to_next,
900                                          n_left_to_next);
901                   }
902                 to_next[0] = bi0;
903                 to_next += 1;
904                 n_left_to_next -= 1;
905                 vnet_buffer (b0)->ip.reass.is_non_first_fragment =
906                   ! !ip4_get_fragment_offset (ip0);
907                 vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
908                 vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
909                   reass->icmp_type_or_tcp_flags;
910                 vnet_buffer (b0)->ip.reass.tcp_ack_number =
911                   reass->tcp_ack_number;
912                 vnet_buffer (b0)->ip.reass.tcp_seq_number =
913                   reass->tcp_seq_number;
914                 vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
915                 vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
916                 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
917                   {
918                     ip4_sv_reass_add_trace (
919                       vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
920                       reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
921                       vnet_buffer (b0)->ip.reass.l4_layer_truncated);
922                   }
923                 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
924                                                  to_next, n_left_to_next, bi0,
925                                                  next0);
926               }
927               vec_set_len (reass->cached_buffers,
928                            0); // buffers are owned by frame now
929             }
930           goto next_packet;
931
932         packet_enqueue:
933           to_next[0] = bi0;
934           to_next += 1;
935           n_left_to_next -= 1;
936           if (is_feature && IP4_ERROR_NONE == error0)
937             {
938               b0 = vlib_get_buffer (vm, bi0);
939               vnet_feature_next (&next0, b0);
940             }
941           vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
942                                            to_next, n_left_to_next,
943                                            bi0, next0);
944
945         next_packet:
946           from += 1;
947           n_left_from -= 1;
948         }
949
950       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
951     }
952
953 done:
954   clib_spinlock_unlock (&rt->lock);
955   return frame->n_vectors;
956 }
957
/* Graph-node entry point for the standalone (non-feature, non-custom)
 * shallow virtual reassembly node; thin wrapper around the shared inline. */
VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
                                  vlib_node_runtime_t * node,
                                  vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
                              false /* is_output_feature */ ,
                              false /* is_custom */ );
}
966
/* *INDENT-OFF* */
/* Registration for the standalone SV reassembly node. Fragments owned by
 * another worker are redirected via "ip4-sv-reassembly-handoff". */
VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
    .name = "ip4-sv-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
984
/* Entry point for the ip4-unicast feature-arc variant of SV reassembly. */
VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
                              false /* is_output_feature */ ,
                              false /* is_custom */ );
}
993
/* *INDENT-OFF* */
/* Registration for the feature-arc SV reassembly node; cross-thread
 * traffic goes to the feature-specific handoff node. */
VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
    .name = "ip4-sv-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1010
/* *INDENT-OFF* */
/* Hook the feature node into the ip4-unicast arc, before ip4-lookup. */
VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-sv-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1019
/* Entry point for the ip4-output feature-arc variant of SV reassembly. */
VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
                                                 vlib_node_runtime_t * node,
                                                 vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
                              true /* is_output_feature */ ,
                              false /* is_custom */ );
}
1028
1029
/* *INDENT-OFF* */
/* Registration for the output feature-arc SV reassembly node. */
VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
    .name = "ip4-sv-reassembly-output-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1046
/* *INDENT-OFF* */
/* Hook the output-feature node into the ip4-output arc (no ordering
 * constraints declared). */
VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
    .arc_name = "ip4-output",
    .node_name = "ip4-sv-reassembly-output-feature",
    .runs_before = 0,
    .runs_after = 0,
};
/* *INDENT-ON* */
1055
/* *INDENT-OFF* */
/* Registration for the custom-next-node variant; additional next nodes
 * are added at runtime via ip4_sv_reass_custom_register_next_node(). */
VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
    .name = "ip4-sv-reassembly-custom-next",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = IP4_N_ERROR,
    .error_counters = ip4_error_counters,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1073
/* Entry point for the custom variant: next index is taken from
 * vnet_buffer(b)->ip.reass.next_index set by the caller. */
VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
                                         vlib_node_runtime_t * node,
                                         vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
                              false /* is_output_feature */ ,
                              true /* is_custom */ );
}
1082
1083 #ifndef CLIB_MARCH_VARIANT
1084 always_inline u32
1085 ip4_sv_reass_get_nbuckets ()
1086 {
1087   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1088   u32 nbuckets;
1089   u8 i;
1090
1091   nbuckets = (u32) (rm->max_reass_n / IP4_SV_REASS_HT_LOAD_FACTOR);
1092
1093   for (i = 0; i < 31; i++)
1094     if ((1 << i) >= nbuckets)
1095       break;
1096   nbuckets = 1 << i;
1097
1098   return nbuckets;
1099 }
1100 #endif /* CLIB_MARCH_VARIANT */
1101
/* Process events understood by the expiry-walk node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_sv_reass_event_t;
1106
/* Context carried through the bihash walk performed during a rehash. */
typedef struct
{
  int failure;			/* set to 1 if any insertion into new_hash fails */
  clib_bihash_16_8_t *new_hash;	/* destination table being populated */
} ip4_rehash_cb_ctx;
1112
1113 #ifndef CLIB_MARCH_VARIANT
1114 static int
1115 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1116 {
1117   ip4_rehash_cb_ctx *ctx = _ctx;
1118   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1119     {
1120       ctx->failure = 1;
1121     }
1122   return (BIHASH_WALK_CONTINUE);
1123 }
1124
1125 static void
1126 ip4_sv_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1127                          u32 max_reassembly_length,
1128                          u32 expire_walk_interval_ms)
1129 {
1130   ip4_sv_reass_main.timeout_ms = timeout_ms;
1131   ip4_sv_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1132   ip4_sv_reass_main.max_reass_n = max_reassemblies;
1133   ip4_sv_reass_main.max_reass_len = max_reassembly_length;
1134   ip4_sv_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1135 }
1136
/**
 * @brief Apply new reassembly parameters, growing the hash table if needed.
 *
 * Stores the parameters, signals the expiry-walk process so it picks up the
 * new configuration, and, when the required bucket count grew, rebuilds the
 * bihash by copying every entry into a freshly sized table.
 *
 * @return 0 on success; -1 if rehashing failed (the old table stays in use,
 *         but note the parameters have already been updated at that point).
 */
vnet_api_error_t
ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
                  u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  /* snapshot the bucket count implied by the OLD max_reass_n */
  u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
  ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
                           max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
                             ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_sv_reass_get_nbuckets ();
  /* only grow; shrinking the table is never attempted */
  if (ip4_sv_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      /* copy every live entry into the new, larger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_sv_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          /* adopt the new table: free the old one, move the new struct in
           * place, then fix up internal pointers via clib_bihash_copied */
          clib_bihash_free_16_8 (&ip4_sv_reass_main.hash);
          clib_memcpy_fast (&ip4_sv_reass_main.hash, &new_hash,
                            sizeof (ip4_sv_reass_main.hash));
          clib_bihash_copied (&ip4_sv_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
1174
1175 vnet_api_error_t
1176 ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1177                   u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1178 {
1179   *timeout_ms = ip4_sv_reass_main.timeout_ms;
1180   *max_reassemblies = ip4_sv_reass_main.max_reass_n;
1181   *max_reassembly_length = ip4_sv_reass_main.max_reass_len;
1182   *expire_walk_interval_ms = ip4_sv_reass_main.expire_walk_interval_ms;
1183   return 0;
1184 }
1185
/**
 * @brief One-time plugin initialization: per-thread state, hash table,
 * default parameters, frame queues for handoff.
 */
static clib_error_t *
ip4_sv_reass_init_function (vlib_main_t * vm)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  /* one entry per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_sv_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    /* NOTE(review): max_reass_n is only set by ip4_sv_reass_set_params
     * below, so this pre-allocation presumably sees 0 here and the pool
     * grows on demand — confirm intended */
    pool_alloc (rt->pool, rm->max_reass_n);
    rt->lru_first = rt->lru_last = ~0;
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_sv_reass_expire_node_idx = node->index;

  ip4_sv_reass_set_params (IP4_SV_REASS_TIMEOUT_DEFAULT_MS,
                           IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT,
                           IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                           IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  /* size the hash from the defaults just applied */
  nbuckets = ip4_sv_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes (plain and feature variants) */
  rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  rm->output_feature_use_refcount_per_intf = NULL;

  return error;
}

VLIB_INIT_FUNCTION (ip4_sv_reass_init_function);
1233 #endif /* CLIB_MARCH_VARIANT */
1234
/**
 * @brief Process-node main loop: periodically free timed-out reassemblies.
 *
 * Sleeps for expire_walk_interval_ms (re-read each iteration, so config
 * changes take effect) or until IP4_EVENT_CONFIG_CHANGED wakes it early,
 * then walks every per-thread pool under its spinlock and frees contexts
 * whose last_heard timestamp is older than the configured timeout.
 */
static uword
ip4_sv_reass_walk_expired (vlib_main_t *vm,
                           CLIB_UNUSED (vlib_node_runtime_t *node),
                           CLIB_UNUSED (vlib_frame_t *f))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:
          /* no events => timeout */
          /* fallthrough */
        case IP4_EVENT_CONFIG_CHANGED:
          /* nothing to do here */
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_sv_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          /* collect expired indices first; freeing while iterating the
           * pool would invalidate the iteration */
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool)  {
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_sv_reass_free (vm, rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          /* keep the event vector allocated; just reset for reuse */
          vec_set_len (event_data, 0);
        }
    }

  /* unreachable: the process loops forever */
  return 0;
}
1307
/* *INDENT-OFF* */
/* Process node driving the periodic expiry walk above. */
VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
  .function = ip4_sv_reass_walk_expired,
  .type = VLIB_NODE_TYPE_PROCESS,
  .name = "ip4-sv-reassembly-expire-walk",
  .format_trace = format_ip4_sv_reass_trace,
  .n_errors = IP4_N_ERROR,
  .error_counters = ip4_error_counters,
};
/* *INDENT-ON* */
1318
1319 static u8 *
1320 format_ip4_sv_reass_key (u8 * s, va_list * args)
1321 {
1322   ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
1323   s =
1324     format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1325             key->fib_index, format_ip4_address, &key->src, format_ip4_address,
1326             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1327   return s;
1328 }
1329
1330 static u8 *
1331 format_ip4_sv_reass (u8 * s, va_list * args)
1332 {
1333   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1334   ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);
1335
1336   s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
1337               reass->id, format_ip4_sv_reass_key, &reass->key,
1338               reass->trace_op_counter);
1339
1340   vlib_buffer_t *b;
1341   u32 *bip;
1342   u32 counter = 0;
1343   vec_foreach (bip, reass->cached_buffers)
1344   {
1345     u32 bi = *bip;
1346     do
1347       {
1348         b = vlib_get_buffer (vm, bi);
1349         s = format (s, "  #%03u: bi: %u, ", counter, bi);
1350         ++counter;
1351         bi = b->next_buffer;
1352       }
1353     while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
1354   }
1355   return s;
1356 }
1357
1358 static clib_error_t *
1359 show_ip4_reass (vlib_main_t * vm,
1360                 unformat_input_t * input,
1361                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1362 {
1363   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1364
1365   vlib_cli_output (vm, "---------------------");
1366   vlib_cli_output (vm, "IP4 reassembly status");
1367   vlib_cli_output (vm, "---------------------");
1368   bool details = false;
1369   if (unformat (input, "details"))
1370     {
1371       details = true;
1372     }
1373
1374   u32 sum_reass_n = 0;
1375   ip4_sv_reass_t *reass;
1376   uword thread_index;
1377   const uword nthreads = vlib_num_workers () + 1;
1378   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1379     {
1380       ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1381       clib_spinlock_lock (&rt->lock);
1382       if (details)
1383         {
1384           /* *INDENT-OFF* */
1385           pool_foreach (reass, rt->pool) {
1386             vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
1387           }
1388           /* *INDENT-ON* */
1389         }
1390       sum_reass_n += rt->reass_n;
1391       clib_spinlock_unlock (&rt->lock);
1392     }
1393   vlib_cli_output (vm, "---------------------");
1394   vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
1395                    (long unsigned) sum_reass_n);
1396   vlib_cli_output (vm,
1397                    "Maximum configured concurrent shallow virtual IP4 reassemblies per worker-thread: %lu\n",
1398                    (long unsigned) rm->max_reass_n);
1399   vlib_cli_output (vm,
1400                    "Maximum configured amount of fragments per shallow "
1401                    "virtual IP4 reassembly: %lu\n",
1402                    (long unsigned) rm->max_reass_len);
1403   vlib_cli_output (vm,
1404                    "Maximum configured shallow virtual IP4 reassembly timeout: %lums\n",
1405                    (long unsigned) rm->timeout_ms);
1406   vlib_cli_output (vm,
1407                    "Maximum configured shallow virtual IP4 reassembly expire walk interval: %lums\n",
1408                    (long unsigned) rm->expire_walk_interval_ms);
1409   return 0;
1410 }
1411
/* *INDENT-OFF* */
/* CLI: "show ip4-sv-reassembly [details]" */
VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
    .path = "show ip4-sv-reassembly",
    .short_help = "show ip4-sv-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1419
#ifndef CLIB_MARCH_VARIANT
/* Public wrapper around the reference-counted feature enable/disable. */
vnet_api_error_t
ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index,
                                                  enable_disable);
}
#endif /* CLIB_MARCH_VARIANT */
1428
1429
/* Error counters local to the handoff nodes; these still use the legacy
 * string table rather than the .api-declared counters used above. */
#define foreach_ip4_sv_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_sv_reass_handoff_error
#undef _
    IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_sv_reass_handoff_error_t;

static char *ip4_sv_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_sv_reass_handoff_error
#undef _
};
1447
/* Per-packet trace record for the handoff nodes. */
typedef struct
{
  u32 next_worker_index;	/* worker thread the buffer was handed to */
} ip4_sv_reass_handoff_trace_t;
1452
1453 static u8 *
1454 format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
1455 {
1456   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1457   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1458   ip4_sv_reass_handoff_trace_t *t =
1459     va_arg (*args, ip4_sv_reass_handoff_trace_t *);
1460
1461   s =
1462     format (s, "ip4-sv-reassembly-handoff: next-worker %d",
1463             t->next_worker_index);
1464
1465   return s;
1466 }
1467
1468 always_inline uword
1469 ip4_sv_reass_handoff_node_inline (vlib_main_t * vm,
1470                                   vlib_node_runtime_t * node,
1471                                   vlib_frame_t * frame, bool is_feature)
1472 {
1473   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1474
1475   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1476   u32 n_enq, n_left_from, *from;
1477   u16 thread_indices[VLIB_FRAME_SIZE], *ti;
1478   u32 fq_index;
1479
1480   from = vlib_frame_vector_args (frame);
1481   n_left_from = frame->n_vectors;
1482   vlib_get_buffers (vm, from, bufs, n_left_from);
1483
1484   b = bufs;
1485   ti = thread_indices;
1486
1487   fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
1488
1489   while (n_left_from > 0)
1490     {
1491       ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
1492
1493       if (PREDICT_FALSE
1494           ((node->flags & VLIB_NODE_FLAG_TRACE)
1495            && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
1496         {
1497           ip4_sv_reass_handoff_trace_t *t =
1498             vlib_add_trace (vm, node, b[0], sizeof (*t));
1499           t->next_worker_index = ti[0];
1500         }
1501
1502       n_left_from -= 1;
1503       ti += 1;
1504       b += 1;
1505     }
1506   n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
1507                                          thread_indices, frame->n_vectors, 1);
1508
1509   if (n_enq < frame->n_vectors)
1510     vlib_node_increment_counter (vm, node->node_index,
1511                                  IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
1512                                  frame->n_vectors - n_enq);
1513   return frame->n_vectors;
1514 }
1515
/* Handoff entry point for the standalone reassembly node's frame queue. */
VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (vm, node, frame,
                                           false /* is_feature */ );
}
1523
1524
/* *INDENT-OFF* */
/* Registration for the standalone handoff node. */
VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
  .name = "ip4-sv-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1540
1541
1542 /* *INDENT-OFF* */
1543 VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
1544                                                     vlib_node_runtime_t *
1545                                                     node,
1546                                                     vlib_frame_t * frame)
1547 {
1548   return ip4_sv_reass_handoff_node_inline (vm, node, frame,
1549                                              true /* is_feature */ );
1550 }
1551 /* *INDENT-ON* */
1552
1553
/* *INDENT-OFF* */
/* Registration for the feature-arc handoff node. */
VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
  .name = "ip4-sv-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1569
1570 #ifndef CLIB_MARCH_VARIANT
1571 int
1572 ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1573 {
1574   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1575   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1576   if (is_enable)
1577     {
1578       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1579         {
1580           ++rm->feature_use_refcount_per_intf[sw_if_index];
1581           return vnet_feature_enable_disable ("ip4-unicast",
1582                                               "ip4-sv-reassembly-feature",
1583                                               sw_if_index, 1, 0, 0);
1584         }
1585       ++rm->feature_use_refcount_per_intf[sw_if_index];
1586     }
1587   else
1588     {
1589       if (rm->feature_use_refcount_per_intf[sw_if_index])
1590         --rm->feature_use_refcount_per_intf[sw_if_index];
1591       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1592         return vnet_feature_enable_disable ("ip4-unicast",
1593                                             "ip4-sv-reassembly-feature",
1594                                             sw_if_index, 0, 0, 0);
1595     }
1596   return 0;
1597 }
1598
/* Register an additional next node on the custom reassembly node and
 * return its next index (for use in ip.reass.next_index). */
uword
ip4_sv_reass_custom_register_next_node (uword node_index)
{
  return vlib_node_add_next (vlib_get_main (), ip4_sv_reass_custom_node.index,
                             node_index);
}
1605
1606 int
1607 ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
1608                                                 int is_enable)
1609 {
1610   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1611   vec_validate (rm->output_feature_use_refcount_per_intf, sw_if_index);
1612   if (is_enable)
1613     {
1614       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1615         {
1616           ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1617           return vnet_feature_enable_disable ("ip4-output",
1618                                               "ip4-sv-reassembly-output-feature",
1619                                               sw_if_index, 1, 0, 0);
1620         }
1621       ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1622     }
1623   else
1624     {
1625       if (rm->output_feature_use_refcount_per_intf[sw_if_index])
1626         --rm->output_feature_use_refcount_per_intf[sw_if_index];
1627       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1628         return vnet_feature_enable_disable ("ip4-output",
1629                                             "ip4-sv-reassembly-output-feature",
1630                                             sw_if_index, 0, 0, 0);
1631     }
1632   return 0;
1633 }
1634 #endif
1635
1636 /*
1637  * fd.io coding-style-patch-verification: ON
1638  *
1639  * Local Variables:
1640  * eval: (c-set-style "gnu")
1641  * End:
1642  */