/*
 * vppinfra: bihash walk cb typedef and continue/stop controls
 * [vpp.git] / src / vnet / ip / reass / ip4_sv_reass.c
 */
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Shallow Virtual Reassembly.
19  *
20  * This file contains the source code for IPv4 Shallow Virtual reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vnet/ip/ip4_to_ip6.h>
27 #include <vppinfra/fifo.h>
28 #include <vppinfra/bihash_16_8.h>
29 #include <vnet/ip/reass/ip4_sv_reass.h>
30
31 #define MSEC_PER_SEC 1000
32 #define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
33 #define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000      // 10 seconds default
34 #define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
35 #define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
36 #define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)
37
// Return codes for ip4_sv_reass_update()
typedef enum
{
  // fragment accepted (cached or reassembly completed)
  IP4_SV_REASS_RC_OK,
  // reassembly exceeded the configured max_reass_len
  IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS,
  // L4 ports could not be extracted from the first fragment
  IP4_SV_REASS_RC_UNSUPP_IP_PROTO,
} ip4_sv_reass_rc_t;
44
// Hash table key identifying one reassembly context.
// The layout is overlaid on the two 64-bit key words of a
// clib_bihash_kv_16_8_t (see ip4_sv_reass_kv_t below), so the struct must
// stay exactly 16 bytes.
typedef struct
{
  union
  {
    struct
    {
      // fib index of the RX interface (named xx_id historically)
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_sv_reass_key_t;
61
// Hash table value: locates the reassembly context (pool index) and the
// thread that owns it. Overlaid on the bihash value word.
typedef union
{
  struct
  {
    // index into the owning thread's per-thread reassembly pool
    u32 reass_index;
    // thread that owns this reassembly - used for worker handoff
    u32 thread_index;
  };
  u64 as_u64;
} ip4_sv_reass_val_t;
71
// Convenience overlay: typed key/value view of a clib_bihash_kv_16_8_t so
// lookups and inserts can be done with a single cast.
typedef union
{
  struct
  {
    ip4_sv_reass_key_t k;
    ip4_sv_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_sv_reass_kv_t;
81
// One in-progress shallow-virtual reassembly context.
typedef struct
{
  // hash table key
  ip4_sv_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // trace operation counter
  u32 trace_op_counter;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // buffer indexes of buffers in this reassembly in chronological order -
  // including overlaps and duplicate fragments
  u32 *cached_buffers;
  // set to true when this reassembly is completed
  bool is_complete;
  // ip protocol
  u8 ip_proto;
  // l4 src port
  u16 l4_src_port;
  // l4 dst port
  u16 l4_dst_port;
  // NOTE(review): never written in this file - presumably reserved for the
  // custom-next-node variant; confirm against callers before relying on it
  u32 next_index;
  // lru indexes
  u32 lru_prev;
  u32 lru_next;
} ip4_sv_reass_t;
110
// Per-thread reassembly state; protected by its own spinlock so each worker
// serializes against the expiry-walk process.
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_sv_reass_t *pool;
  // number of currently active reassemblies (pool occupancy)
  u32 reass_n;
  // monotonically increasing counter used to build reassembly ids
  u32 id_counter;
  clib_spinlock_t lock;
  // lru indexes (head = oldest, tail = most recently used); ~0 means empty
  u32 lru_first;
  u32 lru_last;

} ip4_sv_reass_per_thread_t;
122
// Global state for the ip4 shallow-virtual reassembly feature.
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout in seconds, derived from timeout_ms
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_sv_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_sv_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

} ip4_sv_reass_main_t;
155
extern ip4_sv_reass_main_t ip4_sv_reass_main;

#ifndef CLIB_MARCH_VARIANT
// single global instance; march variants only reference it via the extern
ip4_sv_reass_main_t ip4_sv_reass_main;
#endif /* CLIB_MARCH_VARIANT */
161
// Next-node indices shared by all three reassembly node variants.
typedef enum
{
  IP4_SV_REASSEMBLY_NEXT_INPUT,
  IP4_SV_REASSEMBLY_NEXT_DROP,
  IP4_SV_REASSEMBLY_NEXT_HANDOFF,
  IP4_SV_REASSEMBLY_N_NEXT,
} ip4_sv_reass_next_t;
169
// What happened to the traced packet.
typedef enum
{
  // fragment was cached pending completion of the reassembly
  REASS_FRAGMENT_CACHE,
  // first fragment arrived and completed the (shallow) reassembly
  REASS_FINISH,
  // fragment forwarded with L4 info copied from a completed reassembly
  REASS_FRAGMENT_FORWARD,
} ip4_sv_reass_trace_operation_e;
176
// Per-packet trace record; formatted by format_ip4_sv_reass_trace().
typedef struct
{
  ip4_sv_reass_trace_operation_e action;
  u32 reass_id;
  u32 op_id;
  u8 ip_proto;
  // ports are stored in network byte order
  u16 l4_src_port;
  u16 l4_dst_port;
} ip4_sv_reass_trace_t;
186
// forward declarations needed by frame-queue initialization below
extern vlib_node_registration_t ip4_sv_reass_node;
extern vlib_node_registration_t ip4_sv_reass_node_feature;
189
190 static u8 *
191 format_ip4_sv_reass_trace (u8 * s, va_list * args)
192 {
193   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
194   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
195   ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
196   s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
197   switch (t->action)
198     {
199     case REASS_FRAGMENT_CACHE:
200       s = format (s, "[cached]");
201       break;
202     case REASS_FINISH:
203       s =
204         format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
205                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
206                 clib_net_to_host_u16 (t->l4_dst_port));
207       break;
208     case REASS_FRAGMENT_FORWARD:
209       s =
210         format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
211                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
212                 clib_net_to_host_u16 (t->l4_dst_port));
213       break;
214     }
215   return s;
216 }
217
218 static void
219 ip4_sv_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
220                         ip4_sv_reass_main_t * rm, ip4_sv_reass_t * reass,
221                         u32 bi, ip4_sv_reass_trace_operation_e action,
222                         u32 ip_proto, u16 l4_src_port, u16 l4_dst_port)
223 {
224   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
225   ip4_sv_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
226   t->reass_id = reass->id;
227   t->action = action;
228   t->op_id = reass->trace_op_counter;
229   t->ip_proto = ip_proto;
230   t->l4_src_port = l4_src_port;
231   t->l4_dst_port = l4_dst_port;
232   ++reass->trace_op_counter;
233 #if 0
234   static u8 *s = NULL;
235   s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
236   printf ("%.*s\n", vec_len (s), s);
237   fflush (stdout);
238   vec_reset_length (s);
239 #endif
240 }
241
242
/**
 * @brief Tear down one reassembly context.
 *
 * Removes its key from the hash table, frees all cached buffers, unlinks the
 * context from the per-thread LRU list and returns it to the pool. Caller
 * must hold rt->lock.
 */
always_inline void
ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                   ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
{
  // remove the hash entry first so no new packet can find this context
  clib_bihash_kv_16_8_t kv;
  kv.key[0] = reass->key.as_u64[0];
  kv.key[1] = reass->key.as_u64[1];
  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
  // release all buffers cached while waiting for the first fragment
  vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
  vec_free (reass->cached_buffers);
  reass->cached_buffers = NULL;
  // unlink from the doubly-linked LRU list (~0 == no neighbor)
  if (~0 != reass->lru_prev)
    {
      ip4_sv_reass_t *lru_prev =
        pool_elt_at_index (rt->pool, reass->lru_prev);
      lru_prev->lru_next = reass->lru_next;
    }
  if (~0 != reass->lru_next)
    {
      ip4_sv_reass_t *lru_next =
        pool_elt_at_index (rt->pool, reass->lru_next);
      lru_next->lru_prev = reass->lru_prev;
    }
  // fix up list head/tail if this element was first/last
  if (rt->lru_first == reass - rt->pool)
    {
      rt->lru_first = reass->lru_next;
    }
  if (rt->lru_last == reass - rt->pool)
    {
      rt->lru_last = reass->lru_prev;
    }
  pool_put (rt->pool, reass);
  --rt->reass_n;
}
278
279 always_inline void
280 ip4_sv_reass_init (ip4_sv_reass_t * reass)
281 {
282   reass->cached_buffers = NULL;
283   reass->is_complete = false;
284 }
285
/**
 * @brief Look up the reassembly context for key @p kv, creating one if needed.
 *
 * On a hash hit owned by another thread, sets *do_handoff and returns NULL
 * (kv->v.thread_index identifies the owner). A stale hit (past timeout) is
 * freed and replaced. When the pool is full the LRU-oldest context is
 * recycled. Returns NULL (without handoff) only if the hash insert fails.
 * Caller must hold rt->lock.
 */
always_inline ip4_sv_reass_t *
ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                             ip4_sv_reass_per_thread_t * rt,
                             ip4_sv_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_sv_reass_t *reass = NULL;
  f64 now = vlib_time_now (rm->vlib_main);

  // search overwrites kv's value part on hit
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      // expired context: drop it and fall through to create a fresh one
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_sv_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  // at capacity: evict the least recently used context (list tail is at
  // lru_last per the append logic below)
  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
    {
      reass = pool_elt_at_index (rt->pool, rt->lru_last);
      ip4_sv_reass_free (vm, rm, rt, reass);
    }

  pool_get (rt->pool, reass);
  clib_memset (reass, 0, sizeof (*reass));
  // id encodes the owning thread so ids are globally unique
  reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
  ++rt->id_counter;
  ip4_sv_reass_init (reass);
  ++rt->reass_n;
  reass->lru_prev = reass->lru_next = ~0;

  // append to LRU tail
  if (~0 != rt->lru_last)
    {
      ip4_sv_reass_t *lru_last = pool_elt_at_index (rt->pool, rt->lru_last);
      reass->lru_prev = rt->lru_last;
      lru_last->lru_next = rt->lru_last = reass - rt->pool;
    }

  if (~0 == rt->lru_first)
    {
      rt->lru_first = rt->lru_last = reass - rt->pool;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  // publish in the hash; on failure undo everything
  if (clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 1))
    {
      ip4_sv_reass_free (vm, rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
357
/**
 * @brief Feed one fragment (buffer bi0) into the reassembly.
 *
 * A zero-offset (first) fragment supplies the protocol and L4 ports and
 * marks the reassembly complete; other fragments are only cached. Returns
 * IP4_SV_REASS_RC_UNSUPP_IP_PROTO when ports cannot be extracted, and
 * IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS when the cache exceeds max_reass_len.
 */
always_inline ip4_sv_reass_rc_t
ip4_sv_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
                     ip4_sv_reass_main_t * rm, ip4_sv_reass_per_thread_t * rt,
                     ip4_sv_reass_t * reass, u32 bi0)
{
  vlib_buffer_t *fb = vlib_get_buffer (vm, bi0);
  ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  if (0 == fragment_first)
    {
      // first fragment carries the L4 header - shallow reassembly is done
      reass->ip_proto = fip->protocol;
      // second argument selects src (1) vs dst (0) port - assumed from the
      // paired calls; confirm against ip4_get_port() definition
      reass->l4_src_port = ip4_get_port (fip, 1);
      reass->l4_dst_port = ip4_get_port (fip, 0);
      if (!reass->l4_src_port || !reass->l4_dst_port)
        return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
      reass->is_complete = true;
      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (vm, node, rm, reass, bi0, REASS_FINISH,
                                  reass->ip_proto, reass->l4_src_port,
                                  reass->l4_dst_port);
        }
    }
  // always cache; caller flushes cached buffers once is_complete is set
  vec_add1 (reass->cached_buffers, bi0);
  if (!reass->is_complete)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (vm, node, rm, reass, bi0,
                                  REASS_FRAGMENT_CACHE, ~0, ~0, ~0);
        }
      if (vec_len (reass->cached_buffers) > rm->max_reass_len)
        {
          rc = IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS;
        }
    }
  return rc;
}
398
/**
 * @brief Common dispatch function for all three SV reassembly nodes.
 *
 * Non-fragments pass straight through with L4 info stamped into the buffer
 * opaque. Fragments are matched to a reassembly context; once the first
 * fragment arrives, all cached fragments are flushed downstream carrying the
 * reassembly's protocol/port info.
 *
 * @param is_feature  node runs as an ip4-unicast feature (use
 *                    vnet_feature_next for the next node)
 * @param is_custom   next node is taken from the buffer's
 *                    ip.reass.next_index
 */
always_inline uword
ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                     vlib_frame_t * frame, bool is_feature, bool is_custom)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  // serializes this thread's pool/LRU against the expiry-walk process
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 = vlib_buffer_get_current (b0);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a regular packet - no fragmentation
              vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
              vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
              vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
              next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
              goto packet_enqueue;
            }
          // sanity-check fragment geometry before touching any state
          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
          if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              goto packet_enqueue;
            }
          // build the 16-byte lookup key: fib index + src in word 0,
          // dst + frag id + protocol in word 1
          ip4_sv_reass_kv_t kv;
          u8 do_handoff = 0;

          kv.k.as_u64[0] =
            (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                           vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
            (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] =
            (u64) ip0->dst_address.
            as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

          ip4_sv_reass_t *reass =
            ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

          // reassembly is owned by another worker - hand the packet off
          if (PREDICT_FALSE (do_handoff))
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.thread_index;
              goto packet_enqueue;
            }

          if (!reass)
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
              goto packet_enqueue;
            }

          // first fragment already seen - forward immediately with its info
          if (reass->is_complete)
            {
              vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
              vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
              vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
              next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
              error0 = IP4_ERROR_NONE;
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (vm, node, rm, reass, bi0,
                                          REASS_FRAGMENT_FORWARD,
                                          reass->ip_proto,
                                          reass->l4_src_port,
                                          reass->l4_dst_port);
                }
              goto packet_enqueue;
            }

          ip4_sv_reass_rc_t rc =
            ip4_sv_reass_update (vm, node, rm, rt, reass, bi0);
          switch (rc)
            {
            case IP4_SV_REASS_RC_OK:
              /* nothing to do here */
              break;
            case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
              vlib_node_increment_counter (vm, node->node_index,
                                           IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                           1);
              ip4_sv_reass_free (vm, rm, rt, reass);
              goto next_packet;
              break;
            case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
              // NOTE(review): this counts against FRAGMENT_CHAIN_TOO_LONG,
              // same as the case above - looks like a copy-paste; a
              // dedicated unsupported-proto counter would be more accurate.
              // Confirm against ip4 error definitions before changing.
              vlib_node_increment_counter (vm, node->node_index,
                                           IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                           1);
              ip4_sv_reass_free (vm, rm, rt, reass);
              goto next_packet;
              break;
            }
          // reassembly just completed - flush every cached buffer downstream
          if (reass->is_complete)
            {
              u32 idx;
              vec_foreach_index (idx, reass->cached_buffers)
              {
                u32 bi0 = vec_elt (reass->cached_buffers, idx);
                vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
                u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                if (is_feature)
                  {
                    vnet_feature_next (&next0, b0);
                  }
                if (is_custom)
                  {
                    next0 = vnet_buffer (b0)->ip.reass.next_index;
                  }
                // current frame may fill up mid-flush; start a new one
                if (0 == n_left_to_next)
                  {
                    vlib_put_next_frame (vm, node, next_index,
                                         n_left_to_next);
                    vlib_get_next_frame (vm, node, next_index, to_next,
                                         n_left_to_next);
                  }
                to_next[0] = bi0;
                to_next += 1;
                n_left_to_next -= 1;
                vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
                vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
                vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
                if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                  {
                    ip4_sv_reass_add_trace (vm, node, rm, reass, bi0,
                                            REASS_FRAGMENT_FORWARD,
                                            reass->ip_proto,
                                            reass->l4_src_port,
                                            reass->l4_dst_port);
                  }
                vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                                 to_next, n_left_to_next, bi0,
                                                 next0);
              }
              _vec_len (reass->cached_buffers) = 0;     // buffers are owned by frame now
            }
          goto next_packet;

        packet_enqueue:
          b0->error = node->errors[error0];

          to_next[0] = bi0;
          to_next += 1;
          n_left_to_next -= 1;
          if (is_feature && IP4_ERROR_NONE == error0)
            {
              b0 = vlib_get_buffer (vm, bi0);
              vnet_feature_next (&next0, b0);
            }
          if (is_custom)
            {
              next0 = vnet_buffer (b0)->ip.reass.next_index;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
590
// error strings indexed by ip4_error_t; shared by all SV reassembly nodes
static char *ip4_sv_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
596
// non-feature, non-custom variant (used after handoff / direct graph arcs)
VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
                                  vlib_node_runtime_t * node,
                                  vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
                              false /* is_custom */ );
}
604
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
    .name = "ip4-sv-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
622
// feature-arc variant: next node comes from vnet_feature_next()
VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
                              false /* is_custom */ );
}
630
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
    .name = "ip4-sv-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                // feature variant hands off to its own handoff node
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
647
/* *INDENT-OFF* */
// register the feature on the ip4-unicast arc, ahead of ip4-lookup
VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-sv-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
656
/* *INDENT-OFF* */
// custom-next variant: forwarded packets use the caller-supplied
// ip.reass.next_index instead of these static arcs
VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
    .name = "ip4-sv-reassembly-custom-next",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
674
// custom variant: next node taken from the buffer's ip.reass.next_index
VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
                                         vlib_node_runtime_t * node,
                                         vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
                              true /* is_custom */ );
}
682
683 #ifndef CLIB_MARCH_VARIANT
684 always_inline u32
685 ip4_sv_reass_get_nbuckets ()
686 {
687   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
688   u32 nbuckets;
689   u8 i;
690
691   nbuckets = (u32) (rm->max_reass_n / IP4_SV_REASS_HT_LOAD_FACTOR);
692
693   for (i = 0; i < 31; i++)
694     if ((1 << i) >= nbuckets)
695       break;
696   nbuckets = 1 << i;
697
698   return nbuckets;
699 }
700 #endif /* CLIB_MARCH_VARIANT */
701
// process events understood by ip4-sv-reassembly-expire-walk
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_sv_reass_event_t;
706
// context passed through the bihash walk while rehashing into a bigger table
typedef struct
{
  // set to 1 if any insert into the new table fails
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;
712
713 #ifndef CLIB_MARCH_VARIANT
714 static int
715 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
716 {
717   ip4_rehash_cb_ctx *ctx = _ctx;
718   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
719     {
720       ctx->failure = 1;
721     }
722   return (BIHASH_WALK_CONTINUE);
723 }
724
725 static void
726 ip4_sv_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
727                          u32 max_reassembly_length,
728                          u32 expire_walk_interval_ms)
729 {
730   ip4_sv_reass_main.timeout_ms = timeout_ms;
731   ip4_sv_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
732   ip4_sv_reass_main.max_reass_n = max_reassemblies;
733   ip4_sv_reass_main.max_reass_len = max_reassembly_length;
734   ip4_sv_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
735 }
736
/**
 * @brief API handler: apply new reassembly parameters.
 *
 * Signals the expiry-walk process about the config change and, if the new
 * max_reass_n requires a larger hash table, rehashes all existing entries
 * into a bigger one. Returns -1 if the rehash fails (old table is kept).
 */
vnet_api_error_t
ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
                  u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  // capture the bucket count implied by the OLD config before overwriting it
  u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
  ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
                           max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
                             ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
                             IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_sv_reass_get_nbuckets ();
  // only grow - shrinking would require dropping entries
  if (ip4_sv_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
                             new_nbuckets * 1024);
      // copy every live entry into the new table
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_sv_reass_main.hash,
                                               ip4_rehash_cb, &ctx);
      if (ctx.failure)
        {
          // keep the old table intact on failure
          clib_bihash_free_16_8 (&new_hash);
          return -1;
        }
      else
        {
          clib_bihash_free_16_8 (&ip4_sv_reass_main.hash);
          clib_memcpy_fast (&ip4_sv_reass_main.hash, &new_hash,
                            sizeof (ip4_sv_reass_main.hash));
          clib_bihash_copied (&ip4_sv_reass_main.hash, &new_hash);
        }
    }
  return 0;
}
774
775 vnet_api_error_t
776 ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
777                   u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
778 {
779   *timeout_ms = ip4_sv_reass_main.timeout_ms;
780   *max_reassemblies = ip4_sv_reass_main.max_reass_n;
781   *max_reassembly_length = ip4_sv_reass_main.max_reass_len;
782   *expire_walk_interval_ms = ip4_sv_reass_main.expire_walk_interval_ms;
783   return 0;
784 }
785
/**
 * @brief Plugin init: set defaults, allocate per-thread state, create the
 * hash table and the worker handoff frame queues.
 *
 * Note the ordering: parameters must be set before ip4_sv_reass_get_nbuckets()
 * is called, since the bucket count derives from max_reass_n.
 */
static clib_error_t *
ip4_sv_reass_init_function (vlib_main_t * vm)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  // one slot per worker plus the main thread
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_sv_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
    rt->lru_first = rt->lru_last = ~0;
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_sv_reass_expire_node_idx = node->index;

  ip4_sv_reass_set_params (IP4_SV_REASS_TIMEOUT_DEFAULT_MS,
                           IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT,
                           IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
                           IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_sv_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  // frame queues used by the handoff nodes to move packets between workers
  rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);

  rm->feature_use_refcount_per_intf = NULL;

  return error;
}
830
831 VLIB_INIT_FUNCTION (ip4_sv_reass_init_function);
832 #endif /* CLIB_MARCH_VARIANT */
833
/**
 * @brief Process node: periodically free timed-out reassemblies.
 *
 * Wakes every expire_walk_interval_ms (or on IP4_EVENT_CONFIG_CHANGED),
 * then scans every thread's pool under that thread's lock and frees any
 * context whose last_heard is older than the configured timeout.
 */
static uword
ip4_sv_reass_walk_expired (vlib_main_t * vm,
                           vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
                                            (f64)
                                            rm->expire_walk_interval_ms /
                                            (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
        {
        case ~0:                /* no events => timeout */
          /* nothing to do here */
          break;
        case IP4_EVENT_CONFIG_CHANGED:
          // new interval takes effect on the next wait above
          break;
        default:
          clib_warning ("BUG: event type 0x%wx", event_type);
          break;
        }
      f64 now = vlib_time_now (vm);

      ip4_sv_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
        {
          ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
          clib_spinlock_lock (&rt->lock);

          // collect first, free second - freeing while iterating the pool
          // would invalidate the traversal
          vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
          int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_sv_reass_free (vm, rm, rt, reass);
          }
          /* *INDENT-ON* */

          clib_spinlock_unlock (&rt->lock);
        }

      vec_free (pool_indexes_to_free);
      if (event_data)
        {
          _vec_len (event_data) = 0;
        }
    }

  // unreachable - process loops forever
  return 0;
}
904
/* *INDENT-OFF* */
// NOTE(review): .format_trace on a process node is never used - looks like
// a copy-paste from the data-plane registrations; harmless but confirm
VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
    .function = ip4_sv_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-sv-reassembly-expire-walk",
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,

};
/* *INDENT-ON* */
916
917 static u8 *
918 format_ip4_sv_reass_key (u8 * s, va_list * args)
919 {
920   ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
921   s =
922     format (s,
923             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
924             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
925             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
926   return s;
927 }
928
929 static u8 *
930 format_ip4_sv_reass (u8 * s, va_list * args)
931 {
932   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
933   ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);
934
935   s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
936               reass->id, format_ip4_sv_reass_key, &reass->key,
937               reass->trace_op_counter);
938
939   vlib_buffer_t *b;
940   u32 *bip;
941   u32 counter = 0;
942   vec_foreach (bip, reass->cached_buffers)
943   {
944     u32 bi = *bip;
945     do
946       {
947         b = vlib_get_buffer (vm, bi);
948         s = format (s, "  #%03u: bi: %u, ", counter, bi);
949         ++counter;
950         bi = b->next_buffer;
951       }
952     while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
953   }
954   return s;
955 }
956
957 static clib_error_t *
958 show_ip4_reass (vlib_main_t * vm,
959                 unformat_input_t * input,
960                 CLIB_UNUSED (vlib_cli_command_t * lmd))
961 {
962   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
963
964   vlib_cli_output (vm, "---------------------");
965   vlib_cli_output (vm, "IP4 reassembly status");
966   vlib_cli_output (vm, "---------------------");
967   bool details = false;
968   if (unformat (input, "details"))
969     {
970       details = true;
971     }
972
973   u32 sum_reass_n = 0;
974   ip4_sv_reass_t *reass;
975   uword thread_index;
976   const uword nthreads = vlib_num_workers () + 1;
977   for (thread_index = 0; thread_index < nthreads; ++thread_index)
978     {
979       ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
980       clib_spinlock_lock (&rt->lock);
981       if (details)
982         {
983           /* *INDENT-OFF* */
984           pool_foreach (reass, rt->pool, {
985             vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
986           });
987           /* *INDENT-ON* */
988         }
989       sum_reass_n += rt->reass_n;
990       clib_spinlock_unlock (&rt->lock);
991     }
992   vlib_cli_output (vm, "---------------------");
993   vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
994                    (long unsigned) sum_reass_n);
995   vlib_cli_output (vm,
996                    "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
997                    (long unsigned) rm->max_reass_n);
998   return 0;
999 }
1000
/* *INDENT-OFF* */
/* CLI command: "show ip4-sv-reassembly [details]" -> show_ip4_reass(). */
VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
    .path = "show ip4-sv-reassembly",
    .short_help = "show ip4-sv-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1008
1009 #ifndef CLIB_MARCH_VARIANT
1010 vnet_api_error_t
1011 ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1012 {
1013   return vnet_feature_enable_disable ("ip4-unicast",
1014                                       "ip4-sv-reassembly-feature",
1015                                       sw_if_index, enable_disable, 0, 0);
1016 }
1017 #endif /* CLIB_MARCH_VARIANT */
1018
1019
/* Error table for the handoff nodes: each _() entry expands to one enum
 * symbol plus its counter description string. */
#define foreach_ip4_sv_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


/* Error codes generated from the foreach list above. */
typedef enum
{
#define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_sv_reass_handoff_error
#undef _
    IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_sv_reass_handoff_error_t;

/* Counter description strings, index-aligned with the enum above. */
static char *ip4_sv_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_sv_reass_handoff_error
#undef _
};
1037
/* Per-packet trace record for the handoff nodes. */
typedef struct
{
  u32 next_worker_index;	/* worker thread the buffer is handed to */
} ip4_sv_reass_handoff_trace_t;
1042
1043 static u8 *
1044 format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
1045 {
1046   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1047   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1048   ip4_sv_reass_handoff_trace_t *t =
1049     va_arg (*args, ip4_sv_reass_handoff_trace_t *);
1050
1051   s =
1052     format (s, "ip4-sv-reassembly-handoff: next-worker %d",
1053             t->next_worker_index);
1054
1055   return s;
1056 }
1057
1058 always_inline uword
1059 ip4_sv_reass_handoff_node_inline (vlib_main_t * vm,
1060                                   vlib_node_runtime_t * node,
1061                                   vlib_frame_t * frame, bool is_feature)
1062 {
1063   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1064
1065   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1066   u32 n_enq, n_left_from, *from;
1067   u16 thread_indices[VLIB_FRAME_SIZE], *ti;
1068   u32 fq_index;
1069
1070   from = vlib_frame_vector_args (frame);
1071   n_left_from = frame->n_vectors;
1072   vlib_get_buffers (vm, from, bufs, n_left_from);
1073
1074   b = bufs;
1075   ti = thread_indices;
1076
1077   fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
1078
1079   while (n_left_from > 0)
1080     {
1081       ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
1082
1083       if (PREDICT_FALSE
1084           ((node->flags & VLIB_NODE_FLAG_TRACE)
1085            && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
1086         {
1087           ip4_sv_reass_handoff_trace_t *t =
1088             vlib_add_trace (vm, node, b[0], sizeof (*t));
1089           t->next_worker_index = ti[0];
1090         }
1091
1092       n_left_from -= 1;
1093       ti += 1;
1094       b += 1;
1095     }
1096   n_enq =
1097     vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
1098                                    frame->n_vectors, 1);
1099
1100   if (n_enq < frame->n_vectors)
1101     vlib_node_increment_counter (vm, node->node_index,
1102                                  IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
1103                                  frame->n_vectors - n_enq);
1104   return frame->n_vectors;
1105 }
1106
/* Non-feature (custom/API-driven) variant of the reassembly handoff node;
 * selects the non-feature frame queue via is_feature = false. */
VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (vm, node, frame,
                                           false /* is_feature */ );
}
1114
1115
/* *INDENT-OFF* */
/* Registration of the non-feature handoff node; its only next node is
 * error-drop, used for congestion drops. */
VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
  .name = "ip4-sv-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1131
1132
/* *INDENT-OFF* */
/* Feature-arc variant of the reassembly handoff node; is_feature = true
 * selects the feature frame queue in the inline body. */
VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
                                                    vlib_node_runtime_t *
                                                    node,
                                                    vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (vm, node, frame,
                                             true /* is_feature */ );
}
/* *INDENT-ON* */
1143
1144
/* *INDENT-OFF* */
/* Registration of the feature-arc handoff node; its only next node is
 * error-drop, used for congestion drops. */
VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
  .name = "ip4-sv-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1160
1161 #ifndef CLIB_MARCH_VARIANT
1162 int
1163 ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1164 {
1165   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1166   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1167   if (is_enable)
1168     {
1169       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1170         {
1171           ++rm->feature_use_refcount_per_intf[sw_if_index];
1172           return vnet_feature_enable_disable ("ip4-unicast",
1173                                               "ip4-sv-reassembly-feature",
1174                                               sw_if_index, 1, 0, 0);
1175         }
1176       ++rm->feature_use_refcount_per_intf[sw_if_index];
1177     }
1178   else
1179     {
1180       --rm->feature_use_refcount_per_intf[sw_if_index];
1181       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1182         return vnet_feature_enable_disable ("ip4-unicast",
1183                                             "ip4-sv-reassembly-feature",
1184                                             sw_if_index, 0, 0, 0);
1185     }
1186   return 0;
1187 }
1188
/* Register a caller-supplied graph node as a next node of the custom SV
 * reassembly node; returns the next-index callers store in buffer metadata. */
uword
ip4_sv_reass_custom_register_next_node (uword node_index)
{
  return vlib_node_add_next (vlib_get_main (), ip4_sv_reass_custom_node.index,
                             node_index);
}
1195 #endif
1196
1197 /*
1198  * fd.io coding-style-patch-verification: ON
1199  *
1200  * Local Variables:
1201  * eval: (c-set-style "gnu")
1202  * End:
1203  */