ip: reassembly: avoid reading truncated L4 headers
[vpp.git] / src / vnet / ip / reass / ip4_sv_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Shallow Virtual Reassembly.
19  *
20  * This file contains the source code for IPv4 Shallow Virtual reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vnet/ip/ip4_to_ip6.h>
27 #include <vppinfra/fifo.h>
28 #include <vppinfra/bihash_16_8.h>
29 #include <vnet/ip/reass/ip4_sv_reass.h>
30
/* milliseconds per second - used when converting configured ms timeouts */
#define MSEC_PER_SEC 1000
/* default reassembly context timeout */
#define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000	// 10 seconds default
/* default cap on concurrent reassemblies per thread */
#define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
/* default cap on cached fragments per reassembly */
#define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
/* bihash sizing: target hash table load factor */
#define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)
37
/* return codes of ip4_sv_reass_update () */
typedef enum
{
  IP4_SV_REASS_RC_OK,			/* fragment accepted */
  IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS,	/* cached fragments exceed max_reass_len */
  IP4_SV_REASS_RC_UNSUPP_IP_PROTO,	/* L4 ports could not be extracted */
} ip4_sv_reass_rc_t;
44
/*
 * 16-byte bihash key identifying one reassembly: fib index, IPv4
 * source/destination, fragment id and protocol (RFC 791 fragment tuple).
 * The anonymous union lets the key be copied as two u64s.
 */
typedef struct
{
  union
  {
    struct
    {
      u32 fib_index;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;		/* padding to keep the key 16 bytes */
    };
    u64 as_u64[2];
  };
} ip4_sv_reass_key_t;
61
/*
 * 8-byte bihash value: which thread owns the reassembly and its index in
 * that thread's pool.
 */
typedef union
{
  struct
  {
    u32 reass_index;		/* index into owning thread's rt->pool */
    u32 thread_index;		/* owning worker thread */
  };
  u64 as_u64;
} ip4_sv_reass_val_t;
71
/* convenience overlay of key+value onto a clib_bihash_kv_16_8_t */
typedef union
{
  struct
  {
    ip4_sv_reass_key_t k;
    ip4_sv_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_sv_reass_kv_t;
81
/* state of one in-progress shallow-virtual reassembly */
typedef struct
{
  // hash table key
  ip4_sv_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // trace operation counter
  u32 trace_op_counter;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // buffer indexes of buffers in this reassembly in chronological order -
  // including overlaps and duplicate fragments
  u32 *cached_buffers;
  // set to true when this reassembly is completed
  bool is_complete;
  // ip protocol
  u8 ip_proto;
  // icmp type (ICMP) or tcp flags (TCP), copied from the first fragment
  u8 icmp_type_or_tcp_flags;
  // tcp ack number from the first fragment (TCP only)
  u32 tcp_ack_number;
  // tcp seq number from the first fragment (TCP only)
  u32 tcp_seq_number;
  // l4 src port
  u16 l4_src_port;
  // l4 dst port
  u16 l4_dst_port;
  // next node index to use when forwarding (custom-next-node mode)
  u32 next_index;
  // lru indexes
  u32 lru_prev;
  u32 lru_next;
} ip4_sv_reass_t;
113
/* per-worker reassembly state; guarded by its own spinlock */
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_sv_reass_t *pool;
  // number of reassemblies currently in the pool
  u32 reass_n;
  // monotonically increasing source for ip4_sv_reass_t::id
  u32 id_counter;
  clib_spinlock_t lock;
  // lru indexes
  u32 lru_first;
  u32 lru_last;

} ip4_sv_reass_per_thread_t;
125
/* global state of the shallow-virtual reassembly feature */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_sv_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_sv_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

  // reference count for enabling/disabling input feature - per interface
  u32 *feature_use_refcount_per_intf;

  // reference count for enabling/disabling output feature - per interface
  u32 *output_feature_use_refcount_per_intf;

} ip4_sv_reass_main_t;
161
162 extern ip4_sv_reass_main_t ip4_sv_reass_main;
163
164 #ifndef CLIB_MARCH_VARIANT
165 ip4_sv_reass_main_t ip4_sv_reass_main;
166 #endif /* CLIB_MARCH_VARIANT */
167
/* next-node indices of the sv reassembly graph nodes */
typedef enum
{
  IP4_SV_REASSEMBLY_NEXT_INPUT,		/* continue normal ip4 processing */
  IP4_SV_REASSEMBLY_NEXT_DROP,		/* malformed / limit exceeded */
  IP4_SV_REASSEMBLY_NEXT_HANDOFF,	/* reassembly owned by another thread */
  IP4_SV_REASSEMBLY_N_NEXT,
} ip4_sv_reass_next_t;
175
/* operation recorded in a packet trace entry */
typedef enum
{
  REASS_FRAGMENT_CACHE,		/* fragment cached, waiting for first fragment */
  REASS_FINISH,			/* first fragment seen, reassembly complete */
  REASS_FRAGMENT_FORWARD,	/* fragment forwarded using completed state */
  REASS_PASSTHROUGH,		/* packet was not fragmented at all */
} ip4_sv_reass_trace_operation_e;
183
/* per-packet trace record emitted by the sv reassembly nodes */
typedef struct
{
  ip4_sv_reass_trace_operation_e action;
  u32 reass_id;			/* id of the reassembly involved (if any) */
  u32 op_id;			/* per-reassembly operation sequence number */
  u8 ip_proto;
  u16 l4_src_port;		/* network byte order */
  u16 l4_dst_port;		/* network byte order */
  int l4_layer_truncated;	/* L4 header did not fit in the datagram */
} ip4_sv_reass_trace_t;
194
195 extern vlib_node_registration_t ip4_sv_reass_node;
196 extern vlib_node_registration_t ip4_sv_reass_node_feature;
197
198 static u8 *
199 format_ip4_sv_reass_trace (u8 * s, va_list * args)
200 {
201   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
202   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
203   ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
204   if (REASS_PASSTHROUGH != t->action)
205     {
206       s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
207     }
208   switch (t->action)
209     {
210     case REASS_FRAGMENT_CACHE:
211       s = format (s, "[cached]");
212       break;
213     case REASS_FINISH:
214       s =
215         format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
216                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
217                 clib_net_to_host_u16 (t->l4_dst_port));
218       break;
219     case REASS_FRAGMENT_FORWARD:
220       s =
221         format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
222                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
223                 clib_net_to_host_u16 (t->l4_dst_port));
224       break;
225     case REASS_PASSTHROUGH:
226       s = format (s, "[not-fragmented]");
227       break;
228     }
229   if (t->l4_layer_truncated)
230     {
231       s = format (s, " [l4-layer-truncated]");
232     }
233   return s;
234 }
235
236 static void
237 ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
238                         ip4_sv_reass_t *reass, u32 bi,
239                         ip4_sv_reass_trace_operation_e action, u32 ip_proto,
240                         u16 l4_src_port, u16 l4_dst_port,
241                         int l4_layer_truncated)
242 {
243   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
244   if (pool_is_free_index
245       (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
246     {
247       // this buffer's trace is gone
248       b->flags &= ~VLIB_BUFFER_IS_TRACED;
249       return;
250     }
251   ip4_sv_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
252   if (reass)
253     {
254       t->reass_id = reass->id;
255       t->op_id = reass->trace_op_counter;
256       ++reass->trace_op_counter;
257     }
258   t->action = action;
259   t->ip_proto = ip_proto;
260   t->l4_src_port = l4_src_port;
261   t->l4_dst_port = l4_dst_port;
262   t->l4_layer_truncated = l4_layer_truncated;
263 #if 0
264   static u8 *s = NULL;
265   s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
266   printf ("%.*s\n", vec_len (s), s);
267   fflush (stdout);
268   vec_reset_length (s);
269 #endif
270 }
271
272
/**
 * @brief Tear down one reassembly context: remove its hash entry, free all
 * cached fragment buffers, unlink it from the per-thread LRU list and
 * return it to the pool.
 *
 * NOTE(review): callers appear to hold rt->lock when invoking this
 * (the node path locks it before processing) - confirm for all call sites.
 */
always_inline void
ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                   ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
{
  /* delete the bihash entry keyed by this reassembly */
  clib_bihash_kv_16_8_t kv;
  kv.key[0] = reass->key.as_u64[0];
  kv.key[1] = reass->key.as_u64[1];
  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
  /* release buffers cached while waiting for the first fragment */
  vlib_buffer_free (vm, reass->cached_buffers,
                    vec_len (reass->cached_buffers));
  vec_free (reass->cached_buffers);
  reass->cached_buffers = NULL;
  /* unlink from the doubly-linked LRU list; ~0 marks a missing neighbour */
  if (~0 != reass->lru_prev)
    {
      ip4_sv_reass_t *lru_prev =
        pool_elt_at_index (rt->pool, reass->lru_prev);
      lru_prev->lru_next = reass->lru_next;
    }
  if (~0 != reass->lru_next)
    {
      ip4_sv_reass_t *lru_next =
        pool_elt_at_index (rt->pool, reass->lru_next);
      lru_next->lru_prev = reass->lru_prev;
    }
  /* fix up list head/tail if this element was first/last */
  if (rt->lru_first == reass - rt->pool)
    {
      rt->lru_first = reass->lru_next;
    }
  if (rt->lru_last == reass - rt->pool)
    {
      rt->lru_last = reass->lru_prev;
    }
  pool_put (rt->pool, reass);
  --rt->reass_n;
}
308
309 always_inline void
310 ip4_sv_reass_init (ip4_sv_reass_t * reass)
311 {
312   reass->cached_buffers = NULL;
313   reass->is_complete = false;
314 }
315
/**
 * @brief Look up the reassembly matching key @a kv, or create a new one.
 *
 * If the key exists but is owned by another thread, sets *do_handoff and
 * returns NULL. A matching but timed-out reassembly is freed and replaced.
 * When the per-thread limit is reached, the least-recently-used context is
 * evicted. On success the reassembly's last_heard is refreshed and @a kv's
 * value half is filled with the (reass_index, thread_index) pair.
 *
 * @return the reassembly, or NULL on handoff or bihash insert failure.
 */
always_inline ip4_sv_reass_t *
ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
                             ip4_sv_reass_per_thread_t * rt,
                             ip4_sv_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_sv_reass_t *reass = NULL;
  f64 now = vlib_time_now (vm);

  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      /* entry exists - but it may belong to another worker */
      if (vm->thread_index != kv->v.thread_index)
        {
          *do_handoff = 1;
          return NULL;
        }
      reass = pool_elt_at_index (rt->pool, kv->v.reass_index);

      /* stale context - drop it and fall through to create a fresh one */
      if (now > reass->last_heard + rm->timeout)
        {
          ip4_sv_reass_free (vm, rm, rt, reass);
          reass = NULL;
        }
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  /* at capacity - evict the least recently used reassembly */
  if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
    {
      reass = pool_elt_at_index (rt->pool, rt->lru_first);
      ip4_sv_reass_free (vm, rm, rt, reass);
    }

  pool_get (rt->pool, reass);
  clib_memset (reass, 0, sizeof (*reass));
  /* id unique across threads: thread index scaled out of counter range */
  reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
  ++rt->id_counter;
  ip4_sv_reass_init (reass);
  ++rt->reass_n;
  reass->lru_prev = reass->lru_next = ~0;

  /* append to the LRU tail */
  if (~0 != rt->lru_last)
    {
      ip4_sv_reass_t *lru_last = pool_elt_at_index (rt->pool, rt->lru_last);
      reass->lru_prev = rt->lru_last;
      lru_last->lru_next = rt->lru_last = reass - rt->pool;
    }

  /* first element - it is both head and tail */
  if (~0 == rt->lru_first)
    {
      rt->lru_first = rt->lru_last = reass - rt->pool;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.thread_index = vm->thread_index;
  reass->last_heard = now;

  /* nonzero return means the insert failed - undo the allocation */
  if (clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 1))
    {
      ip4_sv_reass_free (vm, rm, rt, reass);
      reass = NULL;
    }

  return reass;
}
386
/**
 * @brief Feed one fragment into a shallow-virtual reassembly.
 *
 * If this is the first fragment (offset 0), harvest the L4 information
 * (ports, and TCP flags/seq/ack or ICMP type) and mark the reassembly
 * complete. The buffer is always appended to cached_buffers; ownership of
 * cached buffers stays with the reassembly until forwarded or freed.
 *
 * @return IP4_SV_REASS_RC_OK on success,
 *         IP4_SV_REASS_RC_UNSUPP_IP_PROTO if ports could not be extracted,
 *         IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS if the cache limit is hit.
 */
always_inline ip4_sv_reass_rc_t
ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
                     ip4_sv_reass_main_t *rm, ip4_header_t *ip0,
                     ip4_sv_reass_t *reass, u32 bi0)
{
  vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
  ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
  const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
  if (0 == fragment_first)
    {
      reass->ip_proto = ip0->protocol;
      reass->l4_src_port = ip4_get_port (ip0, 1);
      reass->l4_dst_port = ip4_get_port (ip0, 0);
      if (!reass->l4_src_port || !reass->l4_dst_port)
        return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
      /* NOTE(review): (ip0 + 1) assumes a 20-byte IP header with no
       * options when locating the L4 header - confirm this matches the
       * guarantees of the callers / ip4_get_port above */
      if (IP_PROTOCOL_TCP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags = ((tcp_header_t *) (ip0 + 1))->flags;
          reass->tcp_ack_number = ((tcp_header_t *) (ip0 + 1))->ack_number;
          reass->tcp_seq_number = ((tcp_header_t *) (ip0 + 1))->seq_number;
        }
      else if (IP_PROTOCOL_ICMP == reass->ip_proto)
        {
          reass->icmp_type_or_tcp_flags =
            ((icmp46_header_t *) (ip0 + 1))->type;
        }
      reass->is_complete = true;
      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FINISH, reass->ip_proto,
            reass->l4_src_port, reass->l4_dst_port,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
    }
  /* cache the buffer in arrival order (duplicates/overlaps included) */
  vec_add1 (reass->cached_buffers, bi0);
  if (!reass->is_complete)
    {
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (
            vm, node, reass, bi0, REASS_FRAGMENT_CACHE, ~0, ~0, ~0,
            vnet_buffer (b0)->ip.reass.l4_layer_truncated);
        }
      if (vec_len (reass->cached_buffers) > rm->max_reass_len)
        {
          rc = IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS;
        }
    }
  return rc;
}
439
440 always_inline int
441 l4_layer_truncated (ip4_header_t *ip)
442 {
443   static const int l4_layer_length[256] = {
444     [IP_PROTOCOL_TCP] = sizeof (tcp_header_t),
445     [IP_PROTOCOL_UDP] = sizeof (udp_header_t),
446     [IP_PROTOCOL_ICMP] = sizeof (icmp46_header_t),
447   };
448
449   return ((u8 *) ip + ip4_header_bytes (ip) + l4_layer_length[ip->protocol] >
450           (u8 *) ip + clib_net_to_host_u16 (ip->length));
451 }
452
453 always_inline uword
454 ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
455                      vlib_frame_t * frame, bool is_feature,
456                      bool is_output_feature, bool is_custom)
457 {
458   u32 *from = vlib_frame_vector_args (frame);
459   u32 n_left_from, n_left_to_next, *to_next, next_index;
460   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
461   ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
462   clib_spinlock_lock (&rt->lock);
463
464   n_left_from = frame->n_vectors;
465   next_index = node->cached_next_index;
466
467   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
468   vlib_get_buffers (vm, from, bufs, n_left_from);
469   u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
470   b = bufs;
471
472   /* optimistic case first - no fragments */
473   while (n_left_from >= 2)
474     {
475       vlib_buffer_t *b0, *b1;
476       u32 next0, next1;
477       b0 = *b;
478       b++;
479       b1 = *b;
480       b++;
481
482       /* Prefetch next iteration. */
483       if (PREDICT_TRUE (n_left_from >= 4))
484         {
485           vlib_buffer_t *p2, *p3;
486
487           p2 = *b;
488           p3 = *(b + 1);
489
490           vlib_prefetch_buffer_header (p2, LOAD);
491           vlib_prefetch_buffer_header (p3, LOAD);
492
493           clib_prefetch_load (p2->data);
494           clib_prefetch_load (p3->data);
495         }
496
497       ip4_header_t *ip0 =
498         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
499                                      (is_output_feature ? 1 : 0) *
500                                      vnet_buffer (b0)->
501                                      ip.save_rewrite_length);
502       ip4_header_t *ip1 =
503         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b1),
504                                      (is_output_feature ? 1 : 0) *
505                                      vnet_buffer (b1)->
506                                      ip.save_rewrite_length);
507
508       if (PREDICT_FALSE
509           (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
510           || (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
511         {
512           // fragment found, go slow path
513           b -= 2;
514           if (b - bufs > 0)
515             {
516               vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
517                                            b - bufs);
518             }
519           goto slow_path;
520         }
521       if (is_feature)
522         {
523           vnet_feature_next (&next0, b0);
524         }
525       else
526         {
527           next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
528             IP4_SV_REASSEMBLY_NEXT_INPUT;
529         }
530       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
531       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
532       if (l4_layer_truncated (ip0))
533         {
534           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
535           vnet_buffer (b0)->ip.reass.l4_src_port = 0;
536           vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
537         }
538       else
539         {
540           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
541           if (IP_PROTOCOL_TCP == ip0->protocol)
542             {
543               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
544                 ((tcp_header_t *) (ip0 + 1))->flags;
545               vnet_buffer (b0)->ip.reass.tcp_ack_number =
546                 ((tcp_header_t *) (ip0 + 1))->ack_number;
547               vnet_buffer (b0)->ip.reass.tcp_seq_number =
548                 ((tcp_header_t *) (ip0 + 1))->seq_number;
549             }
550           else if (IP_PROTOCOL_ICMP == ip0->protocol)
551             {
552               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
553                 ((icmp46_header_t *) (ip0 + 1))->type;
554             }
555           vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
556           vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
557         }
558       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
559         {
560           ip4_sv_reass_add_trace (
561             vm, node, NULL, from[(b - 2) - bufs], REASS_PASSTHROUGH,
562             vnet_buffer (b0)->ip.reass.ip_proto,
563             vnet_buffer (b0)->ip.reass.l4_src_port,
564             vnet_buffer (b0)->ip.reass.l4_dst_port,
565             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
566         }
567       if (is_feature)
568         {
569           vnet_feature_next (&next1, b1);
570         }
571       else
572         {
573           next1 = is_custom ? vnet_buffer (b1)->ip.reass.next_index :
574             IP4_SV_REASSEMBLY_NEXT_INPUT;
575         }
576       vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
577       vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
578       if (l4_layer_truncated (ip1))
579         {
580           vnet_buffer (b1)->ip.reass.l4_layer_truncated = 1;
581           vnet_buffer (b1)->ip.reass.l4_src_port = 0;
582           vnet_buffer (b1)->ip.reass.l4_dst_port = 0;
583         }
584       else
585         {
586           vnet_buffer (b1)->ip.reass.l4_layer_truncated = 0;
587           if (IP_PROTOCOL_TCP == ip1->protocol)
588             {
589               vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
590                 ((tcp_header_t *) (ip1 + 1))->flags;
591               vnet_buffer (b1)->ip.reass.tcp_ack_number =
592                 ((tcp_header_t *) (ip1 + 1))->ack_number;
593               vnet_buffer (b1)->ip.reass.tcp_seq_number =
594                 ((tcp_header_t *) (ip1 + 1))->seq_number;
595             }
596           else if (IP_PROTOCOL_ICMP == ip1->protocol)
597             {
598               vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
599                 ((icmp46_header_t *) (ip1 + 1))->type;
600             }
601           vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
602           vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
603         }
604       if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
605         {
606           ip4_sv_reass_add_trace (
607             vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
608             vnet_buffer (b1)->ip.reass.ip_proto,
609             vnet_buffer (b1)->ip.reass.l4_src_port,
610             vnet_buffer (b1)->ip.reass.l4_dst_port,
611             vnet_buffer (b1)->ip.reass.l4_layer_truncated);
612         }
613
614       n_left_from -= 2;
615       next[0] = next0;
616       next[1] = next1;
617       next += 2;
618     }
619
620   while (n_left_from > 0)
621     {
622       vlib_buffer_t *b0;
623       u32 next0;
624       b0 = *b;
625       b++;
626
627       ip4_header_t *ip0 =
628         (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
629                                      (is_output_feature ? 1 : 0) *
630                                      vnet_buffer (b0)->
631                                      ip.save_rewrite_length);
632       if (PREDICT_FALSE
633           (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0)))
634         {
635           // fragment found, go slow path
636           b -= 1;
637           if (b - bufs > 0)
638             {
639               vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
640                                            b - bufs);
641             }
642           goto slow_path;
643         }
644       if (is_feature)
645         {
646           vnet_feature_next (&next0, b0);
647         }
648       else
649         {
650           next0 =
651             is_custom ? vnet_buffer (b0)->ip.
652             reass.next_index : IP4_SV_REASSEMBLY_NEXT_INPUT;
653         }
654       vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
655       vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
656       if (l4_layer_truncated (ip0))
657         {
658           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
659         }
660       else
661         {
662           vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
663           if (IP_PROTOCOL_TCP == ip0->protocol)
664             {
665               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
666                 ((tcp_header_t *) (ip0 + 1))->flags;
667               vnet_buffer (b0)->ip.reass.tcp_ack_number =
668                 ((tcp_header_t *) (ip0 + 1))->ack_number;
669               vnet_buffer (b0)->ip.reass.tcp_seq_number =
670                 ((tcp_header_t *) (ip0 + 1))->seq_number;
671             }
672           else if (IP_PROTOCOL_ICMP == ip0->protocol)
673             {
674               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
675                 ((icmp46_header_t *) (ip0 + 1))->type;
676             }
677           vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
678           vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
679         }
680       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
681         {
682           ip4_sv_reass_add_trace (
683             vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
684             vnet_buffer (b0)->ip.reass.ip_proto,
685             vnet_buffer (b0)->ip.reass.l4_src_port,
686             vnet_buffer (b0)->ip.reass.l4_dst_port,
687             vnet_buffer (b0)->ip.reass.l4_layer_truncated);
688         }
689
690       n_left_from -= 1;
691       next[0] = next0;
692       next += 1;
693     }
694
695   vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
696                                frame->n_vectors);
697
698   goto done;
699
700 slow_path:
701
702   from += b - bufs;
703
704   while (n_left_from > 0)
705     {
706       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
707
708       while (n_left_from > 0 && n_left_to_next > 0)
709         {
710           u32 bi0;
711           vlib_buffer_t *b0;
712           u32 next0;
713           u32 error0 = IP4_ERROR_NONE;
714
715           bi0 = from[0];
716           b0 = vlib_get_buffer (vm, bi0);
717
718           ip4_header_t *ip0 =
719             (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
720                                          (is_output_feature ? 1 : 0) *
721                                          vnet_buffer (b0)->
722                                          ip.save_rewrite_length);
723           if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
724             {
725               // this is a regular packet - no fragmentation
726               if (is_custom)
727                 {
728                   next0 = vnet_buffer (b0)->ip.reass.next_index;
729                 }
730               else
731                 {
732                   next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
733                 }
734               vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
735               vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
736               if (l4_layer_truncated (ip0))
737                 {
738                   vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
739                   vnet_buffer (b0)->ip.reass.l4_src_port = 0;
740                   vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
741                 }
742               else
743                 {
744                   vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
745                   if (IP_PROTOCOL_TCP == ip0->protocol)
746                     {
747                       vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
748                         ((tcp_header_t *) (ip0 + 1))->flags;
749                       vnet_buffer (b0)->ip.reass.tcp_ack_number =
750                         ((tcp_header_t *) (ip0 + 1))->ack_number;
751                       vnet_buffer (b0)->ip.reass.tcp_seq_number =
752                         ((tcp_header_t *) (ip0 + 1))->seq_number;
753                     }
754                   else if (IP_PROTOCOL_ICMP == ip0->protocol)
755                     {
756                       vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
757                         ((icmp46_header_t *) (ip0 + 1))->type;
758                     }
759                   vnet_buffer (b0)->ip.reass.l4_src_port =
760                     ip4_get_port (ip0, 1);
761                   vnet_buffer (b0)->ip.reass.l4_dst_port =
762                     ip4_get_port (ip0, 0);
763                 }
764               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
765                 {
766                   ip4_sv_reass_add_trace (
767                     vm, node, NULL, bi0, REASS_PASSTHROUGH,
768                     vnet_buffer (b0)->ip.reass.ip_proto,
769                     vnet_buffer (b0)->ip.reass.l4_src_port,
770                     vnet_buffer (b0)->ip.reass.l4_dst_port,
771                     vnet_buffer (b0)->ip.reass.l4_layer_truncated);
772                 }
773               goto packet_enqueue;
774             }
775           const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
776           const u32 fragment_length =
777             clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
778           const u32 fragment_last = fragment_first + fragment_length - 1;
779           if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
780             {
781               next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
782               error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
783               b0->error = node->errors[error0];
784               goto packet_enqueue;
785             }
786           ip4_sv_reass_kv_t kv;
787           u8 do_handoff = 0;
788
789           kv.k.as_u64[0] =
790             (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
791                            vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
792             (u64) ip0->src_address.as_u32 << 32;
793           kv.k.as_u64[1] =
794             (u64) ip0->dst_address.
795             as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
796
797           ip4_sv_reass_t *reass =
798             ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);
799
800           if (PREDICT_FALSE (do_handoff))
801             {
802               next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
803               vnet_buffer (b0)->ip.reass.owner_thread_index =
804                 kv.v.thread_index;
805               goto packet_enqueue;
806             }
807
808           if (!reass)
809             {
810               next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
811               error0 = IP4_ERROR_REASS_LIMIT_REACHED;
812               b0->error = node->errors[error0];
813               goto packet_enqueue;
814             }
815
816           if (reass->is_complete)
817             {
818               if (is_custom)
819                 {
820                   next0 = vnet_buffer (b0)->ip.reass.next_index;
821                 }
822               else
823                 {
824                   next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
825                 }
826               vnet_buffer (b0)->ip.reass.is_non_first_fragment =
827                 ! !fragment_first;
828               vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
829               vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
830                 reass->icmp_type_or_tcp_flags;
831               vnet_buffer (b0)->ip.reass.tcp_ack_number =
832                 reass->tcp_ack_number;
833               vnet_buffer (b0)->ip.reass.tcp_seq_number =
834                 reass->tcp_seq_number;
835               vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
836               vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
837               if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
838                 {
839                   ip4_sv_reass_add_trace (
840                     vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
841                     reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
842                     vnet_buffer (b0)->ip.reass.l4_layer_truncated);
843                 }
844               goto packet_enqueue;
845             }
846
847           ip4_sv_reass_rc_t rc =
848             ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0);
849           switch (rc)
850             {
851             case IP4_SV_REASS_RC_OK:
852               /* nothing to do here */
853               break;
854             case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
855               vlib_node_increment_counter (vm, node->node_index,
856                                            IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
857                                            1);
858               ip4_sv_reass_free (vm, rm, rt, reass);
859               goto next_packet;
860               break;
861             case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
862               vlib_node_increment_counter (vm, node->node_index,
863                                            IP4_ERROR_REASS_UNSUPP_IP_PROT, 1);
864               ip4_sv_reass_free (vm, rm, rt, reass);
865               goto next_packet;
866               break;
867             }
868           if (reass->is_complete)
869             {
870               u32 idx;
871               vec_foreach_index (idx, reass->cached_buffers)
872               {
873                 u32 bi0 = vec_elt (reass->cached_buffers, idx);
874                 vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
875                 ip0 =
876                   (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
877                                                (is_output_feature ? 1 : 0) *
878                                                vnet_buffer (b0)->
879                                                ip.save_rewrite_length);
880                 u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
881                 if (is_feature)
882                   {
883                     vnet_feature_next (&next0, b0);
884                   }
885                 if (is_custom)
886                   {
887                     next0 = vnet_buffer (b0)->ip.reass.next_index;
888                   }
889                 if (0 == n_left_to_next)
890                   {
891                     vlib_put_next_frame (vm, node, next_index,
892                                          n_left_to_next);
893                     vlib_get_next_frame (vm, node, next_index, to_next,
894                                          n_left_to_next);
895                   }
896                 to_next[0] = bi0;
897                 to_next += 1;
898                 n_left_to_next -= 1;
899                 vnet_buffer (b0)->ip.reass.is_non_first_fragment =
900                   ! !ip4_get_fragment_offset (ip0);
901                 vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
902                 vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
903                   reass->icmp_type_or_tcp_flags;
904                 vnet_buffer (b0)->ip.reass.tcp_ack_number =
905                   reass->tcp_ack_number;
906                 vnet_buffer (b0)->ip.reass.tcp_seq_number =
907                   reass->tcp_seq_number;
908                 vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
909                 vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
910                 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
911                   {
912                     ip4_sv_reass_add_trace (
913                       vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
914                       reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
915                       vnet_buffer (b0)->ip.reass.l4_layer_truncated);
916                   }
917                 vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
918                                                  to_next, n_left_to_next, bi0,
919                                                  next0);
920               }
921               _vec_len (reass->cached_buffers) = 0;     // buffers are owned by frame now
922             }
923           goto next_packet;
924
925         packet_enqueue:
926           to_next[0] = bi0;
927           to_next += 1;
928           n_left_to_next -= 1;
929           if (is_feature && IP4_ERROR_NONE == error0)
930             {
931               b0 = vlib_get_buffer (vm, bi0);
932               vnet_feature_next (&next0, b0);
933             }
934           vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
935                                            to_next, n_left_to_next,
936                                            bi0, next0);
937
938         next_packet:
939           from += 1;
940           n_left_from -= 1;
941         }
942
943       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
944     }
945
946 done:
947   clib_spinlock_unlock (&rt->lock);
948   return frame->n_vectors;
949 }
950
/* Per-node error strings, expanded from the shared IP4 error table so the
 * counters this node reports line up with the global ip4 error numbering. */
static char *ip4_sv_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
956
/**
 * @brief Entry point for the plain "ip4-sv-reassembly" node.
 *
 * Non-feature, non-custom variant: dispatches to the shared inline
 * with all specialization flags disabled.
 */
VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
				  vlib_node_runtime_t * node,
				  vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
			      false /* is_output_feature */ ,
			      false /* is_custom */ );
}
965
/* *INDENT-OFF* */
/* Registration of the standalone SV reassembly node; its handoff next
 * is the non-feature handoff node. */
VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
    .name = "ip4-sv-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
983
/**
 * @brief Entry point for the ip4-unicast feature-arc variant of SV
 * reassembly (is_feature set, next chosen via vnet_feature_next).
 */
VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
					  vlib_node_runtime_t * node,
					  vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
			      false /* is_output_feature */ ,
			      false /* is_custom */ );
}
992
/* *INDENT-OFF* */
/* Registration of the feature-arc SV reassembly node; hands off via the
 * feature handoff node ("ip4-sv-reass-feature-hoff"). */
VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
    .name = "ip4-sv-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1009
/* *INDENT-OFF* */
/* Insert the SV reassembly feature on the ip4-unicast arc, ordered
 * before ip4-lookup so lookup sees the reassembly metadata. */
VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-sv-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1018
/**
 * @brief Entry point for the ip4-output feature-arc variant; the
 * is_output_feature flag makes the inline skip the rewrite bytes
 * (ip.save_rewrite_length) when locating the IP header.
 */
VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
						 vlib_node_runtime_t * node,
						 vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
			      true /* is_output_feature */ ,
			      false /* is_custom */ );
}
1027
1028
/* *INDENT-OFF* */
/* Registration of the output-direction SV reassembly node; shares the
 * feature handoff node with the input-feature variant. */
VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
    .name = "ip4-sv-reassembly-output-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1045
/* *INDENT-OFF* */
/* Register the output-direction SV reassembly feature on the
 * ip4-output arc (no ordering constraints). */
VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
    .arc_name = "ip4-output",
    .node_name = "ip4-sv-reassembly-output-feature",
    .runs_before = 0,
    .runs_after = 0,
};
/* *INDENT-ON* */
1054
/* *INDENT-OFF* */
/* Registration of the custom-next SV reassembly node; callers add
 * their own next via ip4_sv_reass_custom_register_next_node(). */
VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
    .name = "ip4-sv-reassembly-custom-next",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,
    .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
    .next_nodes =
        {
                [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
                [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
                [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1072
/**
 * @brief Entry point for the custom-next variant: is_custom makes the
 * inline route buffers to vnet_buffer()->ip.reass.next_index instead
 * of the static next nodes.
 */
VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
					 vlib_node_runtime_t * node,
					 vlib_frame_t * frame)
{
  return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
			      false /* is_output_feature */ ,
			      true /* is_custom */ );
}
1081
1082 #ifndef CLIB_MARCH_VARIANT
1083 always_inline u32
1084 ip4_sv_reass_get_nbuckets ()
1085 {
1086   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1087   u32 nbuckets;
1088   u8 i;
1089
1090   nbuckets = (u32) (rm->max_reass_n / IP4_SV_REASS_HT_LOAD_FACTOR);
1091
1092   for (i = 0; i < 31; i++)
1093     if ((1 << i) >= nbuckets)
1094       break;
1095   nbuckets = 1 << i;
1096
1097   return nbuckets;
1098 }
1099 #endif /* CLIB_MARCH_VARIANT */
1100
/* Events delivered to the expire-walk process node. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,	/* parameters changed via ip4_sv_reass_set */
} ip4_sv_reass_event_t;
1105
/* Context shared with ip4_rehash_cb while copying entries into a
 * resized hash table. */
typedef struct
{
  int failure;			/* set to 1 if any insert into new_hash failed */
  clib_bihash_16_8_t *new_hash;	/* destination table being populated */
} ip4_rehash_cb_ctx;
1111
1112 #ifndef CLIB_MARCH_VARIANT
1113 static int
1114 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1115 {
1116   ip4_rehash_cb_ctx *ctx = _ctx;
1117   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1118     {
1119       ctx->failure = 1;
1120     }
1121   return (BIHASH_WALK_CONTINUE);
1122 }
1123
1124 static void
1125 ip4_sv_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1126                          u32 max_reassembly_length,
1127                          u32 expire_walk_interval_ms)
1128 {
1129   ip4_sv_reass_main.timeout_ms = timeout_ms;
1130   ip4_sv_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1131   ip4_sv_reass_main.max_reass_n = max_reassemblies;
1132   ip4_sv_reass_main.max_reass_len = max_reassembly_length;
1133   ip4_sv_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1134 }
1135
/**
 * @brief Apply new reassembly parameters (API/CLI entry point).
 *
 * Stores the new values, signals the expire-walk process so it picks
 * up a changed walk interval, and grows the hash table when the new
 * maximum implies more buckets. Existing entries are copied into the
 * new table; on copy failure the new table is freed and the original
 * is kept.
 *
 * @return 0 on success, -1 if rehashing failed.
 */
vnet_api_error_t
ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
		  u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
  ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
			   max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
			     ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
			     IP4_EVENT_CONFIG_CHANGED, 0);
  /* nbuckets depends on max_reass_n, so recompute after set_params */
  u32 new_nbuckets = ip4_sv_reass_get_nbuckets ();
  if (ip4_sv_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
			     new_nbuckets * 1024);
      /* copy all existing entries into the bigger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_sv_reass_main.hash,
					       ip4_rehash_cb, &ctx);
      if (ctx.failure)
	{
	  clib_bihash_free_16_8 (&new_hash);
	  return -1;
	}
      else
	{
	  /* replace the live table: free old, struct-copy new in place,
	   * then let bihash fix up internal pointers */
	  clib_bihash_free_16_8 (&ip4_sv_reass_main.hash);
	  clib_memcpy_fast (&ip4_sv_reass_main.hash, &new_hash,
			    sizeof (ip4_sv_reass_main.hash));
	  clib_bihash_copied (&ip4_sv_reass_main.hash, &new_hash);
	}
    }
  return 0;
}
1173
1174 vnet_api_error_t
1175 ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1176                   u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1177 {
1178   *timeout_ms = ip4_sv_reass_main.timeout_ms;
1179   *max_reassemblies = ip4_sv_reass_main.max_reass_n;
1180   *max_reassembly_length = ip4_sv_reass_main.max_reass_len;
1181   *expire_walk_interval_ms = ip4_sv_reass_main.expire_walk_interval_ms;
1182   return 0;
1183 }
1184
/**
 * @brief One-time init: allocates per-thread state, sets default
 * parameters, creates the hash table and the handoff frame queues.
 */
static clib_error_t *
ip4_sv_reass_init_function (vlib_main_t * vm)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;
  rm->vnet_main = vnet_get_main ();

  /* one pool + spinlock per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_sv_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    /* NOTE(review): rm->max_reass_n is still 0 here - set_params runs
     * below - so this pre-allocates nothing and the pools grow on
     * demand; confirm whether pre-allocation was intended. */
    pool_alloc (rt->pool, rm->max_reass_n);
    rt->lru_first = rt->lru_last = ~0;
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_sv_reass_expire_node_idx = node->index;

  ip4_sv_reass_set_params (IP4_SV_REASS_TIMEOUT_DEFAULT_MS,
			   IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT,
			   IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
			   IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  /* hash sizing uses max_reass_n, so this must follow set_params */
  nbuckets = ip4_sv_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes (plain and feature arcs) */
  rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  rm->output_feature_use_refcount_per_intf = NULL;

  return error;
}
1230
1231 VLIB_INIT_FUNCTION (ip4_sv_reass_init_function);
1232 #endif /* CLIB_MARCH_VARIANT */
1233
/**
 * @brief Process node that frees reassembly contexts whose last
 * activity is older than the configured timeout.
 *
 * Wakes up every expire_walk_interval_ms (or early on a config-change
 * event) and scans every thread's pool under that thread's spinlock.
 * Runs forever; the final return is never reached.
 */
static uword
ip4_sv_reass_walk_expired (vlib_main_t *vm,
			   CLIB_UNUSED (vlib_node_runtime_t *node),
			   CLIB_UNUSED (vlib_frame_t *f))
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
					    (f64)
					    rm->expire_walk_interval_ms /
					    (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
	{
	case ~0:
	  /* no events => timeout */
	  /* fallthrough */
	case IP4_EVENT_CONFIG_CHANGED:
	  /* nothing to do here */
	  break;
	default:
	  clib_warning ("BUG: event type 0x%wx", event_type);
	  break;
	}
      f64 now = vlib_time_now (vm);

      ip4_sv_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
	{
	  ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
	  clib_spinlock_lock (&rt->lock);

	  /* collect expired indices first, then free, so the pool is
	   * not mutated while pool_foreach_index iterates it */
	  vec_reset_length (pool_indexes_to_free);
	  /* *INDENT-OFF* */
	  pool_foreach_index (index, rt->pool)  {
				reass = pool_elt_at_index (rt->pool, index);
				if (now > reass->last_heard + rm->timeout)
				  {
				    vec_add1 (pool_indexes_to_free, index);
				  }
			      }
	  /* *INDENT-ON* */
	  int *i;
	  /* *INDENT-OFF* */
	  vec_foreach (i, pool_indexes_to_free)
	  {
	    ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
	    ip4_sv_reass_free (vm, rm, rt, reass);
	  }
	  /* *INDENT-ON* */

	  clib_spinlock_unlock (&rt->lock);
	}

      vec_free (pool_indexes_to_free);
      if (event_data)
	{
	  /* keep the vector allocated, just empty it for the next wait */
	  _vec_len (event_data) = 0;
	}
    }

  return 0;
}
1306
/* *INDENT-OFF* */
/* Process node driving periodic expiry of stale reassemblies. */
VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
    .function = ip4_sv_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-sv-reassembly-expire-walk",
    .format_trace = format_ip4_sv_reass_trace,
    .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
    .error_strings = ip4_sv_reass_error_strings,

};
/* *INDENT-ON* */
1318
1319 static u8 *
1320 format_ip4_sv_reass_key (u8 * s, va_list * args)
1321 {
1322   ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
1323   s =
1324     format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1325             key->fib_index, format_ip4_address, &key->src, format_ip4_address,
1326             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1327   return s;
1328 }
1329
1330 static u8 *
1331 format_ip4_sv_reass (u8 * s, va_list * args)
1332 {
1333   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1334   ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);
1335
1336   s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
1337               reass->id, format_ip4_sv_reass_key, &reass->key,
1338               reass->trace_op_counter);
1339
1340   vlib_buffer_t *b;
1341   u32 *bip;
1342   u32 counter = 0;
1343   vec_foreach (bip, reass->cached_buffers)
1344   {
1345     u32 bi = *bip;
1346     do
1347       {
1348         b = vlib_get_buffer (vm, bi);
1349         s = format (s, "  #%03u: bi: %u, ", counter, bi);
1350         ++counter;
1351         bi = b->next_buffer;
1352       }
1353     while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
1354   }
1355   return s;
1356 }
1357
1358 static clib_error_t *
1359 show_ip4_reass (vlib_main_t * vm,
1360                 unformat_input_t * input,
1361                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1362 {
1363   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1364
1365   vlib_cli_output (vm, "---------------------");
1366   vlib_cli_output (vm, "IP4 reassembly status");
1367   vlib_cli_output (vm, "---------------------");
1368   bool details = false;
1369   if (unformat (input, "details"))
1370     {
1371       details = true;
1372     }
1373
1374   u32 sum_reass_n = 0;
1375   ip4_sv_reass_t *reass;
1376   uword thread_index;
1377   const uword nthreads = vlib_num_workers () + 1;
1378   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1379     {
1380       ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1381       clib_spinlock_lock (&rt->lock);
1382       if (details)
1383         {
1384           /* *INDENT-OFF* */
1385           pool_foreach (reass, rt->pool) {
1386             vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
1387           }
1388           /* *INDENT-ON* */
1389         }
1390       sum_reass_n += rt->reass_n;
1391       clib_spinlock_unlock (&rt->lock);
1392     }
1393   vlib_cli_output (vm, "---------------------");
1394   vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
1395                    (long unsigned) sum_reass_n);
1396   vlib_cli_output (vm,
1397                    "Maximum configured concurrent shallow virtual IP4 reassemblies per worker-thread: %lu\n",
1398                    (long unsigned) rm->max_reass_n);
1399   vlib_cli_output (vm,
1400                    "Maximum configured amount of fragments per shallow "
1401                    "virtual IP4 reassembly: %lu\n",
1402                    (long unsigned) rm->max_reass_len);
1403   vlib_cli_output (vm,
1404                    "Maximum configured shallow virtual IP4 reassembly timeout: %lums\n",
1405                    (long unsigned) rm->timeout_ms);
1406   vlib_cli_output (vm,
1407                    "Maximum configured shallow virtual IP4 reassembly expire walk interval: %lums\n",
1408                    (long unsigned) rm->expire_walk_interval_ms);
1409   return 0;
1410 }
1411
/* *INDENT-OFF* */
/* CLI registration: "show ip4-sv-reassembly [details]". */
VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
    .path = "show ip4-sv-reassembly",
    .short_help = "show ip4-sv-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1419
1420 #ifndef CLIB_MARCH_VARIANT
/**
 * @brief Public API wrapper: enable/disable the SV reassembly feature
 * on an interface via the reference-counted helper.
 *
 * NOTE(review): enable_disable is u8 here while the helper takes int;
 * the value passes through unchanged.
 */
vnet_api_error_t
ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index,
						  enable_disable);
}
1427 #endif /* CLIB_MARCH_VARIANT */
1428
1429
1430 #define foreach_ip4_sv_reass_handoff_error                       \
1431 _(CONGESTION_DROP, "congestion drop")
1432
1433
/* Error counters for the handoff nodes, generated from
 * foreach_ip4_sv_reass_handoff_error. */
typedef enum
{
#define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
  foreach_ip4_sv_reass_handoff_error
#undef _
    IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
} ip4_sv_reass_handoff_error_t;
1441
/* Strings matching ip4_sv_reass_handoff_error_t, in enum order. */
static char *ip4_sv_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_sv_reass_handoff_error
#undef _
};
1447
/* Trace record for handoff: which worker the buffer was sent to. */
typedef struct
{
  u32 next_worker_index;
} ip4_sv_reass_handoff_trace_t;
1452
1453 static u8 *
1454 format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
1455 {
1456   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1457   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1458   ip4_sv_reass_handoff_trace_t *t =
1459     va_arg (*args, ip4_sv_reass_handoff_trace_t *);
1460
1461   s =
1462     format (s, "ip4-sv-reassembly-handoff: next-worker %d",
1463             t->next_worker_index);
1464
1465   return s;
1466 }
1467
/**
 * @brief Hand buffers off to the thread that owns their reassembly.
 *
 * The destination thread for each buffer was stored in
 * vnet_buffer()->ip.reass.owner_thread_index; this node enqueues the
 * frame to the matching frame queue (feature or plain, per is_feature)
 * and counts congestion drops when the queue cannot take all packets.
 */
always_inline uword
ip4_sv_reass_handoff_node_inline (vlib_main_t * vm,
				  vlib_node_runtime_t * node,
				  vlib_frame_t * frame, bool is_feature)
{
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  /* pick the frame queue matching this node variant */
  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
	  ((node->flags & VLIB_NODE_FLAG_TRACE)
	   && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
	{
	  ip4_sv_reass_handoff_trace_t *t =
	    vlib_add_trace (vm, node, b[0], sizeof (*t));
	  t->next_worker_index = ti[0];
	}

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* final argument 1 requests drop-on-congestion behavior */
  n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
					 thread_indices, frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
				 IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
				 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1515
/**
 * @brief Entry point for the non-feature handoff node.
 */
VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
					  vlib_node_runtime_t * node,
					  vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (vm, node, frame,
					   false /* is_feature */ );
}
1523
1524
/* *INDENT-OFF* */
/* Registration for the non-feature handoff node; sole next is
 * error-drop for buffers that cannot be handed off. */
VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
  .name = "ip4-sv-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1540
1541
/* *INDENT-OFF* */
/**
 * @brief Entry point for the feature-arc handoff node.
 */
VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
						    vlib_node_runtime_t *
						    node,
						    vlib_frame_t * frame)
{
  return ip4_sv_reass_handoff_node_inline (vm, node, frame,
					     true /* is_feature */ );
}
/* *INDENT-ON* */
1552
1553
/* *INDENT-OFF* */
/* Registration for the feature-arc handoff node; sole next is
 * error-drop for buffers that cannot be handed off. */
VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
  .name = "ip4-sv-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
  .error_strings = ip4_sv_reass_handoff_error_strings,
  .format_trace = format_ip4_sv_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1569
1570 #ifndef CLIB_MARCH_VARIANT
1571 int
1572 ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1573 {
1574   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1575   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1576   if (is_enable)
1577     {
1578       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1579         {
1580           ++rm->feature_use_refcount_per_intf[sw_if_index];
1581           return vnet_feature_enable_disable ("ip4-unicast",
1582                                               "ip4-sv-reassembly-feature",
1583                                               sw_if_index, 1, 0, 0);
1584         }
1585       ++rm->feature_use_refcount_per_intf[sw_if_index];
1586     }
1587   else
1588     {
1589       if (rm->feature_use_refcount_per_intf[sw_if_index])
1590         --rm->feature_use_refcount_per_intf[sw_if_index];
1591       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1592         return vnet_feature_enable_disable ("ip4-unicast",
1593                                             "ip4-sv-reassembly-feature",
1594                                             sw_if_index, 0, 0, 0);
1595     }
1596   return 0;
1597 }
1598
/**
 * @brief Add a caller-chosen next node to the custom reassembly node.
 *
 * @return the next index to store in vnet_buffer()->ip.reass.next_index.
 */
uword
ip4_sv_reass_custom_register_next_node (uword node_index)
{
  return vlib_node_add_next (vlib_get_main (), ip4_sv_reass_custom_node.index,
			     node_index);
}
1605
1606 int
1607 ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
1608                                                 int is_enable)
1609 {
1610   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1611   vec_validate (rm->output_feature_use_refcount_per_intf, sw_if_index);
1612   if (is_enable)
1613     {
1614       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1615         {
1616           ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1617           return vnet_feature_enable_disable ("ip4-output",
1618                                               "ip4-sv-reassembly-output-feature",
1619                                               sw_if_index, 1, 0, 0);
1620         }
1621       ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1622     }
1623   else
1624     {
1625       if (rm->output_feature_use_refcount_per_intf[sw_if_index])
1626         --rm->output_feature_use_refcount_per_intf[sw_if_index];
1627       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1628         return vnet_feature_enable_disable ("ip4-output",
1629                                             "ip4-sv-reassembly-output-feature",
1630                                             sw_if_index, 0, 0, 0);
1631     }
1632   return 0;
1633 }
1634 #endif
1635
1636 /*
1637  * fd.io coding-style-patch-verification: ON
1638  *
1639  * Local Variables:
1640  * eval: (c-set-style "gnu")
1641  * End:
1642  */