9b3f1b9855850cfb9966e44ed6c93684957adef7
[vpp.git] / src / vnet / ip / reass / ip4_sv_reass.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv4 Shallow Virtual Reassembly.
19  *
20  * This file contains the source code for IPv4 Shallow Virtual reassembly.
21  */
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vnet/ip/ip4_to_ip6.h>
27 #include <vppinfra/fifo.h>
28 #include <vppinfra/bihash_16_8.h>
29 #include <vnet/ip/reass/ip4_sv_reass.h>
30
// number of milliseconds in one second
#define MSEC_PER_SEC 1000
// default reassembly timeout, in milliseconds (per the _MS suffix)
#define IP4_SV_REASS_TIMEOUT_DEFAULT_MS 100
// default interval of the expire walk, in milliseconds
#define IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000      // 10 seconds default
// default for the maximum number of reassemblies
// NOTE(review): presumably seeds ip4_sv_reass_main_t.max_reass_n - the
// consumer is not visible in this part of the file
#define IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT 1024
// default for the maximum number of fragments in one reassembly
// NOTE(review): presumably seeds ip4_sv_reass_main_t.max_reass_len - confirm
#define IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
// hash table load factor
// NOTE(review): presumably used when sizing the bihash - confirm
#define IP4_SV_REASS_HT_LOAD_FACTOR (0.75)
37
// result codes returned by ip4_sv_reass_update()
typedef enum
{
  // fragment was processed successfully
  IP4_SV_REASS_RC_OK,
  // number of cached buffers exceeded the configured max_reass_len
  IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS,
  // first fragment yielded a zero l4 source or destination port
  IP4_SV_REASS_RC_UNSUPP_IP_PROTO,
} ip4_sv_reass_rc_t;
44
// 16-byte hash key identifying one reassembly; the named fields overlay
// as_u64[2], which is what gets copied into the bihash key
typedef struct
{
  union
  {
    struct
    {
      // filled with the fib index looked up via the RX sw_if_index
      u32 xx_id;
      // source address of the fragment
      ip4_address_t src;
      // destination address of the fragment
      ip4_address_t dst;
      // ip fragment id
      u16 frag_id;
      // ip protocol
      u8 proto;
      // padding - left zero by the key construction
      u8 unused;
    };
    u64 as_u64[2];
  };
} ip4_sv_reass_key_t;
61
// 8-byte hash value locating a reassembly context
typedef union
{
  struct
  {
    // index of the reassembly in the owning thread's pool
    u32 reass_index;
    // index of the thread which owns the reassembly
    u32 thread_index;
  };
  u64 as_u64;
} ip4_sv_reass_val_t;
71
// typed view (k, v) overlaid on the raw bihash key/value pair (kv)
typedef union
{
  struct
  {
    ip4_sv_reass_key_t k;
    ip4_sv_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_sv_reass_kv_t;
81
// state of one shallow virtual reassembly
typedef struct
{
  // hash table key
  ip4_sv_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // trace operation counter
  u32 trace_op_counter;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // buffer indexes of buffers in this reassembly in chronological order -
  // including overlaps and duplicate fragments
  u32 *cached_buffers;
  // set to true once the first fragment (offset 0) was seen and the l4
  // information below is therefore known
  bool is_complete;
  // ip protocol
  u8 ip_proto;
  // tcp flags (for TCP) or icmp type (for ICMP) taken from the first fragment
  u8 icmp_type_or_tcp_flags;
  // tcp ack number copied from the first fragment (TCP only)
  u32 tcp_ack_number;
  // tcp sequence number copied from the first fragment (TCP only)
  u32 tcp_seq_number;
  // l4 src port
  u16 l4_src_port;
  // l4 dst port
  u16 l4_dst_port;
  // NOTE(review): not written or read in the visible part of this file
  u32 next_index;
  // lru indexes - pool indexes of the neighbours, ~0 if there is none
  u32 lru_prev;
  u32 lru_next;
} ip4_sv_reass_t;
113
// per-thread reassembly state
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_sv_reass_t *pool;
  // number of reassemblies currently allocated from the pool
  u32 reass_n;
  // counter used to build the internal id of newly created reassemblies
  u32 id_counter;
  // lock held by the node function while it processes a frame
  clib_spinlock_t lock;
  // lru indexes - pool indexes of the oldest/newest entry, ~0 when unset
  u32 lru_first;
  u32 lru_last;

} ip4_sv_reass_per_thread_t;
125
// global state of the IPv4 shallow virtual reassembly feature
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout compared against vlib_time_now() deltas
  // NOTE(review): presumably timeout_ms converted to seconds - confirm
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_sv_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_sv_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;

  // reference count for enabling/disabling output feature - per interface
  u32 *output_feature_use_refcount_per_intf;

} ip4_sv_reass_main_t;
161
// single global instance of the reassembly state; the definition is
// compiled only when CLIB_MARCH_VARIANT is not defined, so builds that
// define it see just the extern declaration
extern ip4_sv_reass_main_t ip4_sv_reass_main;

#ifndef CLIB_MARCH_VARIANT
ip4_sv_reass_main_t ip4_sv_reass_main;
#endif /* CLIB_MARCH_VARIANT */
167
// next-node indexes used by the reassembly nodes
typedef enum
{
  // regular forwarding of the packet
  IP4_SV_REASSEMBLY_NEXT_INPUT,
  // packet is dropped (malformed fragment or no reassembly available)
  IP4_SV_REASSEMBLY_NEXT_DROP,
  // packet belongs to a reassembly owned by a different thread
  IP4_SV_REASSEMBLY_NEXT_HANDOFF,
  // number of next nodes
  IP4_SV_REASSEMBLY_N_NEXT,
} ip4_sv_reass_next_t;
175
// operations recorded in packet traces
typedef enum
{
  // fragment was cached, l4 information is not known yet
  REASS_FRAGMENT_CACHE,
  // first fragment arrived, l4 information is now known
  REASS_FINISH,
  // fragment forwarded with l4 information taken from the reassembly
  REASS_FRAGMENT_FORWARD,
  // packet is not a fragment
  REASS_PASSTHROUGH,
} ip4_sv_reass_trace_operation_e;
183
// per-packet trace record, rendered by format_ip4_sv_reass_trace()
typedef struct
{
  // what was done with the packet
  ip4_sv_reass_trace_operation_e action;
  // internal id of the reassembly (not printed for REASS_PASSTHROUGH)
  u32 reass_id;
  // value of the reassembly's trace operation counter at trace time
  u32 op_id;
  // ip protocol
  u8 ip_proto;
  // l4 ports - converted with clib_net_to_host_u16() when printed
  u16 l4_src_port;
  u16 l4_dst_port;
} ip4_sv_reass_trace_t;
193
// node registrations; ip4_sv_reass_node is defined further below,
// ip4_sv_reass_node_feature is not defined in the visible part of the file
extern vlib_node_registration_t ip4_sv_reass_node;
extern vlib_node_registration_t ip4_sv_reass_node_feature;
196
197 static u8 *
198 format_ip4_sv_reass_trace (u8 * s, va_list * args)
199 {
200   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
201   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
202   ip4_sv_reass_trace_t *t = va_arg (*args, ip4_sv_reass_trace_t *);
203   if (REASS_PASSTHROUGH != t->action)
204     {
205       s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
206     }
207   switch (t->action)
208     {
209     case REASS_FRAGMENT_CACHE:
210       s = format (s, "[cached]");
211       break;
212     case REASS_FINISH:
213       s =
214         format (s, "[finish, ip proto=%u, src_port=%u, dst_port=%u]",
215                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
216                 clib_net_to_host_u16 (t->l4_dst_port));
217       break;
218     case REASS_FRAGMENT_FORWARD:
219       s =
220         format (s, "[forward, ip proto=%u, src_port=%u, dst_port=%u]",
221                 t->ip_proto, clib_net_to_host_u16 (t->l4_src_port),
222                 clib_net_to_host_u16 (t->l4_dst_port));
223       break;
224     case REASS_PASSTHROUGH:
225       s = format (s, "[not-fragmented]");
226       break;
227     }
228   return s;
229 }
230
231 static void
232 ip4_sv_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
233                         ip4_sv_reass_main_t * rm, ip4_sv_reass_t * reass,
234                         u32 bi, ip4_sv_reass_trace_operation_e action,
235                         u32 ip_proto, u16 l4_src_port, u16 l4_dst_port)
236 {
237   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
238   if (pool_is_free_index
239       (vm->trace_main.trace_buffer_pool, vlib_buffer_get_trace_index (b)))
240     {
241       // this buffer's trace is gone
242       b->flags &= ~VLIB_BUFFER_IS_TRACED;
243       return;
244     }
245   ip4_sv_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
246   if (reass)
247     {
248       t->reass_id = reass->id;
249       t->op_id = reass->trace_op_counter;
250       ++reass->trace_op_counter;
251     }
252   t->action = action;
253   t->ip_proto = ip_proto;
254   t->l4_src_port = l4_src_port;
255   t->l4_dst_port = l4_dst_port;
256 #if 0
257   static u8 *s = NULL;
258   s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
259   printf ("%.*s\n", vec_len (s), s);
260   fflush (stdout);
261   vec_reset_length (s);
262 #endif
263 }
264
265
266 always_inline void
267 ip4_sv_reass_free (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
268                    ip4_sv_reass_per_thread_t * rt, ip4_sv_reass_t * reass)
269 {
270   clib_bihash_kv_16_8_t kv;
271   kv.key[0] = reass->key.as_u64[0];
272   kv.key[1] = reass->key.as_u64[1];
273   clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
274   vlib_buffer_free (vm, reass->cached_buffers,
275                     vec_len (reass->cached_buffers));
276   vec_free (reass->cached_buffers);
277   reass->cached_buffers = NULL;
278   if (~0 != reass->lru_prev)
279     {
280       ip4_sv_reass_t *lru_prev =
281         pool_elt_at_index (rt->pool, reass->lru_prev);
282       lru_prev->lru_next = reass->lru_next;
283     }
284   if (~0 != reass->lru_next)
285     {
286       ip4_sv_reass_t *lru_next =
287         pool_elt_at_index (rt->pool, reass->lru_next);
288       lru_next->lru_prev = reass->lru_prev;
289     }
290   if (rt->lru_first == reass - rt->pool)
291     {
292       rt->lru_first = reass->lru_next;
293     }
294   if (rt->lru_last == reass - rt->pool)
295     {
296       rt->lru_last = reass->lru_prev;
297     }
298   pool_put (rt->pool, reass);
299   --rt->reass_n;
300 }
301
302 always_inline void
303 ip4_sv_reass_init (ip4_sv_reass_t * reass)
304 {
305   reass->cached_buffers = NULL;
306   reass->is_complete = false;
307 }
308
309 always_inline ip4_sv_reass_t *
310 ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
311                              ip4_sv_reass_per_thread_t * rt,
312                              ip4_sv_reass_kv_t * kv, u8 * do_handoff)
313 {
314   ip4_sv_reass_t *reass = NULL;
315   f64 now = vlib_time_now (vm);
316
317   if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
318     {
319       if (vm->thread_index != kv->v.thread_index)
320         {
321           *do_handoff = 1;
322           return NULL;
323         }
324       reass = pool_elt_at_index (rt->pool, kv->v.reass_index);
325
326       if (now > reass->last_heard + rm->timeout)
327         {
328           ip4_sv_reass_free (vm, rm, rt, reass);
329           reass = NULL;
330         }
331     }
332
333   if (reass)
334     {
335       reass->last_heard = now;
336       return reass;
337     }
338
339   if (rt->reass_n >= rm->max_reass_n && rm->max_reass_n)
340     {
341       reass = pool_elt_at_index (rt->pool, rt->lru_first);
342       ip4_sv_reass_free (vm, rm, rt, reass);
343     }
344
345   pool_get (rt->pool, reass);
346   clib_memset (reass, 0, sizeof (*reass));
347   reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
348   ++rt->id_counter;
349   ip4_sv_reass_init (reass);
350   ++rt->reass_n;
351   reass->lru_prev = reass->lru_next = ~0;
352
353   if (~0 != rt->lru_last)
354     {
355       ip4_sv_reass_t *lru_last = pool_elt_at_index (rt->pool, rt->lru_last);
356       reass->lru_prev = rt->lru_last;
357       lru_last->lru_next = rt->lru_last = reass - rt->pool;
358     }
359
360   if (~0 == rt->lru_first)
361     {
362       rt->lru_first = rt->lru_last = reass - rt->pool;
363     }
364
365   reass->key.as_u64[0] = kv->kv.key[0];
366   reass->key.as_u64[1] = kv->kv.key[1];
367   kv->v.reass_index = (reass - rt->pool);
368   kv->v.thread_index = vm->thread_index;
369   reass->last_heard = now;
370
371   if (clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 1))
372     {
373       ip4_sv_reass_free (vm, rm, rt, reass);
374       reass = NULL;
375     }
376
377   return reass;
378 }
379
380 always_inline ip4_sv_reass_rc_t
381 ip4_sv_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
382                      ip4_sv_reass_main_t * rm, ip4_sv_reass_per_thread_t * rt,
383                      ip4_header_t * ip0, ip4_sv_reass_t * reass, u32 bi0)
384 {
385   vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
386   ip4_sv_reass_rc_t rc = IP4_SV_REASS_RC_OK;
387   const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
388   if (0 == fragment_first)
389     {
390       reass->ip_proto = ip0->protocol;
391       reass->l4_src_port = ip4_get_port (ip0, 1);
392       reass->l4_dst_port = ip4_get_port (ip0, 0);
393       if (!reass->l4_src_port || !reass->l4_dst_port)
394         return IP4_SV_REASS_RC_UNSUPP_IP_PROTO;
395       if (IP_PROTOCOL_TCP == reass->ip_proto)
396         {
397           reass->icmp_type_or_tcp_flags = ((tcp_header_t *) (ip0 + 1))->flags;
398           reass->tcp_ack_number = ((tcp_header_t *) (ip0 + 1))->ack_number;
399           reass->tcp_seq_number = ((tcp_header_t *) (ip0 + 1))->seq_number;
400         }
401       else if (IP_PROTOCOL_ICMP == reass->ip_proto)
402         {
403           reass->icmp_type_or_tcp_flags =
404             ((icmp46_header_t *) (ip0 + 1))->type;
405         }
406       reass->is_complete = true;
407       vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
408       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
409         {
410           ip4_sv_reass_add_trace (vm, node, rm, reass, bi0, REASS_FINISH,
411                                   reass->ip_proto, reass->l4_src_port,
412                                   reass->l4_dst_port);
413         }
414     }
415   vec_add1 (reass->cached_buffers, bi0);
416   if (!reass->is_complete)
417     {
418       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
419         {
420           ip4_sv_reass_add_trace (vm, node, rm, reass, bi0,
421                                   REASS_FRAGMENT_CACHE, ~0, ~0, ~0);
422         }
423       if (vec_len (reass->cached_buffers) > rm->max_reass_len)
424         {
425           rc = IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS;
426         }
427     }
428   return rc;
429 }
430
/**
 * @brief Common worker of the shallow virtual reassembly nodes.
 *
 * Runs with the calling thread's per-thread lock held. Processing starts
 * on a fast path which assumes that no packet is a fragment: for every
 * packet the l4 metadata in vnet_buffer (b)->ip.reass is filled straight
 * from the packet and the next index is stored in a local array. When the
 * first fragment is met, the packets handled so far are enqueued and the
 * rest of the frame goes through the slow path, which looks up or creates
 * a reassembly per fragment, caches fragments until the first fragment
 * (offset 0) is seen and then forwards the cached fragments with the l4
 * metadata copied from the reassembly.
 *
 * @param vm                vlib main
 * @param node              node runtime
 * @param frame             frame of buffer indexes to process
 * @param is_feature        choose the next node with vnet_feature_next()
 * @param is_output_feature the ip header is located
 *                          vnet_buffer (b)->ip.save_rewrite_length bytes
 *                          after the current data pointer
 * @param is_custom         choose the next node from
 *                          vnet_buffer (b)->ip.reass.next_index
 * @return number of vectors in @a frame
 */
always_inline uword
ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                     vlib_frame_t * frame, bool is_feature,
                     bool is_output_feature, bool is_custom)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
  ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  vlib_get_buffers (vm, from, bufs, n_left_from);
  u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
  b = bufs;

  /* optimistic case first - no fragments */
  while (n_left_from >= 2)
    {
      vlib_buffer_t *b0, *b1;
      u32 next0, next1;
      b0 = *b;
      b++;
      b1 = *b;
      b++;

      /* Prefetch next iteration. */
      if (PREDICT_TRUE (n_left_from >= 4))
        {
          vlib_buffer_t *p2, *p3;

          p2 = *b;
          p3 = *(b + 1);

          vlib_prefetch_buffer_header (p2, LOAD);
          vlib_prefetch_buffer_header (p3, LOAD);

          clib_prefetch_load (p2->data);
          clib_prefetch_load (p3->data);
        }

      ip4_header_t *ip0 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b0)->
                                     ip.save_rewrite_length);
      ip4_header_t *ip1 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b1),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b1)->
                                     ip.save_rewrite_length);
      // NOTE(review): PREDICT_FALSE covers only the test of the first
      // packet; the result of the whole condition is unaffected
      if (PREDICT_FALSE
          (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
          || (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
        {
          // fragment found, go slow path
          // rewind to the first packet of this pair and flush the packets
          // processed so far; n_left_from has not been decremented for
          // this pair yet
          b -= 2;
          if (b - bufs > 0)
            {
              vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                                           b - bufs);
            }
          goto slow_path;
        }
      if (is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
      else
        {
          next0 = is_custom ? vnet_buffer (b0)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
      if (IP_PROTOCOL_TCP == ip0->protocol)
        {
          vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
            ((tcp_header_t *) (ip0 + 1))->flags;
          vnet_buffer (b0)->ip.reass.tcp_ack_number =
            ((tcp_header_t *) (ip0 + 1))->ack_number;
          vnet_buffer (b0)->ip.reass.tcp_seq_number =
            ((tcp_header_t *) (ip0 + 1))->seq_number;
        }
      else if (IP_PROTOCOL_ICMP == ip0->protocol)
        {
          vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
            ((icmp46_header_t *) (ip0 + 1))->type;
        }
      vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
      vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (vm, node, rm, NULL, from[(b - 2) - bufs],
                                  REASS_PASSTHROUGH,
                                  vnet_buffer (b0)->ip.reass.ip_proto,
                                  vnet_buffer (b0)->ip.reass.l4_src_port,
                                  vnet_buffer (b0)->ip.reass.l4_dst_port);
        }
      if (is_feature)
        {
          vnet_feature_next (&next1, b1);
        }
      else
        {
          next1 = is_custom ? vnet_buffer (b1)->ip.reass.next_index :
            IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
      if (IP_PROTOCOL_TCP == ip1->protocol)
        {
          vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
            ((tcp_header_t *) (ip1 + 1))->flags;
          vnet_buffer (b1)->ip.reass.tcp_ack_number =
            ((tcp_header_t *) (ip1 + 1))->ack_number;
          vnet_buffer (b1)->ip.reass.tcp_seq_number =
            ((tcp_header_t *) (ip1 + 1))->seq_number;
        }
      else if (IP_PROTOCOL_ICMP == ip1->protocol)
        {
          vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
            ((icmp46_header_t *) (ip1 + 1))->type;
        }
      vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
      vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
      if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (vm, node, rm, NULL, from[(b - 1) - bufs],
                                  REASS_PASSTHROUGH,
                                  vnet_buffer (b1)->ip.reass.ip_proto,
                                  vnet_buffer (b1)->ip.reass.l4_src_port,
                                  vnet_buffer (b1)->ip.reass.l4_dst_port);
        }

      n_left_from -= 2;
      next[0] = next0;
      next[1] = next1;
      next += 2;
    }

  /* fast path for the remaining (at most one) packet */
  while (n_left_from > 0)
    {
      vlib_buffer_t *b0;
      u32 next0;
      b0 = *b;
      b++;

      ip4_header_t *ip0 =
        (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                     (is_output_feature ? 1 : 0) *
                                     vnet_buffer (b0)->
                                     ip.save_rewrite_length);
      if (PREDICT_FALSE
          (ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0)))
        {
          // fragment found, go slow path
          b -= 1;
          if (b - bufs > 0)
            {
              vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                                           b - bufs);
            }
          goto slow_path;
        }
      if (is_feature)
        {
          vnet_feature_next (&next0, b0);
        }
      else
        {
          next0 =
            is_custom ? vnet_buffer (b0)->ip.
            reass.next_index : IP4_SV_REASSEMBLY_NEXT_INPUT;
        }
      vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
      vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
      if (IP_PROTOCOL_TCP == ip0->protocol)
        {
          vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
            ((tcp_header_t *) (ip0 + 1))->flags;
          vnet_buffer (b0)->ip.reass.tcp_ack_number =
            ((tcp_header_t *) (ip0 + 1))->ack_number;
          vnet_buffer (b0)->ip.reass.tcp_seq_number =
            ((tcp_header_t *) (ip0 + 1))->seq_number;
        }
      else if (IP_PROTOCOL_ICMP == ip0->protocol)
        {
          vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
            ((icmp46_header_t *) (ip0 + 1))->type;
        }
      vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
      vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
        {
          ip4_sv_reass_add_trace (vm, node, rm, NULL, from[(b - 1) - bufs],
                                  REASS_PASSTHROUGH,
                                  vnet_buffer (b0)->ip.reass.ip_proto,
                                  vnet_buffer (b0)->ip.reass.l4_src_port,
                                  vnet_buffer (b0)->ip.reass.l4_dst_port);
        }

      n_left_from -= 1;
      next[0] = next0;
      next += 1;
    }

  /* no fragment in the whole frame - enqueue everything at once */
  vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
                               frame->n_vectors);

  goto done;

slow_path:

  /* skip the packets already enqueued by the fast path */
  from += b - bufs;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u32 error0 = IP4_ERROR_NONE;

          bi0 = from[0];
          b0 = vlib_get_buffer (vm, bi0);

          ip4_header_t *ip0 =
            (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                         (is_output_feature ? 1 : 0) *
                                         vnet_buffer (b0)->
                                         ip.save_rewrite_length);
          if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
            {
              // this is a regular packet - no fragmentation
              // (for is_feature the next index is overridden at
              // packet_enqueue below)
              if (is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              else
                {
                  next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
              vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
              if (IP_PROTOCOL_TCP == ip0->protocol)
                {
                  vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                    ((tcp_header_t *) (ip0 + 1))->flags;
                  vnet_buffer (b0)->ip.reass.tcp_ack_number =
                    ((tcp_header_t *) (ip0 + 1))->ack_number;
                  vnet_buffer (b0)->ip.reass.tcp_seq_number =
                    ((tcp_header_t *) (ip0 + 1))->seq_number;
                }
              else if (IP_PROTOCOL_ICMP == ip0->protocol)
                {
                  vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                    ((icmp46_header_t *) (ip0 + 1))->type;
                }
              vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
              vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (vm, node, rm, NULL, bi0,
                                          REASS_PASSTHROUGH,
                                          vnet_buffer (b0)->ip.reass.ip_proto,
                                          vnet_buffer (b0)->ip.
                                          reass.l4_src_port,
                                          vnet_buffer (b0)->ip.
                                          reass.l4_dst_port);
                }
              goto packet_enqueue;
            }
          // sanity-check the fragment geometry before touching any state
          const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
          const u32 fragment_length =
            clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
          const u32 fragment_last = fragment_first + fragment_length - 1;
          if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))     // 8 is minimum frag length per RFC 791
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
              b0->error = node->errors[error0];
              goto packet_enqueue;
            }
          ip4_sv_reass_kv_t kv;
          u8 do_handoff = 0;

          // key: fib index | src address, dst address | fragment id | proto
          kv.k.as_u64[0] =
            (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
                           vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
            (u64) ip0->src_address.as_u32 << 32;
          kv.k.as_u64[1] =
            (u64) ip0->dst_address.
            as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

          ip4_sv_reass_t *reass =
            ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);

          if (PREDICT_FALSE (do_handoff))
            {
              // reassembly is owned by another thread - pass the packet on
              next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
              vnet_buffer (b0)->ip.reass.owner_thread_index =
                kv.v.thread_index;
              goto packet_enqueue;
            }

          if (!reass)
            {
              next0 = IP4_SV_REASSEMBLY_NEXT_DROP;
              error0 = IP4_ERROR_REASS_LIMIT_REACHED;
              b0->error = node->errors[error0];
              goto packet_enqueue;
            }

          if (reass->is_complete)
            {
              // l4 information already known - forward without caching
              if (is_custom)
                {
                  next0 = vnet_buffer (b0)->ip.reass.next_index;
                }
              else
                {
                  next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                }
              vnet_buffer (b0)->ip.reass.is_non_first_fragment =
                ! !fragment_first;
              vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
              vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                reass->icmp_type_or_tcp_flags;
              vnet_buffer (b0)->ip.reass.tcp_ack_number =
                reass->tcp_ack_number;
              vnet_buffer (b0)->ip.reass.tcp_seq_number =
                reass->tcp_seq_number;
              vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
              vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
              if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                {
                  ip4_sv_reass_add_trace (vm, node, rm, reass, bi0,
                                          REASS_FRAGMENT_FORWARD,
                                          reass->ip_proto,
                                          reass->l4_src_port,
                                          reass->l4_dst_port);
                }
              goto packet_enqueue;
            }

          ip4_sv_reass_rc_t rc =
            ip4_sv_reass_update (vm, node, rm, rt, ip0, reass, bi0);
          switch (rc)
            {
            case IP4_SV_REASS_RC_OK:
              /* nothing to do here */
              break;
            case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
              vlib_node_increment_counter (vm, node->node_index,
                                           IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                           1);
              ip4_sv_reass_free (vm, rm, rt, reass);
              goto next_packet;
              break;
            case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
              // NOTE(review): this bumps the "fragment chain too long"
              // counter for an unsupported-protocol failure - looks like a
              // copy of the case above; confirm whether a dedicated error
              // counter exists and should be used here
              vlib_node_increment_counter (vm, node->node_index,
                                           IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
                                           1);
              ip4_sv_reass_free (vm, rm, rt, reass);
              goto next_packet;
              break;
            }
          if (reass->is_complete)
            {
              // the first fragment has just arrived - release everything
              // cached so far (including the current buffer)
              u32 idx;
              vec_foreach_index (idx, reass->cached_buffers)
              {
                u32 bi0 = vec_elt (reass->cached_buffers, idx);
                vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
                ip0 =
                  (ip4_header_t *) u8_ptr_add (vlib_buffer_get_current (b0),
                                               (is_output_feature ? 1 : 0) *
                                               vnet_buffer (b0)->
                                               ip.save_rewrite_length);
                u32 next0 = IP4_SV_REASSEMBLY_NEXT_INPUT;
                if (is_feature)
                  {
                    vnet_feature_next (&next0, b0);
                  }
                // is_custom takes precedence over is_feature here
                if (is_custom)
                  {
                    next0 = vnet_buffer (b0)->ip.reass.next_index;
                  }
                if (0 == n_left_to_next)
                  {
                    // current frame is full - get a fresh one
                    vlib_put_next_frame (vm, node, next_index,
                                         n_left_to_next);
                    vlib_get_next_frame (vm, node, next_index, to_next,
                                         n_left_to_next);
                  }
                to_next[0] = bi0;
                to_next += 1;
                n_left_to_next -= 1;
                vnet_buffer (b0)->ip.reass.is_non_first_fragment =
                  ! !ip4_get_fragment_offset (ip0);
                vnet_buffer (b0)->ip.reass.ip_proto = reass->ip_proto;
                vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
                  reass->icmp_type_or_tcp_flags;
                vnet_buffer (b0)->ip.reass.tcp_ack_number =
                  reass->tcp_ack_number;
                vnet_buffer (b0)->ip.reass.tcp_seq_number =
                  reass->tcp_seq_number;
                vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
                vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
                if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
                  {
                    ip4_sv_reass_add_trace (vm, node, rm, reass, bi0,
                                            REASS_FRAGMENT_FORWARD,
                                            reass->ip_proto,
                                            reass->l4_src_port,
                                            reass->l4_dst_port);
                  }
                vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                                 to_next, n_left_to_next, bi0,
                                                 next0);
              }
              _vec_len (reass->cached_buffers) = 0;     // buffers are owned by frame now
            }
          // the current buffer is either cached or was enqueued above
          goto next_packet;

        packet_enqueue:
          to_next[0] = bi0;
          to_next += 1;
          n_left_to_next -= 1;
          // feature arc decides the next node unless the packet is dropped
          if (is_feature && IP4_ERROR_NONE == error0)
            {
              b0 = vlib_get_buffer (vm, bi0);
              vnet_feature_next (&next0, b0);
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);

        next_packet:
          from += 1;
          n_left_from -= 1;
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

done:
  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
888
889 static char *ip4_sv_reass_error_strings[] = {
890 #define _(sym, string) string,
891   foreach_ip4_error
892 #undef _
893 };
894
895 VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
896                                   vlib_node_runtime_t * node,
897                                   vlib_frame_t * frame)
898 {
899   return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
900                               false /* is_output_feature */ ,
901                               false /* is_custom */ );
902 }
903
904 /* *INDENT-OFF* */
905 VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
906     .name = "ip4-sv-reassembly",
907     .vector_size = sizeof (u32),
908     .format_trace = format_ip4_sv_reass_trace,
909     .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
910     .error_strings = ip4_sv_reass_error_strings,
911     .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
912     .next_nodes =
913         {
914                 [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
915                 [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
916                 [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",
917
918         },
919 };
920 /* *INDENT-ON* */
921
922 VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
923                                           vlib_node_runtime_t * node,
924                                           vlib_frame_t * frame)
925 {
926   return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
927                               false /* is_output_feature */ ,
928                               false /* is_custom */ );
929 }
930
931 /* *INDENT-OFF* */
932 VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
933     .name = "ip4-sv-reassembly-feature",
934     .vector_size = sizeof (u32),
935     .format_trace = format_ip4_sv_reass_trace,
936     .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
937     .error_strings = ip4_sv_reass_error_strings,
938     .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
939     .next_nodes =
940         {
941                 [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
942                 [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
943                 [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
944         },
945 };
946 /* *INDENT-ON* */
947
948 /* *INDENT-OFF* */
949 VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
950     .arc_name = "ip4-unicast",
951     .node_name = "ip4-sv-reassembly-feature",
952     .runs_before = VNET_FEATURES ("ip4-lookup"),
953     .runs_after = 0,
954 };
955 /* *INDENT-ON* */
956
957 VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
958                                                  vlib_node_runtime_t * node,
959                                                  vlib_frame_t * frame)
960 {
961   return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
962                               true /* is_output_feature */ ,
963                               false /* is_custom */ );
964 }
965
966
967 /* *INDENT-OFF* */
968 VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
969     .name = "ip4-sv-reassembly-output-feature",
970     .vector_size = sizeof (u32),
971     .format_trace = format_ip4_sv_reass_trace,
972     .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
973     .error_strings = ip4_sv_reass_error_strings,
974     .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
975     .next_nodes =
976         {
977                 [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
978                 [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
979                 [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
980         },
981 };
982 /* *INDENT-ON* */
983
984 /* *INDENT-OFF* */
985 VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
986     .arc_name = "ip4-output",
987     .node_name = "ip4-sv-reassembly-output-feature",
988     .runs_before = 0,
989     .runs_after = 0,
990 };
991 /* *INDENT-ON* */
992
993 /* *INDENT-OFF* */
994 VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
995     .name = "ip4-sv-reassembly-custom-next",
996     .vector_size = sizeof (u32),
997     .format_trace = format_ip4_sv_reass_trace,
998     .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
999     .error_strings = ip4_sv_reass_error_strings,
1000     .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
1001     .next_nodes =
1002         {
1003                 [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
1004                 [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
1005                 [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-handoff",
1006
1007         },
1008 };
1009 /* *INDENT-ON* */
1010
1011 VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
1012                                          vlib_node_runtime_t * node,
1013                                          vlib_frame_t * frame)
1014 {
1015   return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
1016                               false /* is_output_feature */ ,
1017                               true /* is_custom */ );
1018 }
1019
1020 #ifndef CLIB_MARCH_VARIANT
1021 always_inline u32
1022 ip4_sv_reass_get_nbuckets ()
1023 {
1024   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1025   u32 nbuckets;
1026   u8 i;
1027
1028   nbuckets = (u32) (rm->max_reass_n / IP4_SV_REASS_HT_LOAD_FACTOR);
1029
1030   for (i = 0; i < 31; i++)
1031     if ((1 << i) >= nbuckets)
1032       break;
1033   nbuckets = 1 << i;
1034
1035   return nbuckets;
1036 }
1037 #endif /* CLIB_MARCH_VARIANT */
1038
/* Events signalled to the expire-walk process node. */
typedef enum
{
  /* sent by ip4_sv_reass_set () after the parameters were updated */
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_sv_reass_event_t;
1043
1044 typedef struct
1045 {
1046   int failure;
1047   clib_bihash_16_8_t *new_hash;
1048 } ip4_rehash_cb_ctx;
1049
1050 #ifndef CLIB_MARCH_VARIANT
1051 static int
1052 ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1053 {
1054   ip4_rehash_cb_ctx *ctx = _ctx;
1055   if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1056     {
1057       ctx->failure = 1;
1058     }
1059   return (BIHASH_WALK_CONTINUE);
1060 }
1061
1062 static void
1063 ip4_sv_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1064                          u32 max_reassembly_length,
1065                          u32 expire_walk_interval_ms)
1066 {
1067   ip4_sv_reass_main.timeout_ms = timeout_ms;
1068   ip4_sv_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1069   ip4_sv_reass_main.max_reass_n = max_reassemblies;
1070   ip4_sv_reass_main.max_reass_len = max_reassembly_length;
1071   ip4_sv_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1072 }
1073
1074 vnet_api_error_t
1075 ip4_sv_reass_set (u32 timeout_ms, u32 max_reassemblies,
1076                   u32 max_reassembly_length, u32 expire_walk_interval_ms)
1077 {
1078   u32 old_nbuckets = ip4_sv_reass_get_nbuckets ();
1079   ip4_sv_reass_set_params (timeout_ms, max_reassemblies,
1080                            max_reassembly_length, expire_walk_interval_ms);
1081   vlib_process_signal_event (ip4_sv_reass_main.vlib_main,
1082                              ip4_sv_reass_main.ip4_sv_reass_expire_node_idx,
1083                              IP4_EVENT_CONFIG_CHANGED, 0);
1084   u32 new_nbuckets = ip4_sv_reass_get_nbuckets ();
1085   if (ip4_sv_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
1086     {
1087       clib_bihash_16_8_t new_hash;
1088       clib_memset (&new_hash, 0, sizeof (new_hash));
1089       ip4_rehash_cb_ctx ctx;
1090       ctx.failure = 0;
1091       ctx.new_hash = &new_hash;
1092       clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
1093                              new_nbuckets * 1024);
1094       clib_bihash_foreach_key_value_pair_16_8 (&ip4_sv_reass_main.hash,
1095                                                ip4_rehash_cb, &ctx);
1096       if (ctx.failure)
1097         {
1098           clib_bihash_free_16_8 (&new_hash);
1099           return -1;
1100         }
1101       else
1102         {
1103           clib_bihash_free_16_8 (&ip4_sv_reass_main.hash);
1104           clib_memcpy_fast (&ip4_sv_reass_main.hash, &new_hash,
1105                             sizeof (ip4_sv_reass_main.hash));
1106           clib_bihash_copied (&ip4_sv_reass_main.hash, &new_hash);
1107         }
1108     }
1109   return 0;
1110 }
1111
1112 vnet_api_error_t
1113 ip4_sv_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1114                   u32 * max_reassembly_length, u32 * expire_walk_interval_ms)
1115 {
1116   *timeout_ms = ip4_sv_reass_main.timeout_ms;
1117   *max_reassemblies = ip4_sv_reass_main.max_reass_n;
1118   *max_reassembly_length = ip4_sv_reass_main.max_reass_len;
1119   *expire_walk_interval_ms = ip4_sv_reass_main.expire_walk_interval_ms;
1120   return 0;
1121 }
1122
1123 static clib_error_t *
1124 ip4_sv_reass_init_function (vlib_main_t * vm)
1125 {
1126   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1127   clib_error_t *error = 0;
1128   u32 nbuckets;
1129   vlib_node_t *node;
1130
1131   rm->vlib_main = vm;
1132   rm->vnet_main = vnet_get_main ();
1133
1134   vec_validate (rm->per_thread_data, vlib_num_workers ());
1135   ip4_sv_reass_per_thread_t *rt;
1136   vec_foreach (rt, rm->per_thread_data)
1137   {
1138     clib_spinlock_init (&rt->lock);
1139     pool_alloc (rt->pool, rm->max_reass_n);
1140     rt->lru_first = rt->lru_last = ~0;
1141   }
1142
1143   node = vlib_get_node_by_name (vm, (u8 *) "ip4-sv-reassembly-expire-walk");
1144   ASSERT (node);
1145   rm->ip4_sv_reass_expire_node_idx = node->index;
1146
1147   ip4_sv_reass_set_params (IP4_SV_REASS_TIMEOUT_DEFAULT_MS,
1148                            IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT,
1149                            IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
1150                            IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1151
1152   nbuckets = ip4_sv_reass_get_nbuckets ();
1153   clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);
1154
1155   node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
1156   ASSERT (node);
1157   rm->ip4_drop_idx = node->index;
1158
1159   rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
1160   rm->fq_feature_index =
1161     vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);
1162
1163   rm->feature_use_refcount_per_intf = NULL;
1164   rm->output_feature_use_refcount_per_intf = NULL;
1165
1166   return error;
1167 }
1168
1169 VLIB_INIT_FUNCTION (ip4_sv_reass_init_function);
1170 #endif /* CLIB_MARCH_VARIANT */
1171
1172 static uword
1173 ip4_sv_reass_walk_expired (vlib_main_t * vm,
1174                            vlib_node_runtime_t * node, vlib_frame_t * f)
1175 {
1176   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1177   uword event_type, *event_data = 0;
1178
1179   while (true)
1180     {
1181       vlib_process_wait_for_event_or_clock (vm,
1182                                             (f64)
1183                                             rm->expire_walk_interval_ms /
1184                                             (f64) MSEC_PER_SEC);
1185       event_type = vlib_process_get_events (vm, &event_data);
1186
1187       switch (event_type)
1188         {
1189         case ~0:                /* no events => timeout */
1190           /* nothing to do here */
1191           break;
1192         case IP4_EVENT_CONFIG_CHANGED:
1193           break;
1194         default:
1195           clib_warning ("BUG: event type 0x%wx", event_type);
1196           break;
1197         }
1198       f64 now = vlib_time_now (vm);
1199
1200       ip4_sv_reass_t *reass;
1201       int *pool_indexes_to_free = NULL;
1202
1203       uword thread_index = 0;
1204       int index;
1205       const uword nthreads = vlib_num_workers () + 1;
1206       for (thread_index = 0; thread_index < nthreads; ++thread_index)
1207         {
1208           ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1209           clib_spinlock_lock (&rt->lock);
1210
1211           vec_reset_length (pool_indexes_to_free);
1212           /* *INDENT-OFF* */
1213           pool_foreach_index (index, rt->pool)  {
1214                                 reass = pool_elt_at_index (rt->pool, index);
1215                                 if (now > reass->last_heard + rm->timeout)
1216                                   {
1217                                     vec_add1 (pool_indexes_to_free, index);
1218                                   }
1219                               }
1220           /* *INDENT-ON* */
1221           int *i;
1222           /* *INDENT-OFF* */
1223           vec_foreach (i, pool_indexes_to_free)
1224           {
1225             ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
1226             ip4_sv_reass_free (vm, rm, rt, reass);
1227           }
1228           /* *INDENT-ON* */
1229
1230           clib_spinlock_unlock (&rt->lock);
1231         }
1232
1233       vec_free (pool_indexes_to_free);
1234       if (event_data)
1235         {
1236           _vec_len (event_data) = 0;
1237         }
1238     }
1239
1240   return 0;
1241 }
1242
1243 /* *INDENT-OFF* */
1244 VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
1245     .function = ip4_sv_reass_walk_expired,
1246     .type = VLIB_NODE_TYPE_PROCESS,
1247     .name = "ip4-sv-reassembly-expire-walk",
1248     .format_trace = format_ip4_sv_reass_trace,
1249     .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
1250     .error_strings = ip4_sv_reass_error_strings,
1251
1252 };
1253 /* *INDENT-ON* */
1254
1255 static u8 *
1256 format_ip4_sv_reass_key (u8 * s, va_list * args)
1257 {
1258   ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
1259   s =
1260     format (s,
1261             "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1262             key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1263             &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1264   return s;
1265 }
1266
1267 static u8 *
1268 format_ip4_sv_reass (u8 * s, va_list * args)
1269 {
1270   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1271   ip4_sv_reass_t *reass = va_arg (*args, ip4_sv_reass_t *);
1272
1273   s = format (s, "ID: %lu, key: %U trace_op_counter: %u\n",
1274               reass->id, format_ip4_sv_reass_key, &reass->key,
1275               reass->trace_op_counter);
1276
1277   vlib_buffer_t *b;
1278   u32 *bip;
1279   u32 counter = 0;
1280   vec_foreach (bip, reass->cached_buffers)
1281   {
1282     u32 bi = *bip;
1283     do
1284       {
1285         b = vlib_get_buffer (vm, bi);
1286         s = format (s, "  #%03u: bi: %u, ", counter, bi);
1287         ++counter;
1288         bi = b->next_buffer;
1289       }
1290     while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
1291   }
1292   return s;
1293 }
1294
1295 static clib_error_t *
1296 show_ip4_reass (vlib_main_t * vm,
1297                 unformat_input_t * input,
1298                 CLIB_UNUSED (vlib_cli_command_t * lmd))
1299 {
1300   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1301
1302   vlib_cli_output (vm, "---------------------");
1303   vlib_cli_output (vm, "IP4 reassembly status");
1304   vlib_cli_output (vm, "---------------------");
1305   bool details = false;
1306   if (unformat (input, "details"))
1307     {
1308       details = true;
1309     }
1310
1311   u32 sum_reass_n = 0;
1312   ip4_sv_reass_t *reass;
1313   uword thread_index;
1314   const uword nthreads = vlib_num_workers () + 1;
1315   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1316     {
1317       ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1318       clib_spinlock_lock (&rt->lock);
1319       if (details)
1320         {
1321           /* *INDENT-OFF* */
1322           pool_foreach (reass, rt->pool) {
1323             vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
1324           }
1325           /* *INDENT-ON* */
1326         }
1327       sum_reass_n += rt->reass_n;
1328       clib_spinlock_unlock (&rt->lock);
1329     }
1330   vlib_cli_output (vm, "---------------------");
1331   vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
1332                    (long unsigned) sum_reass_n);
1333   vlib_cli_output (vm,
1334                    "Maximum configured concurrent shallow virtual IP4 reassemblies per worker-thread: %lu\n",
1335                    (long unsigned) rm->max_reass_n);
1336   vlib_cli_output (vm,
1337                    "Maximum configured amount of fragments per shallow "
1338                    "virtual IP4 reassembly: %lu\n",
1339                    (long unsigned) rm->max_reass_len);
1340   vlib_cli_output (vm,
1341                    "Maximum configured shallow virtual IP4 reassembly timeout: %lums\n",
1342                    (long unsigned) rm->timeout_ms);
1343   vlib_cli_output (vm,
1344                    "Maximum configured shallow virtual IP4 reassembly expire walk interval: %lums\n",
1345                    (long unsigned) rm->expire_walk_interval_ms);
1346   return 0;
1347 }
1348
1349 /* *INDENT-OFF* */
1350 VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
1351     .path = "show ip4-sv-reassembly",
1352     .short_help = "show ip4-sv-reassembly [details]",
1353     .function = show_ip4_reass,
1354 };
1355 /* *INDENT-ON* */
1356
1357 #ifndef CLIB_MARCH_VARIANT
1358 vnet_api_error_t
1359 ip4_sv_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1360 {
1361   return ip4_sv_reass_enable_disable_with_refcnt (sw_if_index,
1362                                                   enable_disable);
1363 }
1364 #endif /* CLIB_MARCH_VARIANT */
1365
1366
/* List of handoff node errors as (symbol, counter description) pairs. */
#define foreach_ip4_sv_reass_handoff_error \
  _(CONGESTION_DROP, "congestion drop")
1369
1370
1371 typedef enum
1372 {
1373 #define _(sym,str) IP4_SV_REASSEMBLY_HANDOFF_ERROR_##sym,
1374   foreach_ip4_sv_reass_handoff_error
1375 #undef _
1376     IP4_SV_REASSEMBLY_HANDOFF_N_ERROR,
1377 } ip4_sv_reass_handoff_error_t;
1378
1379 static char *ip4_sv_reass_handoff_error_strings[] = {
1380 #define _(sym,string) string,
1381   foreach_ip4_sv_reass_handoff_error
1382 #undef _
1383 };
1384
1385 typedef struct
1386 {
1387   u32 next_worker_index;
1388 } ip4_sv_reass_handoff_trace_t;
1389
1390 static u8 *
1391 format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
1392 {
1393   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1394   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1395   ip4_sv_reass_handoff_trace_t *t =
1396     va_arg (*args, ip4_sv_reass_handoff_trace_t *);
1397
1398   s =
1399     format (s, "ip4-sv-reassembly-handoff: next-worker %d",
1400             t->next_worker_index);
1401
1402   return s;
1403 }
1404
1405 always_inline uword
1406 ip4_sv_reass_handoff_node_inline (vlib_main_t * vm,
1407                                   vlib_node_runtime_t * node,
1408                                   vlib_frame_t * frame, bool is_feature)
1409 {
1410   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1411
1412   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1413   u32 n_enq, n_left_from, *from;
1414   u16 thread_indices[VLIB_FRAME_SIZE], *ti;
1415   u32 fq_index;
1416
1417   from = vlib_frame_vector_args (frame);
1418   n_left_from = frame->n_vectors;
1419   vlib_get_buffers (vm, from, bufs, n_left_from);
1420
1421   b = bufs;
1422   ti = thread_indices;
1423
1424   fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
1425
1426   while (n_left_from > 0)
1427     {
1428       ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
1429
1430       if (PREDICT_FALSE
1431           ((node->flags & VLIB_NODE_FLAG_TRACE)
1432            && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
1433         {
1434           ip4_sv_reass_handoff_trace_t *t =
1435             vlib_add_trace (vm, node, b[0], sizeof (*t));
1436           t->next_worker_index = ti[0];
1437         }
1438
1439       n_left_from -= 1;
1440       ti += 1;
1441       b += 1;
1442     }
1443   n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
1444                                          thread_indices, frame->n_vectors, 1);
1445
1446   if (n_enq < frame->n_vectors)
1447     vlib_node_increment_counter (vm, node->node_index,
1448                                  IP4_SV_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
1449                                  frame->n_vectors - n_enq);
1450   return frame->n_vectors;
1451 }
1452
1453 VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
1454                                           vlib_node_runtime_t * node,
1455                                           vlib_frame_t * frame)
1456 {
1457   return ip4_sv_reass_handoff_node_inline (vm, node, frame,
1458                                            false /* is_feature */ );
1459 }
1460
1461
1462 /* *INDENT-OFF* */
1463 VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
1464   .name = "ip4-sv-reassembly-handoff",
1465   .vector_size = sizeof (u32),
1466   .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
1467   .error_strings = ip4_sv_reass_handoff_error_strings,
1468   .format_trace = format_ip4_sv_reass_handoff_trace,
1469
1470   .n_next_nodes = 1,
1471
1472   .next_nodes = {
1473     [0] = "error-drop",
1474   },
1475 };
1476 /* *INDENT-ON* */
1477
1478
1479 /* *INDENT-OFF* */
1480 VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
1481                                                     vlib_node_runtime_t *
1482                                                     node,
1483                                                     vlib_frame_t * frame)
1484 {
1485   return ip4_sv_reass_handoff_node_inline (vm, node, frame,
1486                                              true /* is_feature */ );
1487 }
1488 /* *INDENT-ON* */
1489
1490
1491 /* *INDENT-OFF* */
1492 VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
1493   .name = "ip4-sv-reass-feature-hoff",
1494   .vector_size = sizeof (u32),
1495   .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
1496   .error_strings = ip4_sv_reass_handoff_error_strings,
1497   .format_trace = format_ip4_sv_reass_handoff_trace,
1498
1499   .n_next_nodes = 1,
1500
1501   .next_nodes = {
1502     [0] = "error-drop",
1503   },
1504 };
1505 /* *INDENT-ON* */
1506
1507 #ifndef CLIB_MARCH_VARIANT
1508 int
1509 ip4_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1510 {
1511   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1512   vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1513   if (is_enable)
1514     {
1515       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1516         {
1517           ++rm->feature_use_refcount_per_intf[sw_if_index];
1518           return vnet_feature_enable_disable ("ip4-unicast",
1519                                               "ip4-sv-reassembly-feature",
1520                                               sw_if_index, 1, 0, 0);
1521         }
1522       ++rm->feature_use_refcount_per_intf[sw_if_index];
1523     }
1524   else
1525     {
1526       if (rm->feature_use_refcount_per_intf[sw_if_index])
1527         --rm->feature_use_refcount_per_intf[sw_if_index];
1528       if (!rm->feature_use_refcount_per_intf[sw_if_index])
1529         return vnet_feature_enable_disable ("ip4-unicast",
1530                                             "ip4-sv-reassembly-feature",
1531                                             sw_if_index, 0, 0, 0);
1532     }
1533   return 0;
1534 }
1535
1536 uword
1537 ip4_sv_reass_custom_register_next_node (uword node_index)
1538 {
1539   return vlib_node_add_next (vlib_get_main (), ip4_sv_reass_custom_node.index,
1540                              node_index);
1541 }
1542
1543 int
1544 ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
1545                                                 int is_enable)
1546 {
1547   ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
1548   vec_validate (rm->output_feature_use_refcount_per_intf, sw_if_index);
1549   if (is_enable)
1550     {
1551       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1552         {
1553           ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1554           return vnet_feature_enable_disable ("ip4-output",
1555                                               "ip4-sv-reassembly-output-feature",
1556                                               sw_if_index, 1, 0, 0);
1557         }
1558       ++rm->output_feature_use_refcount_per_intf[sw_if_index];
1559     }
1560   else
1561     {
1562       if (rm->output_feature_use_refcount_per_intf[sw_if_index])
1563         --rm->output_feature_use_refcount_per_intf[sw_if_index];
1564       if (!rm->output_feature_use_refcount_per_intf[sw_if_index])
1565         return vnet_feature_enable_disable ("ip4-output",
1566                                             "ip4-sv-reassembly-output-feature",
1567                                             sw_if_index, 0, 0, 0);
1568     }
1569   return 0;
1570 }
1571 #endif
1572
1573 /*
1574  * fd.io coding-style-patch-verification: ON
1575  *
1576  * Local Variables:
1577  * eval: (c-set-style "gnu")
1578  * End:
1579  */