[vpp.git] / src / vnet / ip / ip6_reassembly.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15
16 /**
17  * @file
18  * @brief IPv6 Reassembly.
19  *
20  * This file contains the source code for IPv6 reassembly.
21  */
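
/*
 * Implementation overview (summary of the code below):
 *
 * - Reassembly contexts live in per-thread pools (ip6_reass_per_thread_t)
 *   protected by spinlocks and are indexed by a clib_bihash_48_8_t keyed on
 *   source address, destination address, FIB index + fragment identification
 *   and protocol (see ip6_reass_kv_t).
 * - Each accepted fragment is kept as a range; ranges are linked into a
 *   sorted chain via the vnet_buffer opaque fields (next_range_bi,
 *   range_first/range_last, fragment_first/fragment_last).
 * - Once all octets up to the last fragment are present, the chain is
 *   trimmed, linearized and sent to the next node (ip6-input by default).
 * - A process node ("ip6-reassembly-expire-walk") periodically frees timed
 *   out reassemblies and generates ICMPv6 time-exceeded errors.
 * - Fragments arriving on a thread that does not own the context are handed
 *   off to the owning thread.
 */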
22
23 #include <vppinfra/vec.h>
24 #include <vnet/vnet.h>
25 #include <vnet/ip/ip.h>
26 #include <vppinfra/bihash_48_8.h>
27 #include <vnet/ip/ip6_reassembly.h>
28
29 #define MSEC_PER_SEC 1000
30 #define IP6_REASS_TIMEOUT_DEFAULT_MS 100
31 #define IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
32 #define IP6_REASS_MAX_REASSEMBLIES_DEFAULT 1024
33 #define IP6_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
34 #define IP6_REASS_HT_LOAD_FACTOR (0.75)
35
36 typedef enum
37 {
38   IP6_REASS_RC_OK,
39   IP6_REASS_RC_INTERNAL_ERROR,
40   IP6_REASS_RC_TOO_MANY_FRAGMENTS,
41   IP6_REASS_RC_NO_BUF,
42   IP6_REASS_RC_HANDOFF,
43 } ip6_reass_rc_t;
44
45 typedef struct
46 {
47   union
48   {
49     struct
50     {
51       ip6_address_t src;
52       ip6_address_t dst;
53       u32 xx_id;
54       u32 frag_id;
55       u8 unused[7];
56       u8 proto;
57     };
58     u64 as_u64[6];
59   };
60 } ip6_reass_key_t;
61
62 typedef union
63 {
64   struct
65   {
66     u32 reass_index;
67     u32 memory_owner_thread_index;
68   };
69   u64 as_u64;
70 } ip6_reass_val_t;
71
72 typedef union
73 {
74   struct
75   {
76     ip6_reass_key_t k;
77     ip6_reass_val_t v;
78   };
79   clib_bihash_kv_48_8_t kv;
80 } ip6_reass_kv_t;
81
82
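/*
 * Helpers describing how much of a fragment's data is actually used: the
 * buffer opaque metadata records both the fragment's full byte range
 * (fragment_first/fragment_last) and the sub-range accepted into the
 * reassembly (range_first/range_last).
 */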
83 always_inline u32
84 ip6_reass_buffer_get_data_offset (vlib_buffer_t * b)
85 {
86   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
87   return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
88 }
89
90 always_inline u16
91 ip6_reass_buffer_get_data_len (vlib_buffer_t * b)
92 {
93   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
94   return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
95     (vnb->ip.reass.fragment_first + ip6_reass_buffer_get_data_offset (b)) + 1;
96 }
97
98 typedef struct
99 {
100   // hash table key
101   ip6_reass_key_t key;
102   // time when last packet was received
103   f64 last_heard;
104   // internal id of this reassembly
105   u64 id;
106   // buffer index of first buffer in this reassembly context
107   u32 first_bi;
108   // last octet of packet, ~0 until fragment without more_fragments arrives
109   u32 last_packet_octet;
110   // length of data collected so far
111   u32 data_len;
112   // trace operation counter
113   u32 trace_op_counter;
114   // next index - used by custom apps (~0 if not set)
115   u32 next_index;
116   // error next index - used by custom apps (~0 if not set)
117   u32 error_next_index;
118   // minimum fragment length for this reassembly - used to estimate MTU
119   u16 min_fragment_length;
120   // number of fragments for this reassembly
121   u32 fragments_n;
122   // thread owning memory for this context (whose pool contains this ctx)
123   u32 memory_owner_thread_index;
124   // thread which received fragment with offset 0 and which sends out the
125   // completed reassembly
126   u32 sendout_thread_index;
127 } ip6_reass_t;
128
129 typedef struct
130 {
131   ip6_reass_t *pool;
132   u32 reass_n;
133   u32 id_counter;
134   clib_spinlock_t lock;
135 } ip6_reass_per_thread_t;
136
137 typedef struct
138 {
139   // IPv6 config
140   u32 timeout_ms;
141   f64 timeout;
142   u32 expire_walk_interval_ms;
143   // maximum number of fragments in one reassembly
144   u32 max_reass_len;
145   // maximum number of reassemblies
146   u32 max_reass_n;
147
148   // IPv6 runtime
149   clib_bihash_48_8_t hash;
150
151   // per-thread data
152   ip6_reass_per_thread_t *per_thread_data;
153
154   // convenience
155   vlib_main_t *vlib_main;
156
157   // node index of ip6-drop node
158   u32 ip6_drop_idx;
159   u32 ip6_icmp_error_idx;
160   u32 ip6_reass_expire_node_idx;
161
162   /** Worker handoff */
163   u32 fq_index;
164   u32 fq_feature_index;
165
166 } ip6_reass_main_t;
167
168 extern ip6_reass_main_t ip6_reass_main;
169
170 #ifndef CLIB_MARCH_VARIANT
171 ip6_reass_main_t ip6_reass_main;
172 #endif /* CLIB_MARCH_VARIANT */
173
174 typedef enum
175 {
176   IP6_REASSEMBLY_NEXT_INPUT,
177   IP6_REASSEMBLY_NEXT_DROP,
178   IP6_REASSEMBLY_NEXT_ICMP_ERROR,
179   IP6_REASSEMBLY_NEXT_HANDOFF,
180   IP6_REASSEMBLY_N_NEXT,
181 } ip6_reass_next_t;
182
183 typedef enum
184 {
185   RANGE_NEW,
186   RANGE_OVERLAP,
187   ICMP_ERROR_RT_EXCEEDED,
188   ICMP_ERROR_FL_TOO_BIG,
189   ICMP_ERROR_FL_NOT_MULT_8,
190   FINALIZE,
191   HANDOFF,
192 } ip6_reass_trace_operation_e;
193
194 typedef struct
195 {
196   u16 range_first;
197   u16 range_last;
198   u32 range_bi;
199   i32 data_offset;
200   u32 data_len;
201   u32 first_bi;
202 } ip6_reass_range_trace_t;
203
204 typedef struct
205 {
206   ip6_reass_trace_operation_e action;
207   u32 reass_id;
208   ip6_reass_range_trace_t trace_range;
209   u32 op_id;
210   u32 fragment_first;
211   u32 fragment_last;
212   u32 total_data_len;
213   u32 thread_id;
214   u32 thread_id_to;
215 } ip6_reass_trace_t;
216
217 static void
218 ip6_reass_trace_details (vlib_main_t * vm, u32 bi,
219                          ip6_reass_range_trace_t * trace)
220 {
221   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
222   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
223   trace->range_first = vnb->ip.reass.range_first;
224   trace->range_last = vnb->ip.reass.range_last;
225   trace->data_offset = ip6_reass_buffer_get_data_offset (b);
226   trace->data_len = ip6_reass_buffer_get_data_len (b);
227   trace->range_bi = bi;
228 }
229
230 static u8 *
231 format_ip6_reass_range_trace (u8 * s, va_list * args)
232 {
233   ip6_reass_range_trace_t *trace = va_arg (*args, ip6_reass_range_trace_t *);
234   s = format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
235               trace->range_last, trace->data_offset, trace->data_len,
236               trace->range_bi);
237   return s;
238 }
239
240 static u8 *
241 format_ip6_reass_trace (u8 * s, va_list * args)
242 {
243   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
244   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
245   ip6_reass_trace_t *t = va_arg (*args, ip6_reass_trace_t *);
246   u32 indent = 0;
247   if (~0 != t->reass_id)
248     {
249       s = format (s, "reass id: %u, op id: %u ", t->reass_id, t->op_id);
250       indent = format_get_indent (s);
251       s = format (s, "first bi: %u, data len: %u, ip/fragment[%u, %u]",
252                   t->trace_range.first_bi, t->total_data_len,
253                   t->fragment_first, t->fragment_last);
254     }
255   switch (t->action)
256     {
257     case RANGE_NEW:
258       s = format (s, "\n%Unew %U", format_white_space, indent,
259                   format_ip6_reass_range_trace, &t->trace_range);
260       break;
261     case RANGE_OVERLAP:
262       s = format (s, "\n%Uoverlap %U", format_white_space, indent,
263                   format_ip6_reass_range_trace, &t->trace_range);
264       break;
265     case ICMP_ERROR_FL_TOO_BIG:
266       s = format (s, "\n%Uicmp-error - frag_len > 65535 %U",
267                   format_white_space, indent, format_ip6_reass_range_trace,
268                   &t->trace_range);
269       break;
270     case ICMP_ERROR_FL_NOT_MULT_8:
271       s = format (s, "\n%Uicmp-error - frag_len mod 8 != 0 %U",
272                   format_white_space, indent, format_ip6_reass_range_trace,
273                   &t->trace_range);
274       break;
275     case ICMP_ERROR_RT_EXCEEDED:
276       s = format (s, "\n%Uicmp-error - reassembly time exceeded",
277                   format_white_space, indent);
278       break;
279     case FINALIZE:
280       s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
281       break;
282     case HANDOFF:
283       s =
284         format (s, "handoff from thread #%u to thread #%u", t->thread_id,
285                 t->thread_id_to);
286       break;
287     }
288   return s;
289 }
290
291 static void
292 ip6_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
293                      ip6_reass_main_t * rm, u32 reass_id, u32 op_id,
294                      u32 bi, u32 first_bi, u32 data_len,
295                      ip6_reass_trace_operation_e action, u32 thread_id_to)
296 {
297   vlib_buffer_t *b = vlib_get_buffer (vm, bi);
298   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
299   ip6_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
300   t->reass_id = reass_id;
301   t->action = action;
302   ip6_reass_trace_details (vm, bi, &t->trace_range);
303   t->op_id = op_id;
304   t->thread_id = vm->thread_index;
305   t->thread_id_to = thread_id_to;
306   t->fragment_first = vnb->ip.reass.fragment_first;
307   t->fragment_last = vnb->ip.reass.fragment_last;
308   t->trace_range.first_bi = first_bi;
309   t->total_data_len = data_len;
310 #if 0
311   static u8 *s = NULL;
312   s = format (s, "%U", format_ip6_reass_trace, NULL, NULL, t);
313   printf ("%.*s\n", vec_len (s), s);
314   fflush (stdout);
315   vec_reset_length (s);
316 #endif
317 }
318
319 always_inline void
320 ip6_reass_free_ctx (ip6_reass_per_thread_t * rt, ip6_reass_t * reass)
321 {
322   pool_put (rt->pool, reass);
323   --rt->reass_n;
324 }
325
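/*
 * Remove the reassembly's bihash entry and return the context to the owning
 * thread's pool.
 */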
326 always_inline void
327 ip6_reass_free (ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
328                 ip6_reass_t * reass)
329 {
330   clib_bihash_kv_48_8_t kv;
331   kv.key[0] = reass->key.as_u64[0];
332   kv.key[1] = reass->key.as_u64[1];
333   kv.key[2] = reass->key.as_u64[2];
334   kv.key[3] = reass->key.as_u64[3];
335   kv.key[4] = reass->key.as_u64[4];
336   kv.key[5] = reass->key.as_u64[5];
337   clib_bihash_add_del_48_8 (&rm->hash, &kv, 0);
338   ip6_reass_free_ctx (rt, reass);
339 }
340
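/*
 * Walk every buffer chain linked through next_range_bi, collect all buffer
 * indices and either enqueue them to the custom error next node (if one was
 * configured) or free them.
 */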
341 always_inline void
342 ip6_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
343                     ip6_reass_main_t * rm, ip6_reass_t * reass)
344 {
345   u32 range_bi = reass->first_bi;
346   vlib_buffer_t *range_b;
347   vnet_buffer_opaque_t *range_vnb;
348   u32 *to_free = NULL;
349   while (~0 != range_bi)
350     {
351       range_b = vlib_get_buffer (vm, range_bi);
352       range_vnb = vnet_buffer (range_b);
353       u32 bi = range_bi;
354       while (~0 != bi)
355         {
356           vec_add1 (to_free, bi);
357           vlib_buffer_t *b = vlib_get_buffer (vm, bi);
358           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
359             {
360               bi = b->next_buffer;
361               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
362             }
363           else
364             {
365               bi = ~0;
366             }
367         }
368       range_bi = range_vnb->ip.reass.next_range_bi;
369     }
370   /* send buffers to the configured error_next_index */
371   if (~0 != reass->error_next_index)
372     {
373       u32 n_left_to_next, *to_next, next_index;
374
375       next_index = reass->error_next_index;
376       u32 bi = ~0;
377
378       while (vec_len (to_free) > 0)
379         {
380           vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
381
382           while (vec_len (to_free) > 0 && n_left_to_next > 0)
383             {
384               bi = vec_pop (to_free);
385
386               if (~0 != bi)
387                 {
388                   to_next[0] = bi;
389                   to_next += 1;
390                   n_left_to_next -= 1;
391                 }
392             }
393           vlib_put_next_frame (vm, node, next_index, n_left_to_next);
394         }
395     }
396   else
397     {
398       vlib_buffer_free (vm, to_free, vec_len (to_free));
399     }
400   vec_free (to_free);
401 }
402
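/*
 * Handle a timed-out reassembly: unless a custom app next_index is set, and
 * if the fragment with offset zero was received, detach its head buffer and
 * set it up as an ICMPv6 time-exceeded (reassembly time exceeded) error via
 * *icmp_bi, then drop all remaining fragments.
 */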
403 always_inline void
404 ip6_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node,
405                       ip6_reass_main_t * rm, ip6_reass_t * reass,
406                       u32 * icmp_bi)
407 {
408   if (~0 == reass->first_bi)
409     {
410       return;
411     }
412   if (~0 == reass->next_index)  // custom apps don't want icmp
413     {
414       vlib_buffer_t *b = vlib_get_buffer (vm, reass->first_bi);
415       if (0 == vnet_buffer (b)->ip.reass.fragment_first)
416         {
417           *icmp_bi = reass->first_bi;
418           if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
419             {
420               ip6_reass_add_trace (vm, node, rm, reass->id,
421                                    reass->trace_op_counter, reass->first_bi,
422                                    reass->first_bi, reass->data_len,
423                                    ICMP_ERROR_RT_EXCEEDED, ~0);
424               ++reass->trace_op_counter;
425             }
426           // fragment with offset zero received - send icmp message back
427           if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
428             {
429               // separate first buffer from chain and steer it towards icmp node
430               b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
431               reass->first_bi = b->next_buffer;
432             }
433           else
434             {
435               reass->first_bi = vnet_buffer (b)->ip.reass.next_range_bi;
436             }
437           icmp6_error_set_vnet_buffer (b, ICMP6_time_exceeded,
438                                        ICMP6_time_exceeded_fragment_reassembly_time_exceeded,
439                                        0);
440         }
441     }
442   ip6_reass_drop_all (vm, node, rm, reass);
443 }
444
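/*
 * Look up the reassembly context for the given key. If it is owned by
 * another thread, flag a handoff. Expire it if it has timed out. Otherwise
 * create a new context, subject to the per-thread max_reass_n limit, and
 * insert it into the hash, retrying the lookup if another worker won the
 * insertion race.
 */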
445 always_inline ip6_reass_t *
446 ip6_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
447                           ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
448                           ip6_reass_kv_t * kv, u32 * icmp_bi, u8 * do_handoff)
449 {
450   ip6_reass_t *reass;
451   f64 now;
452
453 again:
454
455   reass = NULL;
456   now = vlib_time_now (vm);
457
458   if (!clib_bihash_search_48_8
459       (&rm->hash, (clib_bihash_kv_48_8_t *) kv, (clib_bihash_kv_48_8_t *) kv))
460     {
461       reass =
462         pool_elt_at_index (rm->per_thread_data
463                            [kv->v.memory_owner_thread_index].pool,
464                            kv->v.reass_index);
465       if (vm->thread_index != kv->v.memory_owner_thread_index)
466         {
467           *do_handoff = 1;
468           return reass;
469         }
470
471       if (now > reass->last_heard + rm->timeout)
472         {
473           ip6_reass_on_timeout (vm, node, rm, reass, icmp_bi);
474           ip6_reass_free (rm, rt, reass);
475           reass = NULL;
476         }
477     }
478
479   if (reass)
480     {
481       reass->last_heard = now;
482       return reass;
483     }
484
485   if (rt->reass_n >= rm->max_reass_n)
486     {
487       reass = NULL;
488       return reass;
489     }
490   else
491     {
492       pool_get (rt->pool, reass);
493       clib_memset (reass, 0, sizeof (*reass));
494       reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
495       ++rt->id_counter;
496       reass->first_bi = ~0;
497       reass->last_packet_octet = ~0;
498       reass->data_len = 0;
499       reass->next_index = ~0;
500       reass->error_next_index = ~0;
501       ++rt->reass_n;
502     }
503
504   reass->key.as_u64[0] = ((clib_bihash_kv_48_8_t *) kv)->key[0];
505   reass->key.as_u64[1] = ((clib_bihash_kv_48_8_t *) kv)->key[1];
506   reass->key.as_u64[2] = ((clib_bihash_kv_48_8_t *) kv)->key[2];
507   reass->key.as_u64[3] = ((clib_bihash_kv_48_8_t *) kv)->key[3];
508   reass->key.as_u64[4] = ((clib_bihash_kv_48_8_t *) kv)->key[4];
509   reass->key.as_u64[5] = ((clib_bihash_kv_48_8_t *) kv)->key[5];
510   kv->v.reass_index = (reass - rt->pool);
511   kv->v.memory_owner_thread_index = vm->thread_index;
512   reass->last_heard = now;
513
514   int rv =
515     clib_bihash_add_del_48_8 (&rm->hash, (clib_bihash_kv_48_8_t *) kv, 2);
516   if (rv)
517     {
518       ip6_reass_free_ctx (rt, reass);
519       reass = NULL;
520       // if another worker already created a context, work with that copy
521       if (-2 == rv)
522         goto again;
523     }
524
525   return reass;
526 }
527
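/*
 * Build the reassembled packet: walk the sorted range chain, trim each
 * fragment's headers and any data outside its accepted range, link the kept
 * buffers into one chain, strip the fragment header from the first buffer,
 * fix up payload_length and linearize the result.
 */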
528 always_inline ip6_reass_rc_t
529 ip6_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
530                     ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
531                     ip6_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
532                     bool is_custom_app)
533 {
534   *bi0 = reass->first_bi;
535   *error0 = IP6_ERROR_NONE;
536   ip6_frag_hdr_t *frag_hdr;
537   vlib_buffer_t *last_b = NULL;
538   u32 sub_chain_bi = reass->first_bi;
539   u32 total_length = 0;
540   u32 buf_cnt = 0;
541   u32 dropped_cnt = 0;
542   u32 *vec_drop_compress = NULL;
543   ip6_reass_rc_t rv = IP6_REASS_RC_OK;
544   do
545     {
546       u32 tmp_bi = sub_chain_bi;
547       vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
548       vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
549       if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
550           !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
551         {
552           rv = IP6_REASS_RC_INTERNAL_ERROR;
553           goto free_buffers_and_return;
554         }
555
556       u32 data_len = ip6_reass_buffer_get_data_len (tmp);
557       u32 trim_front = vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
558         sizeof (*frag_hdr) + ip6_reass_buffer_get_data_offset (tmp);
559       u32 trim_end =
560         vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
561       if (tmp_bi == reass->first_bi)
562         {
563           /* first buffer - keep ip6 header */
564           if (0 != ip6_reass_buffer_get_data_offset (tmp))
565             {
566               rv = IP6_REASS_RC_INTERNAL_ERROR;
567               goto free_buffers_and_return;
568             }
569           trim_front = 0;
570           trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
571             (vnet_buffer (tmp)->ip.reass.ip6_frag_hdr_offset +
572              sizeof (*frag_hdr));
573           if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
574             {
575               rv = IP6_REASS_RC_INTERNAL_ERROR;
576               goto free_buffers_and_return;
577             }
578         }
579       u32 keep_data =
580         vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
581       while (1)
582         {
583           ++buf_cnt;
584           if (trim_front)
585             {
586               if (trim_front > tmp->current_length)
587                 {
588                   /* drop whole buffer */
589                   vec_add1 (vec_drop_compress, tmp_bi);
590                   trim_front -= tmp->current_length;
591                   if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
592                     {
593                       rv = IP6_REASS_RC_INTERNAL_ERROR;
594                       goto free_buffers_and_return;
595                     }
596                   tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
597                   tmp_bi = tmp->next_buffer;
598                   tmp = vlib_get_buffer (vm, tmp_bi);
599                   continue;
600                 }
601               else
602                 {
603                   vlib_buffer_advance (tmp, trim_front);
604                   trim_front = 0;
605                 }
606             }
607           if (keep_data)
608             {
609               if (last_b)
610                 {
611                   last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
612                   last_b->next_buffer = tmp_bi;
613                 }
614               last_b = tmp;
615               if (keep_data <= tmp->current_length)
616                 {
617                   tmp->current_length = keep_data;
618                   keep_data = 0;
619                 }
620               else
621                 {
622                   keep_data -= tmp->current_length;
623                   if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
624                     {
625                       rv = IP6_REASS_RC_INTERNAL_ERROR;
626                       goto free_buffers_and_return;
627                     }
628                 }
629               total_length += tmp->current_length;
630             }
631           else
632             {
633               vec_add1 (vec_drop_compress, tmp_bi);
634               if (reass->first_bi == tmp_bi)
635                 {
636                   rv = IP6_REASS_RC_INTERNAL_ERROR;
637                   goto free_buffers_and_return;
638                 }
639               ++dropped_cnt;
640             }
641           if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
642             {
643               tmp_bi = tmp->next_buffer;
644               tmp = vlib_get_buffer (vm, tmp->next_buffer);
645             }
646           else
647             {
648               break;
649             }
650         }
651       sub_chain_bi =
652         vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
653         reass.next_range_bi;
654     }
655   while (~0 != sub_chain_bi);
656
657   if (!last_b)
658     {
659       rv = IP6_REASS_RC_INTERNAL_ERROR;
660       goto free_buffers_and_return;
661     }
662   last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
663   vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
664   if (total_length < first_b->current_length)
665     {
666       rv = IP6_REASS_RC_INTERNAL_ERROR;
667       goto free_buffers_and_return;
668     }
669   total_length -= first_b->current_length;
670   first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
671   first_b->total_length_not_including_first_buffer = total_length;
672   // drop fragment header
673   vnet_buffer_opaque_t *first_b_vnb = vnet_buffer (first_b);
674   ip6_header_t *ip = vlib_buffer_get_current (first_b);
675   u16 ip6_frag_hdr_offset = first_b_vnb->ip.reass.ip6_frag_hdr_offset;
676   ip6_ext_header_t *prev_hdr;
677   ip6_ext_header_find_t (ip, prev_hdr, frag_hdr,
678                          IP_PROTOCOL_IPV6_FRAGMENTATION);
679   if (prev_hdr)
680     {
681       prev_hdr->next_hdr = frag_hdr->next_hdr;
682     }
683   else
684     {
685       ip->protocol = frag_hdr->next_hdr;
686     }
687   if (!((u8 *) frag_hdr - (u8 *) ip == ip6_frag_hdr_offset))
688     {
689       rv = IP6_REASS_RC_INTERNAL_ERROR;
690       goto free_buffers_and_return;
691     }
692   memmove (frag_hdr, (u8 *) frag_hdr + sizeof (*frag_hdr),
693            first_b->current_length - ip6_frag_hdr_offset -
694            sizeof (ip6_frag_hdr_t));
695   first_b->current_length -= sizeof (*frag_hdr);
696   ip->payload_length =
697     clib_host_to_net_u16 (total_length + first_b->current_length -
698                           sizeof (*ip));
699   if (!vlib_buffer_chain_linearize (vm, first_b))
700     {
701       rv = IP6_REASS_RC_NO_BUF;
702       goto free_buffers_and_return;
703     }
704   first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
705   if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
706     {
707       ip6_reass_add_trace (vm, node, rm, reass->id, reass->trace_op_counter,
708                            reass->first_bi, reass->first_bi, reass->data_len,
709                            FINALIZE, ~0);
710       ++reass->trace_op_counter;
711 #if 0
712       // following code does a hexdump of packet fragments to stdout ...
713       do
714         {
715           u32 bi = reass->first_bi;
716           u8 *s = NULL;
717           while (~0 != bi)
718             {
719               vlib_buffer_t *b = vlib_get_buffer (vm, bi);
720               s = format (s, "%u: %U\n", bi, format_hexdump,
721                           vlib_buffer_get_current (b), b->current_length);
722               if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
723                 {
724                   bi = b->next_buffer;
725                 }
726               else
727                 {
728                   break;
729                 }
730             }
731           printf ("%.*s\n", vec_len (s), s);
732           fflush (stdout);
733           vec_free (s);
734         }
735       while (0);
736 #endif
737     }
738   if (!is_custom_app)
739     {
740       *next0 = IP6_REASSEMBLY_NEXT_INPUT;
741     }
742   else
743     {
744       *next0 = reass->next_index;
745     }
746   vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
747   ip6_reass_free (rm, rt, reass);
748   reass = NULL;
749 free_buffers_and_return:
750   vlib_buffer_free (vm, vec_drop_compress, vec_len (vec_drop_compress));
751   vec_free (vec_drop_compress);
752   return rv;
753 }
754
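/*
 * Link the buffer new_next_bi into the range chain after prev_range_bi (or
 * at the head when prev_range_bi is ~0) and account its data length.
 */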
755 always_inline void
756 ip6_reass_insert_range_in_chain (vlib_main_t * vm, ip6_reass_main_t * rm,
757                                  ip6_reass_per_thread_t * rt,
758                                  ip6_reass_t * reass, u32 prev_range_bi,
759                                  u32 new_next_bi)
760 {
761
762   vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
763   vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
764   if (~0 != prev_range_bi)
765     {
766       vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
767       vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
768       new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
769       prev_vnb->ip.reass.next_range_bi = new_next_bi;
770     }
771   else
772     {
773       if (~0 != reass->first_bi)
774         {
775           new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
776         }
777       reass->first_bi = new_next_bi;
778     }
779   reass->data_len += ip6_reass_buffer_get_data_len (new_next_b);
780 }
781
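/*
 * Process one fragment: compute its byte range from the fragment header,
 * insert it into the ordered range chain, drop the whole reassembly on
 * overlap (not allowed by RFC 8200), and finalize once all octets up to the
 * last fragment are present, possibly requesting a handoff to the sendout
 * thread.
 */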
782 always_inline ip6_reass_rc_t
783 ip6_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
784                   ip6_reass_main_t * rm, ip6_reass_per_thread_t * rt,
785                   ip6_reass_t * reass, u32 * bi0, u32 * next0, u32 * error0,
786                   ip6_frag_hdr_t * frag_hdr, bool is_custom_app,
787                   u32 * handoff_thread_idx)
788 {
789   int consumed = 0;
790   vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
791   vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
792   if (is_custom_app)
793     {
794       reass->next_index = fvnb->ip.reass.next_index;    // store next_index before it's overwritten
795       reass->error_next_index = fvnb->ip.reass.error_next_index;        // store error_next_index before it is overwritten
796     }
797
798   fvnb->ip.reass.ip6_frag_hdr_offset =
799     (u8 *) frag_hdr - (u8 *) vlib_buffer_get_current (fb);
800   ip6_header_t *fip = vlib_buffer_get_current (fb);
801   if (fb->current_length < sizeof (*fip) ||
802       fvnb->ip.reass.ip6_frag_hdr_offset == 0 ||
803       fvnb->ip.reass.ip6_frag_hdr_offset >= fb->current_length)
804     {
805       return IP6_REASS_RC_INTERNAL_ERROR;
806     }
807
808   u32 fragment_first = fvnb->ip.reass.fragment_first =
809     ip6_frag_hdr_offset_bytes (frag_hdr);
810   u32 fragment_length =
811     vlib_buffer_length_in_chain (vm, fb) -
812     (fvnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
813   u32 fragment_last = fvnb->ip.reass.fragment_last =
814     fragment_first + fragment_length - 1;
815   int more_fragments = ip6_frag_hdr_more (frag_hdr);
816   u32 candidate_range_bi = reass->first_bi;
817   u32 prev_range_bi = ~0;
818   fvnb->ip.reass.range_first = fragment_first;
819   fvnb->ip.reass.range_last = fragment_last;
820   fvnb->ip.reass.next_range_bi = ~0;
821   if (!more_fragments)
822     {
823       reass->last_packet_octet = fragment_last;
824     }
825   if (~0 == reass->first_bi)
826     {
827       // starting a new reassembly
828       ip6_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
829                                        *bi0);
830       reass->min_fragment_length = clib_net_to_host_u16 (fip->payload_length);
831       consumed = 1;
832       reass->fragments_n = 1;
833       goto check_if_done_maybe;
834     }
835   reass->min_fragment_length =
836     clib_min (clib_net_to_host_u16 (fip->payload_length),
837               fvnb->ip.reass.estimated_mtu);
838   while (~0 != candidate_range_bi)
839     {
840       vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
841       vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
842       if (fragment_first > candidate_vnb->ip.reass.range_last)
843         {
844           // this fragment starts after the candidate range
845           prev_range_bi = candidate_range_bi;
846           candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
847           if (candidate_vnb->ip.reass.range_last < fragment_last &&
848               ~0 == candidate_range_bi)
849             {
850               // special case - this fragment falls beyond all known ranges
851               ip6_reass_insert_range_in_chain (vm, rm, rt, reass,
852                                                prev_range_bi, *bi0);
853               consumed = 1;
854               break;
855             }
856           continue;
857         }
858       if (fragment_last < candidate_vnb->ip.reass.range_first)
859         {
860           // this fragment ends before candidate range without any overlap
861           ip6_reass_insert_range_in_chain (vm, rm, rt, reass, prev_range_bi,
862                                            *bi0);
863           consumed = 1;
864         }
865       else if (fragment_first == candidate_vnb->ip.reass.range_first &&
866                fragment_last == candidate_vnb->ip.reass.range_last)
867         {
868           // duplicate fragment - ignore
869         }
870       else
871         {
872           // overlapping fragment - not allowed by RFC 8200
873           if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
874             {
875               ip6_reass_add_trace (vm, node, rm, reass->id,
876                                    reass->trace_op_counter, *bi0,
877                                    reass->first_bi, reass->data_len,
878                                    RANGE_OVERLAP, ~0);
879               ++reass->trace_op_counter;
880             }
881           ip6_reass_drop_all (vm, node, rm, reass);
882           ip6_reass_free (rm, rt, reass);
883           *next0 = IP6_REASSEMBLY_NEXT_DROP;
884           *error0 = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT;
885           return IP6_REASS_RC_OK;
886         }
887       break;
888     }
889   ++reass->fragments_n;
890 check_if_done_maybe:
891   if (consumed)
892     {
893       if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
894         {
895           ip6_reass_add_trace (vm, node, rm, reass->id,
896                                reass->trace_op_counter, *bi0, reass->first_bi,
897                                reass->data_len, RANGE_NEW, ~0);
898           ++reass->trace_op_counter;
899         }
900     }
901   if (~0 != reass->last_packet_octet &&
902       reass->data_len == reass->last_packet_octet + 1)
903     {
904       *handoff_thread_idx = reass->sendout_thread_index;
905       ip6_reass_rc_t rc =
906         ip6_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
907                             is_custom_app);
908       if (IP6_REASS_RC_OK == rc
909           && reass->memory_owner_thread_index != reass->sendout_thread_index)
910         {
911           return IP6_REASS_RC_HANDOFF;
912         }
913       return rc;
914     }
915   else
916     {
917       if (consumed)
918         {
919           *bi0 = ~0;
920           if (reass->fragments_n > rm->max_reass_len)
921             {
922               return IP6_REASS_RC_TOO_MANY_FRAGMENTS;
923             }
924         }
925       else
926         {
927           *next0 = IP6_REASSEMBLY_NEXT_DROP;
928           *error0 = IP6_ERROR_REASS_DUPLICATE_FRAGMENT;
929         }
930     }
931   return IP6_REASS_RC_OK;
932 }
933
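/*
 * RFC 8200 sanity checks applied before reassembly: the first fragment must
 * carry a complete header chain, non-last fragments must be a multiple of 8
 * octets, and the reassembled size must not exceed 65535 octets. On failure
 * the buffer is prepared for an ICMPv6 parameter problem error.
 */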
934 always_inline bool
935 ip6_reass_verify_upper_layer_present (vlib_node_runtime_t * node,
936                                       vlib_buffer_t * b,
937                                       ip6_frag_hdr_t * frag_hdr)
938 {
939   ip6_ext_header_t *tmp = (ip6_ext_header_t *) frag_hdr;
940   while (ip6_ext_hdr (tmp->next_hdr))
941     {
942       tmp = ip6_ext_next_header (tmp);
943     }
944   if (IP_PROTOCOL_IP6_NONXT == tmp->next_hdr)
945     {
946       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
947                                    ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain,
948                                    0);
949       b->error = node->errors[IP6_ERROR_REASS_MISSING_UPPER];
950
951       return false;
952     }
953   return true;
954 }
955
956 always_inline bool
957 ip6_reass_verify_fragment_multiple_8 (vlib_main_t * vm,
958                                       vlib_node_runtime_t * node,
959                                       vlib_buffer_t * b,
960                                       ip6_frag_hdr_t * frag_hdr)
961 {
962   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
963   ip6_header_t *ip = vlib_buffer_get_current (b);
964   int more_fragments = ip6_frag_hdr_more (frag_hdr);
965   u32 fragment_length =
966     vlib_buffer_length_in_chain (vm, b) -
967     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
968   if (more_fragments && 0 != fragment_length % 8)
969     {
970       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
971                                    ICMP6_parameter_problem_erroneous_header_field,
972                                    (u8 *) & ip->payload_length - (u8 *) ip);
973       return false;
974     }
975   return true;
976 }
977
978 always_inline bool
979 ip6_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
980                                      vlib_node_runtime_t * node,
981                                      vlib_buffer_t * b,
982                                      ip6_frag_hdr_t * frag_hdr)
983 {
984   vnet_buffer_opaque_t *vnb = vnet_buffer (b);
985   u32 fragment_first = ip6_frag_hdr_offset_bytes (frag_hdr);
986   u32 fragment_length =
987     vlib_buffer_length_in_chain (vm, b) -
988     (vnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
989   if (fragment_first + fragment_length > 65535)
990     {
991       ip6_header_t *ip0 = vlib_buffer_get_current (b);
992       icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
993                                    ICMP6_parameter_problem_erroneous_header_field,
994                                    (u8 *) & frag_hdr->fragment_offset_and_more
995                                    - (u8 *) ip0);
996       return false;
997     }
998   return true;
999 }
1000
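/*
 * Main reassembly loop shared by the ip6-reassembly and
 * ip6-reassembly-feature nodes: locate the fragment header, run the sanity
 * checks, build the hash key (addresses, FIB index + fragment id, protocol),
 * find or create the reassembly context, hand the buffer off to the owning
 * thread when needed, and enqueue the resulting buffers, including any ICMP
 * error buffers.
 */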
1001 always_inline uword
1002 ip6_reassembly_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
1003                        vlib_frame_t * frame, bool is_feature,
1004                        bool is_custom_app)
1005 {
1006   u32 *from = vlib_frame_vector_args (frame);
1007   u32 n_left_from, n_left_to_next, *to_next, next_index;
1008   ip6_reass_main_t *rm = &ip6_reass_main;
1009   ip6_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
1010   clib_spinlock_lock (&rt->lock);
1011
1012   n_left_from = frame->n_vectors;
1013   next_index = node->cached_next_index;
1014   while (n_left_from > 0)
1015     {
1016       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1017
1018       while (n_left_from > 0 && n_left_to_next > 0)
1019         {
1020           u32 bi0;
1021           vlib_buffer_t *b0;
1022           u32 next0 = IP6_REASSEMBLY_NEXT_DROP;
1023           u32 error0 = IP6_ERROR_NONE;
1024           u32 icmp_bi = ~0;
1025
1026           bi0 = from[0];
1027           b0 = vlib_get_buffer (vm, bi0);
1028
1029           ip6_header_t *ip0 = vlib_buffer_get_current (b0);
1030           ip6_frag_hdr_t *frag_hdr = NULL;
1031           ip6_ext_header_t *prev_hdr;
1032           if (ip6_ext_hdr (ip0->protocol))
1033             {
1034               ip6_ext_header_find_t (ip0, prev_hdr, frag_hdr,
1035                                      IP_PROTOCOL_IPV6_FRAGMENTATION);
1036             }
1037           if (!frag_hdr)
1038             {
1039               // this is a regular packet - no fragmentation
1040               next0 = IP6_REASSEMBLY_NEXT_INPUT;
1041               goto skip_reass;
1042             }
1043           if (0 == ip6_frag_hdr_offset (frag_hdr))
1044             {
1045               // first fragment - verify upper-layer is present
1046               if (!ip6_reass_verify_upper_layer_present (node, b0, frag_hdr))
1047                 {
1048                   next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
1049                   goto skip_reass;
1050                 }
1051             }
1052           if (!ip6_reass_verify_fragment_multiple_8 (vm, node, b0, frag_hdr)
1053               || !ip6_reass_verify_packet_size_lt_64k (vm, node, b0,
1054                                                        frag_hdr))
1055             {
1056               next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
1057               goto skip_reass;
1058             }
1059           vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset =
1060             (u8 *) frag_hdr - (u8 *) ip0;
1061
1062           ip6_reass_kv_t kv;
1063           u8 do_handoff = 0;
1064
1065           kv.k.as_u64[0] = ip0->src_address.as_u64[0];
1066           kv.k.as_u64[1] = ip0->src_address.as_u64[1];
1067           kv.k.as_u64[2] = ip0->dst_address.as_u64[0];
1068           kv.k.as_u64[3] = ip0->dst_address.as_u64[1];
1069           kv.k.as_u64[4] =
1070             ((u64) vec_elt (ip6_main.fib_index_by_sw_if_index,
1071                             vnet_buffer (b0)->sw_if_index[VLIB_RX])) << 32 |
1072             (u64) frag_hdr->identification;
1073           kv.k.as_u64[5] = ip0->protocol;
1074
1075           ip6_reass_t *reass =
1076             ip6_reass_find_or_create (vm, node, rm, rt, &kv, &icmp_bi,
1077                                       &do_handoff);
1078
1079           if (reass)
1080             {
1081               const u32 fragment_first = ip6_frag_hdr_offset (frag_hdr);
1082               if (0 == fragment_first)
1083                 {
1084                   reass->sendout_thread_index = vm->thread_index;
1085                 }
1086             }
1087           if (PREDICT_FALSE (do_handoff))
1088             {
1089               next0 = IP6_REASSEMBLY_NEXT_HANDOFF;
1090               if (is_feature)
1091                 vnet_buffer (b0)->ip.reass.owner_feature_thread_index =
1092                   kv.v.memory_owner_thread_index;
1093               else
1094                 vnet_buffer (b0)->ip.reass.owner_thread_index =
1095                   kv.v.memory_owner_thread_index;
1096             }
1097           else if (reass)
1098             {
1099               u32 handoff_thread_idx;
1100               switch (ip6_reass_update (vm, node, rm, rt, reass, &bi0, &next0,
1101                                         &error0, frag_hdr, is_custom_app,
1102                                         &handoff_thread_idx))
1103                 {
1104                 case IP6_REASS_RC_OK:
1105                   /* nothing to do here */
1106                   break;
1107                 case IP6_REASS_RC_HANDOFF:
1108                   next0 = IP6_REASSEMBLY_NEXT_HANDOFF;
1109                   b0 = vlib_get_buffer (vm, bi0);
1110                   if (is_feature)
1111                     vnet_buffer (b0)->ip.reass.owner_feature_thread_index =
1112                       handoff_thread_idx;
1113                   else
1114                     vnet_buffer (b0)->ip.reass.owner_thread_index =
1115                       handoff_thread_idx;
1116                   break;
1117                 case IP6_REASS_RC_TOO_MANY_FRAGMENTS:
1118                   vlib_node_increment_counter (vm, node->node_index,
1119                                                IP6_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
1120                                                1);
1121                   ip6_reass_drop_all (vm, node, rm, reass);
1122                   ip6_reass_free (rm, rt, reass);
1123                   goto next_packet;
1124                   break;
1125                 case IP6_REASS_RC_NO_BUF:
1126                   vlib_node_increment_counter (vm, node->node_index,
1127                                                IP6_ERROR_REASS_NO_BUF, 1);
1128                   ip6_reass_drop_all (vm, node, rm, reass);
1129                   ip6_reass_free (rm, rt, reass);
1130                   goto next_packet;
1131                   break;
1132                 case IP6_REASS_RC_INTERNAL_ERROR:
1133                   /* drop everything and start with a clean slate */
1134                   vlib_node_increment_counter (vm, node->node_index,
1135                                                IP6_ERROR_REASS_INTERNAL_ERROR,
1136                                                1);
1137                   ip6_reass_drop_all (vm, node, rm, reass);
1138                   ip6_reass_free (rm, rt, reass);
1139                   goto next_packet;
1140                   break;
1141                 }
1142             }
1143           else
1144             {
1145               if (is_feature)
1146                 {
1147                   next0 = IP6_REASSEMBLY_NEXT_DROP;
1148                 }
1149               else
1150                 {
1151                   vnet_buffer_opaque_t *fvnb = vnet_buffer (b0);
1152                   next0 = fvnb->ip.reass.error_next_index;
1153                 }
1154               error0 = IP6_ERROR_REASS_LIMIT_REACHED;
1155             }
1156
1157           b0->error = node->errors[error0];
1158
1159           if (~0 != bi0)
1160             {
1161             skip_reass:
1162               to_next[0] = bi0;
1163               to_next += 1;
1164               n_left_to_next -= 1;
1165               if (next0 == IP6_REASSEMBLY_NEXT_HANDOFF)
1166                 {
1167                   if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1168                     {
1169                       if (is_feature)
1170                         ip6_reass_add_trace (vm, node, rm, ~0,
1171                                              ~0,
1172                                              bi0, ~0, ~0, HANDOFF,
1173                                              vnet_buffer (b0)->ip.
1174                                              reass.owner_feature_thread_index);
1175                       else
1176                         ip6_reass_add_trace (vm, node, rm, ~0, ~0, bi0,
1177                                              ~0, ~0, HANDOFF,
1178                                              vnet_buffer (b0)->ip.
1179                                              reass.owner_thread_index);
1180                     }
1181                 }
1182               else if (is_feature && IP6_ERROR_NONE == error0)
1183                 {
1184                   b0 = vlib_get_buffer (vm, bi0);
1185                   vnet_feature_next (&next0, b0);
1186                 }
1187               vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1188                                                n_left_to_next, bi0, next0);
1189             }
1190
1191           if (~0 != icmp_bi)
1192             {
1193               next0 = IP6_REASSEMBLY_NEXT_ICMP_ERROR;
1194               to_next[0] = icmp_bi;
1195               to_next += 1;
1196               n_left_to_next -= 1;
1197               vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1198                                                n_left_to_next, icmp_bi,
1199                                                next0);
1200             }
1201         next_packet:
1202           from += 1;
1203           n_left_from -= 1;
1204         }
1205
1206       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1207     }
1208
1209   clib_spinlock_unlock (&rt->lock);
1210   return frame->n_vectors;
1211 }
1212
1213 static char *ip6_reassembly_error_strings[] = {
1214 #define _(sym, string) string,
1215   foreach_ip6_error
1216 #undef _
1217 };
1218
1219 VLIB_NODE_FN (ip6_reass_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
1220                                vlib_frame_t * frame)
1221 {
1222   return ip6_reassembly_inline (vm, node, frame, false /* is_feature */ ,
1223                                 false /* is_custom_app */ );
1224 }
1225
1226 /* *INDENT-OFF* */
1227 VLIB_REGISTER_NODE (ip6_reass_node) = {
1228     .name = "ip6-reassembly",
1229     .vector_size = sizeof (u32),
1230     .format_trace = format_ip6_reass_trace,
1231     .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
1232     .error_strings = ip6_reassembly_error_strings,
1233     .n_next_nodes = IP6_REASSEMBLY_N_NEXT,
1234     .next_nodes =
1235         {
1236                 [IP6_REASSEMBLY_NEXT_INPUT] = "ip6-input",
1237                 [IP6_REASSEMBLY_NEXT_DROP] = "ip6-drop",
1238                 [IP6_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
1239                 [IP6_REASSEMBLY_NEXT_HANDOFF] = "ip6-reassembly-handoff",
1240         },
1241 };
1242 /* *INDENT-ON* */
1243
1244 VLIB_NODE_FN (ip6_reass_node_feature) (vlib_main_t * vm,
1245                                        vlib_node_runtime_t * node,
1246                                        vlib_frame_t * frame)
1247 {
1248   return ip6_reassembly_inline (vm, node, frame, true /* is_feature */ ,
1249                                 false /* is_custom_app */ );
1250 }
1251
1252 /* *INDENT-OFF* */
1253 VLIB_REGISTER_NODE (ip6_reass_node_feature) = {
1254     .name = "ip6-reassembly-feature",
1255     .vector_size = sizeof (u32),
1256     .format_trace = format_ip6_reass_trace,
1257     .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
1258     .error_strings = ip6_reassembly_error_strings,
1259     .n_next_nodes = IP6_REASSEMBLY_N_NEXT,
1260     .next_nodes =
1261         {
1262                 [IP6_REASSEMBLY_NEXT_INPUT] = "ip6-input",
1263                 [IP6_REASSEMBLY_NEXT_DROP] = "ip6-drop",
1264                 [IP6_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
1265                 [IP6_REASSEMBLY_NEXT_HANDOFF] = "ip6-reass-feature-hoff",
1266         },
1267 };
1268 /* *INDENT-ON* */
1269
1270 /* *INDENT-OFF* */
1271 VNET_FEATURE_INIT (ip6_reassembly_feature, static) = {
1272     .arc_name = "ip6-unicast",
1273     .node_name = "ip6-reassembly-feature",
1274     .runs_before = VNET_FEATURES ("ip6-lookup",
1275                                   "ipsec6-input-feature"),
1276     .runs_after = 0,
1277 };
1278 /* *INDENT-ON* */
1279
1280 #ifndef CLIB_MARCH_VARIANT
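/*
 * Number of hash buckets derived from the configured maximum number of
 * reassemblies and the load factor, rounded up to a power of two.
 */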
1281 static u32
1282 ip6_reass_get_nbuckets ()
1283 {
1284   ip6_reass_main_t *rm = &ip6_reass_main;
1285   u32 nbuckets;
1286   u8 i;
1287
1288   nbuckets = (u32) (rm->max_reass_n / IP6_REASS_HT_LOAD_FACTOR);
1289
1290   for (i = 0; i < 31; i++)
1291     if ((1 << i) >= nbuckets)
1292       break;
1293   nbuckets = 1 << i;
1294
1295   return nbuckets;
1296 }
1297 #endif /* CLIB_MARCH_VARIANT */
1298
1299 typedef enum
1300 {
1301   IP6_EVENT_CONFIG_CHANGED = 1,
1302 } ip6_reass_event_t;
1303
1304 #ifndef CLIB_MARCH_VARIANT
1305 typedef struct
1306 {
1307   int failure;
1308   clib_bihash_48_8_t *new_hash;
1309 } ip6_rehash_cb_ctx;
1310
1311 static void
1312 ip6_rehash_cb (clib_bihash_kv_48_8_t * kv, void *_ctx)
1313 {
1314   ip6_rehash_cb_ctx *ctx = _ctx;
1315   if (clib_bihash_add_del_48_8 (ctx->new_hash, kv, 1))
1316     {
1317       ctx->failure = 1;
1318     }
1319 }
1320
1321 static void
1322 ip6_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1323                       u32 max_reassembly_length, u32 expire_walk_interval_ms)
1324 {
1325   ip6_reass_main.timeout_ms = timeout_ms;
1326   ip6_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1327   ip6_reass_main.max_reass_n = max_reassemblies;
1328   ip6_reass_main.max_reass_len = max_reassembly_length;
1329   ip6_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1330 }
1331
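/*
 * Apply new reassembly parameters and signal the expire-walk process. If the
 * required bucket count grows, the bihash is rebuilt and existing entries
 * are rehashed into it; the table is never shrunk here.
 *
 * Example (illustrative values): ip6_reass_set (200, 2048, 5, 10000) sets a
 * 200 ms reassembly timeout, at most 2048 concurrent reassemblies per worker
 * thread, at most 5 fragments per reassembly and a 10 s expire-walk
 * interval.
 */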
1332 vnet_api_error_t
1333 ip6_reass_set (u32 timeout_ms, u32 max_reassemblies,
1334                u32 max_reassembly_length, u32 expire_walk_interval_ms)
1335 {
1336   u32 old_nbuckets = ip6_reass_get_nbuckets ();
1337   ip6_reass_set_params (timeout_ms, max_reassemblies, max_reassembly_length,
1338                         expire_walk_interval_ms);
1339   vlib_process_signal_event (ip6_reass_main.vlib_main,
1340                              ip6_reass_main.ip6_reass_expire_node_idx,
1341                              IP6_EVENT_CONFIG_CHANGED, 0);
1342   u32 new_nbuckets = ip6_reass_get_nbuckets ();
1343   if (ip6_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
1344     {
1345       clib_bihash_48_8_t new_hash;
1346       clib_memset (&new_hash, 0, sizeof (new_hash));
1347       ip6_rehash_cb_ctx ctx;
1348       ctx.failure = 0;
1349       ctx.new_hash = &new_hash;
1350       clib_bihash_init_48_8 (&new_hash, "ip6-reass", new_nbuckets,
1351                              new_nbuckets * 1024);
1352       clib_bihash_foreach_key_value_pair_48_8 (&ip6_reass_main.hash,
1353                                                ip6_rehash_cb, &ctx);
1354       if (ctx.failure)
1355         {
1356           clib_bihash_free_48_8 (&new_hash);
1357           return -1;
1358         }
1359       else
1360         {
1361           clib_bihash_free_48_8 (&ip6_reass_main.hash);
1362           clib_memcpy_fast (&ip6_reass_main.hash, &new_hash,
1363                             sizeof (ip6_reass_main.hash));
1364           clib_bihash_copied (&ip6_reass_main.hash, &new_hash);
1365         }
1366     }
1367   return 0;
1368 }
1369
1370 vnet_api_error_t
1371 ip6_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1372                u32 * expire_walk_interval_ms)
1373 {
1374   *timeout_ms = ip6_reass_main.timeout_ms;
1375   *max_reassemblies = ip6_reass_main.max_reass_n;
1376   *expire_walk_interval_ms = ip6_reass_main.expire_walk_interval_ms;
1377   return 0;
1378 }
1379
1380 static clib_error_t *
1381 ip6_reass_init_function (vlib_main_t * vm)
1382 {
1383   ip6_reass_main_t *rm = &ip6_reass_main;
1384   clib_error_t *error = 0;
1385   u32 nbuckets;
1386   vlib_node_t *node;
1387
1388   rm->vlib_main = vm;
1389
1390   vec_validate (rm->per_thread_data, vlib_num_workers ());
1391   ip6_reass_per_thread_t *rt;
1392   vec_foreach (rt, rm->per_thread_data)
1393   {
1394     clib_spinlock_init (&rt->lock);
1395     pool_alloc (rt->pool, rm->max_reass_n);
1396   }
1397
1398   node = vlib_get_node_by_name (vm, (u8 *) "ip6-reassembly-expire-walk");
1399   ASSERT (node);
1400   rm->ip6_reass_expire_node_idx = node->index;
1401
1402   ip6_reass_set_params (IP6_REASS_TIMEOUT_DEFAULT_MS,
1403                         IP6_REASS_MAX_REASSEMBLIES_DEFAULT,
1404                         IP6_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
1405                         IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);
1406
1407   nbuckets = ip6_reass_get_nbuckets ();
1408   clib_bihash_init_48_8 (&rm->hash, "ip6-reass", nbuckets, nbuckets * 1024);
1409
1410   node = vlib_get_node_by_name (vm, (u8 *) "ip6-drop");
1411   ASSERT (node);
1412   rm->ip6_drop_idx = node->index;
1413   node = vlib_get_node_by_name (vm, (u8 *) "ip6-icmp-error");
1414   ASSERT (node);
1415   rm->ip6_icmp_error_idx = node->index;
1416
1417   if ((error = vlib_call_init_function (vm, ip_main_init)))
1418     return error;
1419   ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
1420                          ip6_reass_node.index);
1421
1422   rm->fq_index = vlib_frame_queue_main_init (ip6_reass_node.index, 0);
1423   rm->fq_feature_index =
1424     vlib_frame_queue_main_init (ip6_reass_node_feature.index, 0);
1425
1426   return error;
1427 }
1428
1429 VLIB_INIT_FUNCTION (ip6_reass_init_function);
1430 #endif /* CLIB_MARCH_VARIANT */
1431
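/*
 * Process node: wakes up every expire_walk_interval_ms (or on a config
 * change event), frees timed-out reassemblies on all threads and sends any
 * resulting ICMPv6 time-exceeded buffers to the ip6-icmp-error node.
 */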
1432 static uword
1433 ip6_reass_walk_expired (vlib_main_t * vm,
1434                         vlib_node_runtime_t * node, vlib_frame_t * f)
1435 {
1436   ip6_reass_main_t *rm = &ip6_reass_main;
1437   uword event_type, *event_data = 0;
1438
1439   while (true)
1440     {
1441       vlib_process_wait_for_event_or_clock (vm,
1442                                             (f64) rm->expire_walk_interval_ms
1443                                             / (f64) MSEC_PER_SEC);
1444       event_type = vlib_process_get_events (vm, &event_data);
1445
1446       switch (event_type)
1447         {
1448         case ~0:                /* no events => timeout */
1449           /* nothing to do here */
1450           break;
1451         case IP6_EVENT_CONFIG_CHANGED:
1452           break;
1453         default:
1454           clib_warning ("BUG: event type 0x%wx", event_type);
1455           break;
1456         }
1457       f64 now = vlib_time_now (vm);
1458
1459       ip6_reass_t *reass;
1460       int *pool_indexes_to_free = NULL;
1461
1462       uword thread_index = 0;
1463       int index;
1464       const uword nthreads = vlib_num_workers () + 1;
1465       u32 *vec_icmp_bi = NULL;
1466       for (thread_index = 0; thread_index < nthreads; ++thread_index)
1467         {
1468           ip6_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1469           clib_spinlock_lock (&rt->lock);
1470
1471           vec_reset_length (pool_indexes_to_free);
1472           /* *INDENT-OFF* */
1473           pool_foreach_index (index, rt->pool, ({
1474                                 reass = pool_elt_at_index (rt->pool, index);
1475                                 if (now > reass->last_heard + rm->timeout)
1476                                   {
1477                                     vec_add1 (pool_indexes_to_free, index);
1478                                   }
1479                               }));
1480           /* *INDENT-ON* */
1481           int *i;
1482           /* *INDENT-OFF* */
1483           vec_foreach (i, pool_indexes_to_free)
1484           {
1485             ip6_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
1486             u32 icmp_bi = ~0;
1487             ip6_reass_on_timeout (vm, node, rm, reass, &icmp_bi);
1488             if (~0 != icmp_bi)
1489               vec_add1 (vec_icmp_bi, icmp_bi);
1490
1491             ip6_reass_free (rm, rt, reass);
1492           }
1493           /* *INDENT-ON* */
1494
1495           clib_spinlock_unlock (&rt->lock);
1496         }
1497
1498       while (vec_len (vec_icmp_bi) > 0)
1499         {
1500           vlib_frame_t *f =
1501             vlib_get_frame_to_node (vm, rm->ip6_icmp_error_idx);
1502           u32 *to_next = vlib_frame_vector_args (f);
1503           u32 n_left_to_next = VLIB_FRAME_SIZE - f->n_vectors;
1504           int trace_frame = 0;
1505           while (vec_len (vec_icmp_bi) > 0 && n_left_to_next > 0)
1506             {
1507               u32 bi = vec_pop (vec_icmp_bi);
1508               vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1509               if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
1510                 trace_frame = 1;
1511               b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
1512               to_next[0] = bi;
1513               ++f->n_vectors;
1514               to_next += 1;
1515               n_left_to_next -= 1;
1516             }
1517           f->frame_flags |= (trace_frame * VLIB_FRAME_TRACE);
1518           vlib_put_frame_to_node (vm, rm->ip6_icmp_error_idx, f);
1519         }
1520
1521       vec_free (pool_indexes_to_free);
1522       vec_free (vec_icmp_bi);
1523       if (event_data)
1524         {
1525           _vec_len (event_data) = 0;
1526         }
1527     }
1528
1529   return 0;
1530 }
1531
1532 /* *INDENT-OFF* */
1533 VLIB_REGISTER_NODE (ip6_reass_expire_node) = {
1534     .function = ip6_reass_walk_expired,
1535     .format_trace = format_ip6_reass_trace,
1536     .type = VLIB_NODE_TYPE_PROCESS,
1537     .name = "ip6-reassembly-expire-walk",
1538
1539     .n_errors = ARRAY_LEN (ip6_reassembly_error_strings),
1540     .error_strings = ip6_reassembly_error_strings,
1541
1542 };
1543 /* *INDENT-ON* */
1544
1545 static u8 *
1546 format_ip6_reass_key (u8 * s, va_list * args)
1547 {
1548   ip6_reass_key_t *key = va_arg (*args, ip6_reass_key_t *);
1549   s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1550               key->xx_id, format_ip6_address, &key->src, format_ip6_address,
1551               &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1552   return s;
1553 }
1554
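/* Format helper: print a reassembly context header followed by one line per
 * buffer in its chain, showing the byte range each fragment contributes. */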
1555 static u8 *
1556 format_ip6_reass (u8 * s, va_list * args)
1557 {
1558   vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1559   ip6_reass_t *reass = va_arg (*args, ip6_reass_t *);
1560
1561   s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1562               "last_packet_octet: %u, trace_op_counter: %u\n",
1563               reass->id, format_ip6_reass_key, &reass->key, reass->first_bi,
1564               reass->data_len, reass->last_packet_octet,
1565               reass->trace_op_counter);
1566   u32 bi = reass->first_bi;
1567   u32 counter = 0;
1568   while (~0 != bi)
1569     {
1570       vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1571       vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1572       s = format (s, "  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1573                   "fragment[%u, %u]\n",
1574                   counter, vnb->ip.reass.range_first,
1575                   vnb->ip.reass.range_last, bi,
1576                   ip6_reass_buffer_get_data_offset (b),
1577                   ip6_reass_buffer_get_data_len (b),
1578                   vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1579       if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1580         {
1581           bi = b->next_buffer;
1582         }
1583       else
1584         {
1585           bi = ~0;
1586         }
1587     }
1588   return s;
1589 }
1590
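/* CLI handler for "show ip6-reassembly [details]".
 * Illustrative use:
 *   vpp# show ip6-reassembly details
 * With "details", every active reassembly context on every worker is printed
 * before the summary counters. */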
1591 static clib_error_t *
1592 show_ip6_reass (vlib_main_t * vm, unformat_input_t * input,
1593                 CLIB_UNUSED (vlib_cli_command_t * cmd))
1594 {
1595   ip6_reass_main_t *rm = &ip6_reass_main;
1596
1597   vlib_cli_output (vm, "---------------------");
1598   vlib_cli_output (vm, "IP6 reassembly status");
1599   vlib_cli_output (vm, "---------------------");
1600   bool details = false;
1601   if (unformat (input, "details"))
1602     {
1603       details = true;
1604     }
1605
1606   u32 sum_reass_n = 0;
1607   u64 sum_buffers_n = 0;
1608   ip6_reass_t *reass;
1609   uword thread_index;
1610   const uword nthreads = vlib_num_workers () + 1;
1611   for (thread_index = 0; thread_index < nthreads; ++thread_index)
1612     {
1613       ip6_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
1614       clib_spinlock_lock (&rt->lock);
1615       if (details)
1616         {
1617           /* *INDENT-OFF* */
1618           pool_foreach (reass, rt->pool, {
1619             vlib_cli_output (vm, "%U", format_ip6_reass, vm, reass);
1620           });
1621           /* *INDENT-ON* */
1622         }
1623       sum_reass_n += rt->reass_n;
1624       clib_spinlock_unlock (&rt->lock);
1625     }
1626   vlib_cli_output (vm, "---------------------");
1627   vlib_cli_output (vm, "Current IP6 reassemblies count: %lu\n",
1628                    (long unsigned) sum_reass_n);
1629   vlib_cli_output (vm, "Maximum configured concurrent IP6 reassemblies per "
1630                    "worker-thread: %lu\n", (long unsigned) rm->max_reass_n);
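  /* Note: sum_buffers_n is never accumulated in the walk above, so the
   * figure below is currently always reported as 0. */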
1631   vlib_cli_output (vm, "Buffers in use: %lu\n",
1632                    (long unsigned) sum_buffers_n);
1633   return 0;
1634 }
1635
1636 /* *INDENT-OFF* */
1637 VLIB_CLI_COMMAND (show_ip6_reassembly_cmd, static) = {
1638     .path = "show ip6-reassembly",
1639     .short_help = "show ip6-reassembly [details]",
1640     .function = show_ip6_reass,
1641 };
1642 /* *INDENT-ON* */
1643
1644 #ifndef CLIB_MARCH_VARIANT
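/* Enable or disable IPv6 reassembly on an interface by toggling the
 * "ip6-reassembly-feature" node on the "ip6-unicast" feature arc.
 * Illustrative call from an API handler:
 *   rv = ip6_reass_enable_disable (sw_if_index, 1);
 */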
1645 vnet_api_error_t
1646 ip6_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1647 {
1648   return vnet_feature_enable_disable ("ip6-unicast", "ip6-reassembly-feature",
1649                                       sw_if_index, enable_disable, 0, 0);
1650 }
1651 #endif /* CLIB_MARCH_VARIANT */
1652
1653 #define foreach_ip6_reassembly_handoff_error                       \
1654 _(CONGESTION_DROP, "congestion drop")
1655
1656
1657 typedef enum
1658 {
1659 #define _(sym,str) IP6_REASSEMBLY_HANDOFF_ERROR_##sym,
1660   foreach_ip6_reassembly_handoff_error
1661 #undef _
1662     IP6_REASSEMBLY_HANDOFF_N_ERROR,
1663 } ip6_reassembly_handoff_error_t;
1664
1665 static char *ip6_reassembly_handoff_error_strings[] = {
1666 #define _(sym,string) string,
1667   foreach_ip6_reassembly_handoff_error
1668 #undef _
1669 };
1670
1671 typedef struct
1672 {
1673   u32 next_worker_index;
1674 } ip6_reassembly_handoff_trace_t;
1675
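/* Trace formatter for the handoff nodes: shows which worker thread the
 * buffer was handed off to. */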
1676 static u8 *
1677 format_ip6_reassembly_handoff_trace (u8 * s, va_list * args)
1678 {
1679   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1680   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1681   ip6_reassembly_handoff_trace_t *t =
1682     va_arg (*args, ip6_reassembly_handoff_trace_t *);
1683
1684   s =
1685     format (s, "ip6-reassembly-handoff: next-worker %d",
1686             t->next_worker_index);
1687
1688   return s;
1689 }
1690
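/* Handoff worker: for every buffer in the frame, look up the thread that
 * owns the matching reassembly context (feature or non-feature variant) and
 * enqueue the buffer to that thread's frame queue. */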
1691 always_inline uword
1692 ip6_reassembly_handoff_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
1693                                vlib_frame_t * frame, bool is_feature)
1694 {
1695   ip6_reass_main_t *rm = &ip6_reass_main;
1696
1697   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1698   u32 n_enq, n_left_from, *from;
1699   u16 thread_indices[VLIB_FRAME_SIZE], *ti;
1700   u32 fq_index;
1701
1702   from = vlib_frame_vector_args (frame);
1703   n_left_from = frame->n_vectors;
1704   vlib_get_buffers (vm, from, bufs, n_left_from);
1705
1706   b = bufs;
1707   ti = thread_indices;
1708
1709   fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
1710
1711   while (n_left_from > 0)
1712     {
1713       if (is_feature)
1714         ti[0] = vnet_buffer (b[0])->ip.reass.owner_feature_thread_index;
1715       else
1716         ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
1717
1718       if (PREDICT_FALSE
1719           ((node->flags & VLIB_NODE_FLAG_TRACE)
1720            && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
1721         {
1722           ip6_reassembly_handoff_trace_t *t =
1723             vlib_add_trace (vm, node, b[0], sizeof (*t));
1724           t->next_worker_index = ti[0];
1725         }
1726
1727       n_left_from -= 1;
1728       ti += 1;
1729       b += 1;
1730     }
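  /* vlib_buffer_enqueue_to_thread returns how many buffers were actually
   * enqueued; anything it could not place is dropped and counted as a
   * congestion drop below. */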
1731   n_enq =
1732     vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
1733                                    frame->n_vectors, 1);
1734
1735   if (n_enq < frame->n_vectors)
1736     vlib_node_increment_counter (vm, node->node_index,
1737                                  IP6_REASSEMBLY_HANDOFF_ERROR_CONGESTION_DROP,
1738                                  frame->n_vectors - n_enq);
1739   return frame->n_vectors;
1740 }
1741
1742 VLIB_NODE_FN (ip6_reassembly_handoff_node) (vlib_main_t * vm,
1743                                             vlib_node_runtime_t * node,
1744                                             vlib_frame_t * frame)
1745 {
1746   return ip6_reassembly_handoff_inline (vm, node, frame,
1747                                         false /* is_feature */ );
1748 }
1749
1750 /* *INDENT-OFF* */
1751 VLIB_REGISTER_NODE (ip6_reassembly_handoff_node) = {
1752   .name = "ip6-reassembly-handoff",
1753   .vector_size = sizeof (u32),
1754   .n_errors = ARRAY_LEN(ip6_reassembly_handoff_error_strings),
1755   .error_strings = ip6_reassembly_handoff_error_strings,
1756   .format_trace = format_ip6_reassembly_handoff_trace,
1757
1758   .n_next_nodes = 1,
1759
1760   .next_nodes = {
1761     [0] = "error-drop",
1762   },
1763 };
1764 /* *INDENT-ON* */
1765
1766 VLIB_NODE_FN (ip6_reassembly_feature_handoff_node) (vlib_main_t * vm,
1767                                vlib_node_runtime_t * node, vlib_frame_t * frame)
1768 {
1769   return ip6_reassembly_handoff_inline (vm, node, frame, true /* is_feature */ );
1770 }
1771
1772
1773 /* *INDENT-OFF* */
1774 VLIB_REGISTER_NODE (ip6_reassembly_feature_handoff_node) = {
1775   .name = "ip6-reass-feature-hoff",
1776   .vector_size = sizeof (u32),
1777   .n_errors = ARRAY_LEN(ip6_reassembly_handoff_error_strings),
1778   .error_strings = ip6_reassembly_handoff_error_strings,
1779   .format_trace = format_ip6_reassembly_handoff_trace,
1780
1781   .n_next_nodes = 1,
1782
1783   .next_nodes = {
1784     [0] = "error-drop",
1785   },
1786 };
1787 /* *INDENT-ON* */
1788
1789 /*
1790  * fd.io coding-style-patch-verification: ON
1791  *
1792  * Local Variables:
1793  * eval: (c-set-style "gnu")
1794  * End:
1795  */