NAT64: fix TCP session expire (VPP-1390)
[vpp.git] / src / plugins / nat / nat64_out2in.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /**
16  * @file
17  * @brief NAT64 IPv4 to IPv6 translation (outside to inside network)
18  */
19
20 #include <nat/nat64.h>
21 #include <nat/nat_reass.h>
22 #include <nat/nat_inlines.h>
23 #include <vnet/ip/ip4_to_ip6.h>
24 #include <vnet/fib/ip4_fib.h>
25 #include <vnet/udp/udp.h>
26
/* Per-packet trace record for the nat64-out2in node. */
typedef struct
{
  u32 sw_if_index;		/* RX interface of the traced packet */
  u32 next_index;		/* next node chosen for the packet */
} nat64_out2in_trace_t;
32
33 static u8 *
34 format_nat64_out2in_trace (u8 * s, va_list * args)
35 {
36   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
37   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
38   nat64_out2in_trace_t *t = va_arg (*args, nat64_out2in_trace_t *);
39
40   s =
41     format (s, "NAT64-out2in: sw_if_index %d, next index %d", t->sw_if_index,
42             t->next_index);
43
44   return s;
45 }
46
/* Per-packet trace record for the nat64-out2in-reass node. */
typedef struct
{
  u32 sw_if_index;		/* RX interface of the traced packet */
  u32 next_index;		/* next node chosen for the packet */
  u8 cached;			/* 1 if the fragment was cached, 0 if translated */
} nat64_out2in_reass_trace_t;
53
54 static u8 *
55 format_nat64_out2in_reass_trace (u8 * s, va_list * args)
56 {
57   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
58   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
59   nat64_out2in_reass_trace_t *t =
60     va_arg (*args, nat64_out2in_reass_trace_t *);
61
62   s =
63     format (s, "NAT64-out2in-reass: sw_if_index %d, next index %d, status %s",
64             t->sw_if_index, t->next_index,
65             t->cached ? "cached" : "translated");
66
67   return s;
68 }
69
/* Graph-node registrations; the nodes themselves are defined below. */
vlib_node_registration_t nat64_out2in_node;
vlib_node_registration_t nat64_out2in_reass_node;
vlib_node_registration_t nat64_out2in_handoff_node;

/* Error counter (symbol, description) pairs shared by the out2in nodes. */
#define foreach_nat64_out2in_error                       \
_(UNSUPPORTED_PROTOCOL, "Unsupported protocol")          \
_(OUT2IN_PACKETS, "Good out2in packets processed")       \
_(NO_TRANSLATION, "No translation")                      \
_(UNKNOWN, "unknown")                                    \
_(DROP_FRAGMENT, "Drop fragment")                        \
_(MAX_REASS, "Maximum reassemblies exceeded")            \
_(MAX_FRAG, "Maximum fragments per reassembly exceeded")


/* Error counter indices generated from the list above. */
typedef enum
{
#define _(sym,str) NAT64_OUT2IN_ERROR_##sym,
  foreach_nat64_out2in_error
#undef _
    NAT64_OUT2IN_N_ERROR,
} nat64_out2in_error_t;

/* Human-readable strings for the error counters, same order as the enum. */
static char *nat64_out2in_error_strings[] = {
#define _(sym,string) string,
  foreach_nat64_out2in_error
#undef _
};

/* Next-node dispositions; must match the .next_nodes registration below. */
typedef enum
{
  NAT64_OUT2IN_NEXT_IP6_LOOKUP,
  NAT64_OUT2IN_NEXT_IP4_LOOKUP,
  NAT64_OUT2IN_NEXT_DROP,
  NAT64_OUT2IN_NEXT_REASS,
  NAT64_OUT2IN_N_NEXT,
} nat64_out2in_next_t;
106
/* Context handed to the ip4_to_ip6 translation callbacks. */
typedef struct nat64_out2in_set_ctx_t_
{
  vlib_buffer_t *b;		/* buffer being translated */
  vlib_main_t *vm;		/* per-thread vlib main (used for timeouts) */
  u32 thread_index;		/* selects the per-thread NAT64 db */
} nat64_out2in_set_ctx_t;
113
114 static int
115 nat64_out2in_tcp_udp_set_cb (ip4_header_t * ip4, ip6_header_t * ip6,
116                              void *arg)
117 {
118   nat64_main_t *nm = &nat64_main;
119   nat64_out2in_set_ctx_t *ctx = arg;
120   nat64_db_bib_entry_t *bibe;
121   nat64_db_st_entry_t *ste;
122   ip46_address_t saddr, daddr;
123   ip6_address_t ip6_saddr;
124   udp_header_t *udp = ip4_next_header (ip4);
125   tcp_header_t *tcp = ip4_next_header (ip4);
126   u8 proto = ip4->protocol;
127   u16 dport = udp->dst_port;
128   u16 sport = udp->src_port;
129   u32 sw_if_index, fib_index;
130   u16 *checksum;
131   ip_csum_t csum;
132   nat64_db_t *db = &nm->db[ctx->thread_index];
133
134   sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
135   fib_index = ip4_fib_table_get_index_for_sw_if_index (sw_if_index);
136
137   memset (&saddr, 0, sizeof (saddr));
138   saddr.ip4.as_u32 = ip4->src_address.as_u32;
139   memset (&daddr, 0, sizeof (daddr));
140   daddr.ip4.as_u32 = ip4->dst_address.as_u32;
141
142   ste =
143     nat64_db_st_entry_find (db, &daddr, &saddr, dport, sport, proto,
144                             fib_index, 0);
145   if (ste)
146     {
147       bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
148       if (!bibe)
149         return -1;
150     }
151   else
152     {
153       bibe = nat64_db_bib_entry_find (db, &daddr, dport, proto, fib_index, 0);
154
155       if (!bibe)
156         return -1;
157
158       nat64_compose_ip6 (&ip6_saddr, &ip4->src_address, bibe->fib_index);
159       ste =
160         nat64_db_st_entry_create (db, bibe, &ip6_saddr, &saddr.ip4, sport);
161     }
162
163   ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0];
164   ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1];
165
166   ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
167   ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];
168   udp->dst_port = bibe->in_port;
169
170   if (proto == IP_PROTOCOL_UDP)
171     checksum = &udp->checksum;
172   else
173     {
174       checksum = &tcp->checksum;
175       nat64_tcp_session_set_state (ste, tcp, 0);
176     }
177
178   csum = ip_csum_sub_even (*checksum, dport);
179   csum = ip_csum_add_even (csum, udp->dst_port);
180   *checksum = ip_csum_fold (csum);
181
182   vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index;
183
184   nat64_session_reset_timeout (ste, ctx->vm);
185
186   return 0;
187 }
188
189 static int
190 nat64_out2in_icmp_set_cb (ip4_header_t * ip4, ip6_header_t * ip6, void *arg)
191 {
192   nat64_main_t *nm = &nat64_main;
193   nat64_out2in_set_ctx_t *ctx = arg;
194   nat64_db_bib_entry_t *bibe;
195   nat64_db_st_entry_t *ste;
196   ip46_address_t saddr, daddr;
197   ip6_address_t ip6_saddr;
198   u32 sw_if_index, fib_index;
199   icmp46_header_t *icmp = ip4_next_header (ip4);
200   nat64_db_t *db = &nm->db[ctx->thread_index];
201
202   sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
203   fib_index = ip4_fib_table_get_index_for_sw_if_index (sw_if_index);
204
205   memset (&saddr, 0, sizeof (saddr));
206   saddr.ip4.as_u32 = ip4->src_address.as_u32;
207   memset (&daddr, 0, sizeof (daddr));
208   daddr.ip4.as_u32 = ip4->dst_address.as_u32;
209
210   if (icmp->type == ICMP6_echo_request || icmp->type == ICMP6_echo_reply)
211     {
212       u16 out_id = ((u16 *) (icmp))[2];
213       ste =
214         nat64_db_st_entry_find (db, &daddr, &saddr, out_id, 0,
215                                 IP_PROTOCOL_ICMP, fib_index, 0);
216
217       if (ste)
218         {
219           bibe =
220             nat64_db_bib_entry_by_index (db, IP_PROTOCOL_ICMP,
221                                          ste->bibe_index);
222           if (!bibe)
223             return -1;
224         }
225       else
226         {
227           bibe =
228             nat64_db_bib_entry_find (db, &daddr, out_id,
229                                      IP_PROTOCOL_ICMP, fib_index, 0);
230           if (!bibe)
231             return -1;
232
233           nat64_compose_ip6 (&ip6_saddr, &ip4->src_address, bibe->fib_index);
234           ste =
235             nat64_db_st_entry_create (db, bibe, &ip6_saddr, &saddr.ip4, 0);
236         }
237
238       nat64_session_reset_timeout (ste, ctx->vm);
239
240       ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0];
241       ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1];
242
243       ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
244       ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];
245       ((u16 *) (icmp))[2] = bibe->in_port;
246
247       vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index;
248     }
249   else
250     {
251       ip6_header_t *inner_ip6 = (ip6_header_t *) u8_ptr_add (icmp, 8);
252
253       nat64_compose_ip6 (&ip6->src_address, &ip4->src_address,
254                          vnet_buffer (ctx->b)->sw_if_index[VLIB_TX]);
255       ip6->dst_address.as_u64[0] = inner_ip6->src_address.as_u64[0];
256       ip6->dst_address.as_u64[1] = inner_ip6->src_address.as_u64[1];
257     }
258
259   return 0;
260 }
261
/**
 * @brief icmp_to_icmp6 callback translating the inner packet embedded in
 * an out2in ICMP error message.
 *
 * The embedded packet travels in the opposite (in2out) direction, so the
 * session lookup uses (saddr, daddr) order and only existing sessions are
 * accepted — no session is created here.
 *
 * Returns 0 on success, -1 to drop (unsupported inner type or no session).
 */
static int
nat64_out2in_inner_icmp_set_cb (ip4_header_t * ip4, ip6_header_t * ip6,
				void *arg)
{
  nat64_main_t *nm = &nat64_main;
  nat64_out2in_set_ctx_t *ctx = arg;
  nat64_db_bib_entry_t *bibe;
  nat64_db_st_entry_t *ste;
  ip46_address_t saddr, daddr;
  u32 sw_if_index, fib_index;
  u8 proto = ip4->protocol;
  nat64_db_t *db = &nm->db[ctx->thread_index];

  sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
  /* NOTE(review): uses the IPv6 FIB of the RX interface here, unlike the
   * other callbacks which use the IPv4 FIB — confirm this is intended. */
  fib_index =
    fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);

  memset (&saddr, 0, sizeof (saddr));
  saddr.ip4.as_u32 = ip4->src_address.as_u32;
  memset (&daddr, 0, sizeof (daddr));
  daddr.ip4.as_u32 = ip4->dst_address.as_u32;

  if (proto == IP_PROTOCOL_ICMP6)
    {
      icmp46_header_t *icmp = ip4_next_header (ip4);
      /* ICMP identifier sits right after the 4-byte ICMP header. */
      u16 out_id = ((u16 *) (icmp))[2];
      proto = IP_PROTOCOL_ICMP;

      /* Only echo request/reply can appear as a translated inner packet. */
      if (!
	  (icmp->type == ICMP6_echo_request
	   || icmp->type == ICMP6_echo_reply))
	return -1;

      ste =
	nat64_db_st_entry_find (db, &saddr, &daddr, out_id, 0, proto,
				fib_index, 0);
      if (!ste)
	return -1;

      bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
      if (!bibe)
	return -1;

      ip6->dst_address.as_u64[0] = ste->in_r_addr.as_u64[0];
      ip6->dst_address.as_u64[1] = ste->in_r_addr.as_u64[1];
      ip6->src_address.as_u64[0] = bibe->in_addr.as_u64[0];
      ip6->src_address.as_u64[1] = bibe->in_addr.as_u64[1];
      ((u16 *) (icmp))[2] = bibe->in_port;

      vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index;
    }
  else
    {
      /* udp and tcp alias the same L4 header; the port fields overlay. */
      udp_header_t *udp = ip4_next_header (ip4);
      tcp_header_t *tcp = ip4_next_header (ip4);
      u16 dport = udp->dst_port;
      u16 sport = udp->src_port;
      u16 *checksum;
      ip_csum_t csum;

      ste =
	nat64_db_st_entry_find (db, &saddr, &daddr, sport, dport, proto,
				fib_index, 0);
      if (!ste)
	return -1;

      bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
      if (!bibe)
	return -1;

      nat64_compose_ip6 (&ip6->dst_address, &daddr.ip4, bibe->fib_index);
      ip6->src_address.as_u64[0] = bibe->in_addr.as_u64[0];
      ip6->src_address.as_u64[1] = bibe->in_addr.as_u64[1];
      udp->src_port = bibe->in_port;

      if (proto == IP_PROTOCOL_UDP)
	checksum = &udp->checksum;
      else
	checksum = &tcp->checksum;
      /* Skip update if the (IPv4 UDP) checksum is zero, i.e. not present. */
      if (*checksum)
	{
	  csum = ip_csum_sub_even (*checksum, sport);
	  csum = ip_csum_add_even (csum, udp->src_port);
	  *checksum = ip_csum_fold (csum);
	}

      vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index;
    }

  return 0;
}
353
354 static int
355 nat64_out2in_unk_proto_set_cb (ip4_header_t * ip4, ip6_header_t * ip6,
356                                void *arg)
357 {
358   nat64_main_t *nm = &nat64_main;
359   nat64_out2in_set_ctx_t *ctx = arg;
360   nat64_db_bib_entry_t *bibe;
361   nat64_db_st_entry_t *ste;
362   ip46_address_t saddr, daddr;
363   ip6_address_t ip6_saddr;
364   u32 sw_if_index, fib_index;
365   u8 proto = ip4->protocol;
366   nat64_db_t *db = &nm->db[ctx->thread_index];
367
368   sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
369   fib_index = ip4_fib_table_get_index_for_sw_if_index (sw_if_index);
370
371   memset (&saddr, 0, sizeof (saddr));
372   saddr.ip4.as_u32 = ip4->src_address.as_u32;
373   memset (&daddr, 0, sizeof (daddr));
374   daddr.ip4.as_u32 = ip4->dst_address.as_u32;
375
376   ste =
377     nat64_db_st_entry_find (db, &daddr, &saddr, 0, 0, proto, fib_index, 0);
378   if (ste)
379     {
380       bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
381       if (!bibe)
382         return -1;
383     }
384   else
385     {
386       bibe = nat64_db_bib_entry_find (db, &daddr, 0, proto, fib_index, 0);
387
388       if (!bibe)
389         return -1;
390
391       nat64_compose_ip6 (&ip6_saddr, &ip4->src_address, bibe->fib_index);
392       ste = nat64_db_st_entry_create (db, bibe, &ip6_saddr, &saddr.ip4, 0);
393     }
394
395   nat64_session_reset_timeout (ste, ctx->vm);
396
397   ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0];
398   ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1];
399
400   ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
401   ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];
402
403   vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index;
404
405   return 0;
406 }
407
/**
 * @brief nat64-out2in node function: translates IPv4 packets arriving on
 * the outside interface to IPv6 and forwards them to ip6-lookup.
 *
 * Dispatch per packet: unknown protocol -> unk_proto callback; IPv4
 * fragment -> nat64-out2in-reass; ICMP -> icmp_to_icmp6; TCP/UDP ->
 * ip4_to_ip6_tcp_udp.  Untranslatable packets are dropped, except
 * DHCP-to-client UDP which is handed back to ip4-lookup.
 */
static uword
nat64_out2in_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
		      vlib_frame_t * frame)
{
  u32 n_left_from, *from, *to_next;
  nat64_out2in_next_t next_index;
  u32 pkts_processed = 0;
  u32 thread_index = vm->thread_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0;
	  ip4_header_t *ip40;
	  u32 proto0;
	  nat64_out2in_set_ctx_t ctx0;
	  udp_header_t *udp0;

	  /* speculatively enqueue b0 to the current next frame */
	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  ip40 = vlib_buffer_get_current (b0);

	  ctx0.b = b0;
	  ctx0.vm = vm;
	  ctx0.thread_index = thread_index;

	  next0 = NAT64_OUT2IN_NEXT_IP6_LOOKUP;

	  proto0 = ip_proto_to_snat_proto (ip40->protocol);

	  /* ~0 == not TCP/UDP/ICMP: try the unknown-protocol path. */
	  if (PREDICT_FALSE (proto0 == ~0))
	    {
	      if (ip4_to_ip6 (b0, nat64_out2in_unk_proto_set_cb, &ctx0))
		{
		  next0 = NAT64_OUT2IN_NEXT_DROP;
		  b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
		}
	      goto trace0;
	    }

	  /* Fragments need the L4 header from the first fragment; hand
	     them to the reassembly node. */
	  if (PREDICT_FALSE (ip4_is_fragment (ip40)))
	    {
	      next0 = NAT64_OUT2IN_NEXT_REASS;
	      goto trace0;
	    }

	  if (proto0 == SNAT_PROTOCOL_ICMP)
	    {
	      if (icmp_to_icmp6
		  (b0, nat64_out2in_icmp_set_cb, &ctx0,
		   nat64_out2in_inner_icmp_set_cb, &ctx0))
		{
		  next0 = NAT64_OUT2IN_NEXT_DROP;
		  b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
		  goto trace0;
		}
	    }
	  else
	    {
	      if (ip4_to_ip6_tcp_udp (b0, nat64_out2in_tcp_udp_set_cb, &ctx0))
		{
		  udp0 = ip4_next_header (ip40);
		  /*
		   * Send DHCP packets to the ipv4 stack, or we won't
		   * be able to use dhcp client on the outside interface
		   */
		  if ((proto0 == SNAT_PROTOCOL_UDP)
		      && (udp0->dst_port ==
			  clib_host_to_net_u16 (UDP_DST_PORT_dhcp_to_client)))
		    {
		      next0 = NAT64_OUT2IN_NEXT_IP4_LOOKUP;
		      goto trace0;
		    }
		  next0 = NAT64_OUT2IN_NEXT_DROP;
		  b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
		  goto trace0;
		}
	    }

	trace0:
	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
			     && (b0->flags & VLIB_BUFFER_IS_TRACED)))
	    {
	      nat64_out2in_trace_t *t =
		vlib_add_trace (vm, node, b0, sizeof (*t));
	      t->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
	      t->next_index = next0;
	    }

	  pkts_processed += next0 != NAT64_OUT2IN_NEXT_DROP;

	  /* verify speculative enqueue, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, bi0, next0);
	}
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }
  vlib_node_increment_counter (vm, nat64_out2in_node.index,
			       NAT64_OUT2IN_ERROR_OUT2IN_PACKETS,
			       pkts_processed);
  return frame->n_vectors;
}
527
/* *INDENT-OFF* */
/* Registration of the nat64-out2in graph node. */
VLIB_REGISTER_NODE (nat64_out2in_node) = {
  .function = nat64_out2in_node_fn,
  .name = "nat64-out2in",
  .vector_size = sizeof (u32),
  .format_trace = format_nat64_out2in_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .n_errors = ARRAY_LEN (nat64_out2in_error_strings),
  .error_strings = nat64_out2in_error_strings,
  .n_next_nodes = NAT64_OUT2IN_N_NEXT,
  /* edit / add dispositions here */
  .next_nodes = {
    [NAT64_OUT2IN_NEXT_DROP] = "error-drop",
    [NAT64_OUT2IN_NEXT_IP6_LOOKUP] = "ip6-lookup",
    [NAT64_OUT2IN_NEXT_IP4_LOOKUP] = "ip4-lookup",
    [NAT64_OUT2IN_NEXT_REASS] = "nat64-out2in-reass",
  },
};
/* *INDENT-ON* */

VLIB_NODE_FUNCTION_MULTIARCH (nat64_out2in_node, nat64_out2in_node_fn);
549
/* Context handed to the fragment-translation callback. */
typedef struct nat64_out2in_frag_set_ctx_t_
{
  vlib_main_t *vm;		/* per-thread vlib main (used for timeouts) */
  vlib_buffer_t *b;		/* buffer being translated */
  u32 sess_index;		/* session resolved by the reassembly node */
  u32 thread_index;		/* selects the per-thread NAT64 db */
  u8 proto;			/* IP protocol of the fragmented packet */
  u8 first_frag;		/* 1 if this fragment carries the L4 header */
} nat64_out2in_frag_set_ctx_t;
559
560 static int
561 nat64_out2in_frag_set_cb (ip4_header_t * ip4, ip6_header_t * ip6, void *arg)
562 {
563   nat64_main_t *nm = &nat64_main;
564   nat64_out2in_frag_set_ctx_t *ctx = arg;
565   nat64_db_st_entry_t *ste;
566   nat64_db_bib_entry_t *bibe;
567   udp_header_t *udp = ip4_next_header (ip4);
568   ip_csum_t csum;
569   u16 *checksum;
570   nat64_db_t *db = &nm->db[ctx->thread_index];
571
572   ste = nat64_db_st_entry_by_index (db, ctx->proto, ctx->sess_index);
573   if (!ste)
574     return -1;
575
576   bibe = nat64_db_bib_entry_by_index (db, ctx->proto, ste->bibe_index);
577   if (!bibe)
578     return -1;
579
580   if (ctx->first_frag)
581     {
582       udp->dst_port = bibe->in_port;
583
584       if (ip4->protocol == IP_PROTOCOL_UDP)
585         {
586           checksum = &udp->checksum;
587
588           if (!checksum)
589             {
590               u16 udp_len =
591                 clib_host_to_net_u16 (ip4->length) - sizeof (*ip4);
592               csum = ip_incremental_checksum (0, udp, udp_len);
593               csum =
594                 ip_csum_with_carry (csum, clib_host_to_net_u16 (udp_len));
595               csum =
596                 ip_csum_with_carry (csum,
597                                     clib_host_to_net_u16 (IP_PROTOCOL_UDP));
598               csum = ip_csum_with_carry (csum, ste->in_r_addr.as_u64[0]);
599               csum = ip_csum_with_carry (csum, ste->in_r_addr.as_u64[1]);
600               csum = ip_csum_with_carry (csum, bibe->in_addr.as_u64[0]);
601               csum = ip_csum_with_carry (csum, bibe->in_addr.as_u64[1]);
602               *checksum = ~ip_csum_fold (csum);
603             }
604           else
605             {
606               csum = ip_csum_sub_even (*checksum, bibe->out_addr.as_u32);
607               csum = ip_csum_sub_even (csum, ste->out_r_addr.as_u32);
608               csum = ip_csum_sub_even (csum, bibe->out_port);
609               csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[0]);
610               csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[1]);
611               csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[0]);
612               csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[1]);
613               csum = ip_csum_add_even (csum, bibe->in_port);
614               *checksum = ip_csum_fold (csum);
615             }
616         }
617       else
618         {
619           tcp_header_t *tcp = ip4_next_header (ip4);
620           nat64_tcp_session_set_state (ste, tcp, 0);
621           checksum = &tcp->checksum;
622           csum = ip_csum_sub_even (*checksum, bibe->out_addr.as_u32);
623           csum = ip_csum_sub_even (csum, ste->out_r_addr.as_u32);
624           csum = ip_csum_sub_even (csum, bibe->out_port);
625           csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[0]);
626           csum = ip_csum_add_even (csum, ste->in_r_addr.as_u64[1]);
627           csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[0]);
628           csum = ip_csum_add_even (csum, bibe->in_addr.as_u64[1]);
629           csum = ip_csum_add_even (csum, bibe->in_port);
630           *checksum = ip_csum_fold (csum);
631         }
632
633     }
634
635   ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0];
636   ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1];
637
638   ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
639   ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];
640
641   vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index;
642
643   nat64_session_reset_timeout (ste, ctx->vm);
644
645   return 0;
646 }
647
/**
 * @brief nat64-out2in-reass node function: virtual reassembly for
 * fragmented out2in TCP/UDP packets.
 *
 * The first fragment resolves (or creates) the NAT64 session and stores
 * its index in the reassembly context; non-first fragments arriving
 * before the first one are cached and replayed ("looped back") once the
 * session is known.  Each fragment is then translated individually via
 * ip4_to_ip6_fragmented.
 */
static uword
nat64_out2in_reass_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame)
{
  u32 n_left_from, *from, *to_next;
  nat64_out2in_next_t next_index;
  u32 pkts_processed = 0;
  u32 *fragments_to_drop = 0;	/* buffers to drop at the end of the frame */
  u32 *fragments_to_loopback = 0;	/* cached fragments to re-process */
  nat64_main_t *nm = &nat64_main;
  u32 thread_index = vm->thread_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0;
	  ip4_header_t *ip40;
	  u8 cached0 = 0;
	  u32 sw_if_index0, fib_index0;
	  udp_header_t *udp0;
	  nat_reass_ip4_t *reass0;
	  ip46_address_t saddr0, daddr0;
	  nat64_db_st_entry_t *ste0;
	  nat64_db_bib_entry_t *bibe0;
	  ip6_address_t ip6_saddr0;
	  nat64_out2in_frag_set_ctx_t ctx0;
	  nat64_db_t *db = &nm->db[thread_index];

	  /* speculatively enqueue b0 to the current next frame */
	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  next0 = NAT64_OUT2IN_NEXT_IP6_LOOKUP;

	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
	  fib_index0 =
	    fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
						 sw_if_index0);

	  ctx0.thread_index = thread_index;

	  /* Global "drop all fragments" knob. */
	  if (PREDICT_FALSE (nat_reass_is_drop_frag (1)))
	    {
	      next0 = NAT64_OUT2IN_NEXT_DROP;
	      b0->error = node->errors[NAT64_OUT2IN_ERROR_DROP_FRAGMENT];
	      goto trace0;
	    }

	  ip40 = vlib_buffer_get_current (b0);

	  /* Only TCP and UDP fragments are supported here. */
	  if (PREDICT_FALSE (!(ip40->protocol == IP_PROTOCOL_TCP
			       || ip40->protocol == IP_PROTOCOL_UDP)))
	    {
	      next0 = NAT64_OUT2IN_NEXT_DROP;
	      b0->error = node->errors[NAT64_OUT2IN_ERROR_DROP_FRAGMENT];
	      goto trace0;
	    }

	  udp0 = ip4_next_header (ip40);

	  reass0 = nat_ip4_reass_find_or_create (ip40->src_address,
						 ip40->dst_address,
						 ip40->fragment_id,
						 ip40->protocol,
						 1, &fragments_to_drop);

	  if (PREDICT_FALSE (!reass0))
	    {
	      next0 = NAT64_OUT2IN_NEXT_DROP;
	      b0->error = node->errors[NAT64_OUT2IN_ERROR_MAX_REASS];
	      goto trace0;
	    }

	  if (PREDICT_FALSE (ip4_is_first_fragment (ip40)))
	    {
	      /* First fragment carries the L4 ports: resolve the session
	         and publish its index for the other fragments. */
	      ctx0.first_frag = 1;

	      memset (&saddr0, 0, sizeof (saddr0));
	      saddr0.ip4.as_u32 = ip40->src_address.as_u32;
	      memset (&daddr0, 0, sizeof (daddr0));
	      daddr0.ip4.as_u32 = ip40->dst_address.as_u32;

	      ste0 =
		nat64_db_st_entry_find (db, &daddr0, &saddr0,
					udp0->dst_port, udp0->src_port,
					ip40->protocol, fib_index0, 0);
	      if (!ste0)
		{
		  bibe0 =
		    nat64_db_bib_entry_find (db, &daddr0, udp0->dst_port,
					     ip40->protocol, fib_index0, 0);
		  if (!bibe0)
		    {
		      next0 = NAT64_OUT2IN_NEXT_DROP;
		      b0->error =
			node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
		      goto trace0;
		    }

		  nat64_compose_ip6 (&ip6_saddr0, &ip40->src_address,
				     bibe0->fib_index);
		  ste0 =
		    nat64_db_st_entry_create (db, bibe0, &ip6_saddr0,
					      &saddr0.ip4, udp0->src_port);

		  if (!ste0)
		    {
		      next0 = NAT64_OUT2IN_NEXT_DROP;
		      b0->error =
			node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
		      goto trace0;
		    }
		}
	      reass0->sess_index = nat64_db_st_entry_get_index (db, ste0);
	      reass0->thread_index = thread_index;

	      /* Replay fragments cached while waiting for this one. */
	      nat_ip4_reass_get_frags (reass0, &fragments_to_loopback);
	    }
	  else
	    {
	      ctx0.first_frag = 0;

	      /* Session not known yet: cache this fragment for later. */
	      if (PREDICT_FALSE (reass0->sess_index == (u32) ~ 0))
		{
		  if (nat_ip4_reass_add_fragment (reass0, bi0))
		    {
		      b0->error = node->errors[NAT64_OUT2IN_ERROR_MAX_FRAG];
		      next0 = NAT64_OUT2IN_NEXT_DROP;
		      goto trace0;
		    }
		  cached0 = 1;
		  goto trace0;
		}
	    }

	  ctx0.sess_index = reass0->sess_index;
	  ctx0.proto = ip40->protocol;
	  ctx0.vm = vm;
	  ctx0.b = b0;

	  if (ip4_to_ip6_fragmented (b0, nat64_out2in_frag_set_cb, &ctx0))
	    {
	      next0 = NAT64_OUT2IN_NEXT_DROP;
	      b0->error = node->errors[NAT64_OUT2IN_ERROR_UNKNOWN];
	      goto trace0;
	    }

	trace0:
	  if (PREDICT_FALSE
	      ((node->flags & VLIB_NODE_FLAG_TRACE)
	       && (b0->flags & VLIB_BUFFER_IS_TRACED)))
	    {
	      nat64_out2in_reass_trace_t *t =
		vlib_add_trace (vm, node, b0, sizeof (*t));
	      t->cached = cached0;
	      t->sw_if_index = sw_if_index0;
	      t->next_index = next0;
	    }

	  if (cached0)
	    {
	      /* Cached buffer is not forwarded: undo the speculative
	         enqueue. */
	      n_left_to_next++;
	      to_next--;
	    }
	  else
	    {
	      pkts_processed += next0 != NAT64_OUT2IN_NEXT_DROP;

	      /* verify speculative enqueue, maybe switch current next frame */
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       bi0, next0);
	    }

	  /* Input exhausted: refill the input vector with any cached
	     fragments released for loopback processing. */
	  if (n_left_from == 0 && vec_len (fragments_to_loopback))
	    {
	      from = vlib_frame_vector_args (frame);
	      u32 len = vec_len (fragments_to_loopback);
	      if (len <= VLIB_FRAME_SIZE)
		{
		  clib_memcpy (from, fragments_to_loopback,
			       sizeof (u32) * len);
		  n_left_from = len;
		  vec_reset_length (fragments_to_loopback);
		}
	      else
		{
		  clib_memcpy (from,
			       fragments_to_loopback + (len -
							VLIB_FRAME_SIZE),
			       sizeof (u32) * VLIB_FRAME_SIZE);
		  n_left_from = VLIB_FRAME_SIZE;
		  _vec_len (fragments_to_loopback) = len - VLIB_FRAME_SIZE;
		}
	    }
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  vlib_node_increment_counter (vm, nat64_out2in_reass_node.index,
			       NAT64_OUT2IN_ERROR_OUT2IN_PACKETS,
			       pkts_processed);

  nat_send_all_to_node (vm, fragments_to_drop, node,
			&node->errors[NAT64_OUT2IN_ERROR_DROP_FRAGMENT],
			NAT64_OUT2IN_NEXT_DROP);

  vec_free (fragments_to_drop);
  vec_free (fragments_to_loopback);
  return frame->n_vectors;
}
876
/* *INDENT-OFF* */
/**
 * @brief Graph node registration for NAT64 out2in fragment reassembly.
 *
 * Registers nat64_out2in_reass_node_fn as an internal node; translated
 * fragments go to ip6-lookup, untranslated ones to ip4-lookup, cached
 * fragments loop back through nat64-out2in-reass, errors to error-drop.
 */
VLIB_REGISTER_NODE (nat64_out2in_reass_node) = {
  .function = nat64_out2in_reass_node_fn,
  .name = "nat64-out2in-reass",
  .vector_size = sizeof (u32),
  .format_trace = format_nat64_out2in_reass_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .n_errors = ARRAY_LEN (nat64_out2in_error_strings),
  .error_strings = nat64_out2in_error_strings,
  .n_next_nodes = NAT64_OUT2IN_N_NEXT,
  /* edit / add dispositions here */
  .next_nodes = {
    [NAT64_OUT2IN_NEXT_DROP] = "error-drop",
    [NAT64_OUT2IN_NEXT_IP6_LOOKUP] = "ip6-lookup",
    [NAT64_OUT2IN_NEXT_IP4_LOOKUP] = "ip4-lookup",
    [NAT64_OUT2IN_NEXT_REASS] = "nat64-out2in-reass",
  },
};
/* *INDENT-ON* */
896
/* Generate CPU-architecture-specific variants of the reass node function. */
VLIB_NODE_FUNCTION_MULTIARCH (nat64_out2in_reass_node,
                              nat64_out2in_reass_node_fn);
899
/** Per-packet trace record for the nat64-out2in-handoff node. */
typedef struct
{
  u32 next_worker_index;	/**< worker thread selected for this packet */
  u8 do_handoff;		/**< 1 if handed to another worker, 0 if kept */
} nat64_out2in_handoff_trace_t;
905
906 static u8 *
907 format_nat64_out2in_handoff_trace (u8 * s, va_list * args)
908 {
909   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
910   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
911   nat64_out2in_handoff_trace_t *t =
912     va_arg (*args, nat64_out2in_handoff_trace_t *);
913   char *m;
914
915   m = t->do_handoff ? "next worker" : "same worker";
916   s = format (s, "NAT64-OUT2IN-HANDOFF: %s %d", m, t->next_worker_index);
917
918   return s;
919 }
920
/**
 * @brief Dispatch NAT64 out2in packets to the worker thread owning them.
 *
 * For each IPv4 packet, nat64_get_worker_out2in() picks the owning worker.
 * Packets owned by the current thread are enqueued straight to the
 * nat64-out2in node; all others are handed off through the out2in frame
 * queue.  Packets aimed at a congested worker queue are sent to the error
 * node for dropping instead of blocking the data path.
 */
static inline uword
nat64_out2in_handoff_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
                              vlib_frame_t * frame)
{
  nat64_main_t *nm = &nat64_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  u32 n_left_from, *from, *to_next = 0, *to_next_drop = 0;
  /* Per-thread caches of in-flight hand-off elements and congestion state,
     indexed by destination worker. */
  static __thread vlib_frame_queue_elt_t **handoff_queue_elt_by_worker_index;
  static __thread vlib_frame_queue_t **congested_handoff_queue_by_worker_index
    = 0;
  vlib_frame_queue_elt_t *hf = 0;
  vlib_frame_queue_t *fq;
  vlib_frame_t *f = 0, *d = 0;	/* f: local-work frame, d: drop frame */
  int i;
  u32 n_left_to_next_worker = 0, *to_next_worker = 0;
  u32 next_worker_index = 0;
  u32 current_worker_index = ~0;
  u32 thread_index = vm->thread_index;
  u32 fq_index;
  u32 to_node_index;

  fq_index = nm->fq_out2in_index;
  to_node_index = nat64_out2in_node.index;

  /* First call on this thread: size the per-worker caches. */
  if (PREDICT_FALSE (handoff_queue_elt_by_worker_index == 0))
    {
      vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);

      vec_validate_init_empty (congested_handoff_queue_by_worker_index,
                               tm->n_vlib_mains - 1,
                               (vlib_frame_queue_t *) (~0));
    }

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  while (n_left_from > 0)
    {
      u32 bi0;
      vlib_buffer_t *b0;
      ip4_header_t *ip0;
      u8 do_handoff;

      bi0 = from[0];
      from += 1;
      n_left_from -= 1;

      b0 = vlib_get_buffer (vm, bi0);

      ip0 = vlib_buffer_get_current (b0);

      next_worker_index = nat64_get_worker_out2in (ip0);

      if (PREDICT_FALSE (next_worker_index != thread_index))
        {
          do_handoff = 1;

          /* Target worker changed since the last packet: close out the
             previous hand-off element and open one for the new worker. */
          if (next_worker_index != current_worker_index)
            {
              /* 30 is the queue-depth congestion threshold. */
              fq =
                is_vlib_frame_queue_congested (fq_index, next_worker_index,
                                               30,
                                               congested_handoff_queue_by_worker_index);

              /* Destination queue congested - drop rather than block. */
              if (fq)
                {
                  /* if this is 1st frame */
                  if (!d)
                    {
                      d = vlib_get_frame_to_node (vm, nm->error_node_index);
                      to_next_drop = vlib_frame_vector_args (d);
                    }

                  to_next_drop[0] = bi0;
                  to_next_drop += 1;
                  d->n_vectors++;
                  goto trace0;
                }

              /* Record how far we filled the element we are leaving. */
              if (hf)
                hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

              hf =
                vlib_get_worker_handoff_queue_elt (fq_index,
                                                   next_worker_index,
                                                   handoff_queue_elt_by_worker_index);
              n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
              to_next_worker = &hf->buffer_index[hf->n_vectors];
              current_worker_index = next_worker_index;
            }

          ASSERT (to_next_worker != 0);

          /* enqueue to correct worker thread */
          to_next_worker[0] = bi0;
          to_next_worker++;
          n_left_to_next_worker--;

          /* Element is full - ship it to the worker immediately. */
          if (n_left_to_next_worker == 0)
            {
              hf->n_vectors = VLIB_FRAME_SIZE;
              vlib_put_frame_queue_elt (hf);
              current_worker_index = ~0;
              handoff_queue_elt_by_worker_index[next_worker_index] = 0;
              hf = 0;
            }
        }
      else
        {
          /* Packet belongs to this thread - process locally. */
          do_handoff = 0;
          /* if this is 1st frame */
          if (!f)
            {
              f = vlib_get_frame_to_node (vm, to_node_index);
              to_next = vlib_frame_vector_args (f);
            }

          to_next[0] = bi0;
          to_next += 1;
          f->n_vectors++;
        }

    trace0:
      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b0->flags & VLIB_BUFFER_IS_TRACED)))
        {
          nat64_out2in_handoff_trace_t *t =
            vlib_add_trace (vm, node, b0, sizeof (*t));
          t->next_worker_index = next_worker_index;
          t->do_handoff = do_handoff;
        }
    }

  /* Flush the locally-processed and to-be-dropped frames. */
  if (f)
    vlib_put_frame_to_node (vm, to_node_index, f);

  if (d)
    vlib_put_frame_to_node (vm, nm->error_node_index, d);

  /* Fix up the vector count of the last partially-filled element. */
  if (hf)
    hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

  /* Ship frames to the worker nodes */
  for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
    {
      if (handoff_queue_elt_by_worker_index[i])
        {
          hf = handoff_queue_elt_by_worker_index[i];
          /*
           * It works better to let the handoff node
           * rate-adapt, always ship the handoff queue element.
           */
          if (1 || hf->n_vectors == hf->last_n_vectors)
            {
              vlib_put_frame_queue_elt (hf);
              handoff_queue_elt_by_worker_index[i] = 0;
            }
          else
            hf->last_n_vectors = hf->n_vectors;
        }
      /* Reset congestion state for the next frame. */
      congested_handoff_queue_by_worker_index[i] =
        (vlib_frame_queue_t *) (~0);
    }
  hf = 0;
  current_worker_index = ~0;
  return frame->n_vectors;
}
1089
/* *INDENT-OFF* */
/**
 * @brief Graph node registration for NAT64 out2in worker hand-off.
 *
 * Single next node: error-drop, used when a worker's frame queue is
 * congested; all other packets leave via frame queues or direct frames
 * inside the node function.
 */
VLIB_REGISTER_NODE (nat64_out2in_handoff_node) = {
  .function = nat64_out2in_handoff_node_fn,
  .name = "nat64-out2in-handoff",
  .vector_size = sizeof (u32),
  .format_trace = format_nat64_out2in_handoff_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1105
/* Generate CPU-architecture-specific variants of the handoff node function. */
VLIB_NODE_FUNCTION_MULTIARCH (nat64_out2in_handoff_node,
                              nat64_out2in_handoff_node_fn);
1108 /*
1109  * fd.io coding-style-patch-verification: ON
1110  *
1111  * Local Variables:
1112  * eval: (c-set-style "gnu")
1113  * End:
1114  */