NAT: TCP MSS clamping
[vpp.git] / src / plugins / nat / nat64_in2out.c
1 /*
2  * Copyright (c) 2017 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /**
16  * @file
17  * @brief NAT64 IPv6 to IPv4 translation (inside to outside network)
18  */
19
20 #include <nat/nat64.h>
21 #include <nat/nat_reass.h>
22 #include <nat/nat_inlines.h>
23 #include <vnet/ip/ip6_to_ip4.h>
24 #include <vnet/fib/fib_table.h>
25
/** Packet-trace record for the NAT64 in2out fast and slow path nodes. */
typedef struct
{
  u32 sw_if_index;		/**< RX interface of the traced packet */
  u32 next_index;		/**< next node chosen for the packet */
  u8 is_slow_path;		/**< 1 when traced by the slow path node */
} nat64_in2out_trace_t;
32
33 static u8 *
34 format_nat64_in2out_trace (u8 * s, va_list * args)
35 {
36   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
37   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
38   nat64_in2out_trace_t *t = va_arg (*args, nat64_in2out_trace_t *);
39   char *tag;
40
41   tag = t->is_slow_path ? "NAT64-in2out-slowpath" : "NAT64-in2out";
42
43   s =
44     format (s, "%s: sw_if_index %d, next index %d", tag, t->sw_if_index,
45             t->next_index);
46
47   return s;
48 }
49
/** Packet-trace record for the NAT64 in2out reassembly node. */
typedef struct
{
  u32 sw_if_index;		/**< RX interface of the traced packet */
  u32 next_index;		/**< next node chosen for the packet */
  u8 cached;			/**< 1 if the fragment was cached, 0 if translated */
} nat64_in2out_reass_trace_t;
56
57 static u8 *
58 format_nat64_in2out_reass_trace (u8 * s, va_list * args)
59 {
60   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
61   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
62   nat64_in2out_reass_trace_t *t =
63     va_arg (*args, nat64_in2out_reass_trace_t *);
64
65   s =
66     format (s, "NAT64-in2out-reass: sw_if_index %d, next index %d, status %s",
67             t->sw_if_index, t->next_index,
68             t->cached ? "cached" : "translated");
69
70   return s;
71 }
72
73 vlib_node_registration_t nat64_in2out_node;
74 vlib_node_registration_t nat64_in2out_slowpath_node;
75 vlib_node_registration_t nat64_in2out_reass_node;
76 vlib_node_registration_t nat64_in2out_handoff_node;
77
/* Error counters for the in2out nodes: every _(SYM, "str") pair expands
   to a NAT64_IN2OUT_ERROR_##SYM enum value below and to its counter
   description string. */
#define foreach_nat64_in2out_error                       \
_(UNSUPPORTED_PROTOCOL, "unsupported protocol")          \
_(IN2OUT_PACKETS, "good in2out packets processed")       \
_(NO_TRANSLATION, "no translation")                      \
_(UNKNOWN, "unknown")                                    \
_(DROP_FRAGMENT, "Drop fragment")                        \
_(MAX_REASS, "Maximum reassemblies exceeded")            \
_(MAX_FRAG, "Maximum fragments per reassembly exceeded")


typedef enum
{
#define _(sym,str) NAT64_IN2OUT_ERROR_##sym,
  foreach_nat64_in2out_error
#undef _
    NAT64_IN2OUT_N_ERROR,
} nat64_in2out_error_t;

/* Counter strings, indexed by nat64_in2out_error_t. */
static char *nat64_in2out_error_strings[] = {
#define _(sym,string) string,
  foreach_nat64_in2out_error
#undef _
};
101
/* Next-node indices used by the in2out graph nodes. */
typedef enum
{
  NAT64_IN2OUT_NEXT_IP4_LOOKUP,
  NAT64_IN2OUT_NEXT_IP6_LOOKUP,
  NAT64_IN2OUT_NEXT_DROP,
  NAT64_IN2OUT_NEXT_SLOWPATH,
  NAT64_IN2OUT_NEXT_REASS,
  NAT64_IN2OUT_N_NEXT,
} nat64_in2out_next_t;

/* Context passed to the ip6_to_ip4 translation callbacks. */
typedef struct nat64_in2out_set_ctx_t_
{
  vlib_buffer_t *b;		/* buffer currently being translated */
  vlib_main_t *vm;		/* vlib main of the current worker */
  u32 thread_index;		/* selects the per-thread NAT64 db */
} nat64_in2out_set_ctx_t;
118
/**
 * @brief Check whether traffic is addressed to the router itself.
 *
 * Walks the IPv6 addresses assigned to @a sw_if_index and returns 1 if
 * @a ip6_addr matches one of them (such packets must not be translated).
 *
 * @returns 1 when the address is local to the interface, otherwise 0.
 */
static inline u8
nat64_not_translate (u32 sw_if_index, ip6_address_t ip6_addr)
{
  ip6_address_t *addr;
  ip6_main_t *im6 = &ip6_main;
  ip_lookup_main_t *lm6 = &im6->lookup_main;
  ip_interface_address_t *ia = 0;

  /* *INDENT-OFF* */
  foreach_ip_interface_address (lm6, ia, sw_if_index, 0,
  ({
	addr = ip_interface_address_get_address (lm6, ia);
	if (0 == ip6_address_compare (addr, &ip6_addr))
		return 1;
  }));
  /* *INDENT-ON* */

  return 0;
}
138
139 /**
140  * @brief Check whether is a hairpinning.
141  *
142  * If the destination IP address of the packet is an IPv4 address assigned to
143  * the NAT64 itself, then the packet is a hairpin packet.
144  *
145  * param dst_addr Destination address of the packet.
146  *
147  * @returns 1 if hairpinning, otherwise 0.
148  */
149 static_always_inline int
150 is_hairpinning (ip6_address_t * dst_addr)
151 {
152   nat64_main_t *nm = &nat64_main;
153   int i;
154
155   for (i = 0; i < vec_len (nm->addr_pool); i++)
156     {
157       if (nm->addr_pool[i].addr.as_u32 == dst_addr->as_u32[3])
158         return 1;
159     }
160
161   return 0;
162 }
163
/**
 * @brief ip6_to_ip4 set callback for TCP and UDP packets.
 *
 * Finds (or creates) the BIB and session-table entries for the flow in
 * the per-thread NAT64 db, then writes the translated outside source
 * address/port and the session's remote IPv4 address into the packet.
 * For TCP the checksum is incrementally updated for the source port
 * rewrite and MSS clamping is applied.
 *
 * @param ip6 Original IPv6 header.
 * @param ip4 IPv4 header being constructed by the translator.
 * @param arg nat64_in2out_set_ctx_t (buffer, vm, thread index).
 *
 * @returns 0 on success, -1 when no translation can be made.
 */
static int
nat64_in2out_tcp_udp_set_cb (ip6_header_t * ip6, ip4_header_t * ip4,
			     void *arg)
{
  nat64_main_t *nm = &nat64_main;
  nat64_in2out_set_ctx_t *ctx = arg;
  nat64_db_bib_entry_t *bibe;
  nat64_db_st_entry_t *ste;
  ip46_address_t saddr, daddr;
  u32 sw_if_index, fib_index;
  /* TCP and UDP ports share the same header offsets, so a udp_header_t
     view is used for the port fields of both protocols. */
  udp_header_t *udp = ip6_next_header (ip6);
  u8 proto = ip6->protocol;
  u16 sport = udp->src_port;
  u16 dport = udp->dst_port;
  nat64_db_t *db = &nm->db[ctx->thread_index];

  sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
  fib_index =
    fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);

  saddr.as_u64[0] = ip6->src_address.as_u64[0];
  saddr.as_u64[1] = ip6->src_address.as_u64[1];
  daddr.as_u64[0] = ip6->dst_address.as_u64[0];
  daddr.as_u64[1] = ip6->dst_address.as_u64[1];

  ste =
    nat64_db_st_entry_find (db, &saddr, &daddr, sport, dport, proto,
			    fib_index, 1);

  if (ste)
    {
      bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
      if (!bibe)
	return -1;
    }
  else
    {
      bibe = nat64_db_bib_entry_find (db, &saddr, sport, proto, fib_index, 1);

      if (!bibe)
	{
	  /* New flow: allocate an outside address/port and create a
	     dynamic BIB entry for it. */
	  u16 out_port;
	  ip4_address_t out_addr;
	  if (nat64_alloc_out_addr_and_port
	      (fib_index, ip_proto_to_snat_proto (proto), &out_addr,
	       &out_port, ctx->thread_index))
	    return -1;

	  bibe =
	    nat64_db_bib_entry_create (db, &ip6->src_address, &out_addr,
				       sport, out_port, fib_index, proto, 0);
	  if (!bibe)
	    return -1;
	}

      /* The remote IPv4 address is embedded in the IPv6 destination. */
      nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index);
      ste =
	nat64_db_st_entry_create (db, bibe, &ip6->dst_address,
				  &daddr.ip4, dport);
      if (!ste)
	return -1;
    }

  ip4->src_address.as_u32 = bibe->out_addr.as_u32;
  udp->src_port = bibe->out_port;

  ip4->dst_address.as_u32 = ste->out_r_addr.as_u32;

  if (proto == IP_PROTOCOL_TCP)
    {
      u16 *checksum;
      ip_csum_t csum;
      tcp_header_t *tcp = ip6_next_header (ip6);

      nat64_tcp_session_set_state (ste, tcp, 1);
      checksum = &tcp->checksum;
      /* Incremental checksum fixup covers only the source port rewrite
	 here; address deltas are presumably handled by the ip6_to_ip4
	 framework — NOTE(review): confirm.  MSS clamping folds its own
	 delta into csum. */
      csum = ip_csum_sub_even (*checksum, sport);
      csum = ip_csum_add_even (csum, udp->src_port);
      mss_clamping (nm->sm, tcp, &csum);
      *checksum = ip_csum_fold (csum);
    }

  nat64_session_reset_timeout (ste, ctx->vm);

  return 0;
}
250
/**
 * @brief ip6_to_ip4 set callback for ICMP packets.
 *
 * Echo request/reply packets are translated statefully, using the ICMP
 * echo identifier in place of a port for BIB/session lookup.  Any other
 * ICMP type (error messages) is given the first pool address as source
 * and the IPv4 address embedded in the IPv6 destination as destination.
 *
 * @param ip6 Original IPv6 header.
 * @param ip4 IPv4 header being constructed by the translator.
 * @param arg nat64_in2out_set_ctx_t (buffer, vm, thread index).
 *
 * @returns 0 on success, -1 when no translation can be made.
 */
static int
nat64_in2out_icmp_set_cb (ip6_header_t * ip6, ip4_header_t * ip4, void *arg)
{
  nat64_main_t *nm = &nat64_main;
  nat64_in2out_set_ctx_t *ctx = arg;
  nat64_db_bib_entry_t *bibe;
  nat64_db_st_entry_t *ste;
  ip46_address_t saddr, daddr;
  u32 sw_if_index, fib_index;
  icmp46_header_t *icmp = ip6_next_header (ip6);
  nat64_db_t *db = &nm->db[ctx->thread_index];

  sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
  fib_index =
    fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);

  saddr.as_u64[0] = ip6->src_address.as_u64[0];
  saddr.as_u64[1] = ip6->src_address.as_u64[1];
  daddr.as_u64[0] = ip6->dst_address.as_u64[0];
  daddr.as_u64[1] = ip6->dst_address.as_u64[1];

  if (icmp->type == ICMP4_echo_request || icmp->type == ICMP4_echo_reply)
    {
      /* Echo identifier lives at byte offset 4 of the ICMP header; it
	 plays the role of the source port in the BIB. */
      u16 in_id = ((u16 *) (icmp))[2];
      ste =
	nat64_db_st_entry_find (db, &saddr, &daddr, in_id, 0,
				IP_PROTOCOL_ICMP, fib_index, 1);

      if (ste)
	{
	  bibe =
	    nat64_db_bib_entry_by_index (db, IP_PROTOCOL_ICMP,
					 ste->bibe_index);
	  if (!bibe)
	    return -1;
	}
      else
	{
	  bibe =
	    nat64_db_bib_entry_find (db, &saddr, in_id,
				     IP_PROTOCOL_ICMP, fib_index, 1);

	  if (!bibe)
	    {
	      /* New flow: allocate an outside address/id and create a
		 dynamic BIB entry. */
	      u16 out_id;
	      ip4_address_t out_addr;
	      if (nat64_alloc_out_addr_and_port
		  (fib_index, SNAT_PROTOCOL_ICMP, &out_addr, &out_id,
		   ctx->thread_index))
		return -1;

	      bibe =
		nat64_db_bib_entry_create (db, &ip6->src_address,
					   &out_addr, in_id, out_id,
					   fib_index, IP_PROTOCOL_ICMP, 0);
	      if (!bibe)
		return -1;
	    }

	  nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index);
	  ste =
	    nat64_db_st_entry_create (db, bibe, &ip6->dst_address,
				      &daddr.ip4, 0);
	  if (!ste)
	    return -1;
	}

      nat64_session_reset_timeout (ste, ctx->vm);

      ip4->src_address.as_u32 = bibe->out_addr.as_u32;
      ((u16 *) (icmp))[2] = bibe->out_port;	/* rewrite echo identifier */

      ip4->dst_address.as_u32 = ste->out_r_addr.as_u32;
    }
  else
    {
      /* Non-echo ICMP (errors): stateless translation using the first
	 pool address as source. */
      if (!vec_len (nm->addr_pool))
	return -1;

      ip4->src_address.as_u32 = nm->addr_pool[0].addr.as_u32;
      nat64_extract_ip4 (&ip6->dst_address, &ip4->dst_address, fib_index);
    }

  return 0;
}
336
/**
 * @brief ip6_to_ip4 set callback for the inner header of an ICMP error.
 *
 * Translates the embedded (offending) packet carried inside an ICMPv6
 * error message.  Because the inner packet travels in the opposite
 * direction, session lookup is done with source and destination swapped
 * (daddr/saddr), and the inner destination port/id is the one rewritten.
 *
 * @param ip6 Inner IPv6 header from the ICMP payload.
 * @param ip4 Inner IPv4 header being constructed.
 * @param arg nat64_in2out_set_ctx_t (buffer, vm, thread index).
 *
 * @returns 0 on success, -1 when no matching session exists.
 */
static int
nat64_in2out_inner_icmp_set_cb (ip6_header_t * ip6, ip4_header_t * ip4,
				void *arg)
{
  nat64_main_t *nm = &nat64_main;
  nat64_in2out_set_ctx_t *ctx = arg;
  nat64_db_st_entry_t *ste;
  nat64_db_bib_entry_t *bibe;
  ip46_address_t saddr, daddr;
  u32 sw_if_index, fib_index;
  u8 proto = ip6->protocol;
  nat64_db_t *db = &nm->db[ctx->thread_index];

  sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
  fib_index =
    fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);

  saddr.as_u64[0] = ip6->src_address.as_u64[0];
  saddr.as_u64[1] = ip6->src_address.as_u64[1];
  daddr.as_u64[0] = ip6->dst_address.as_u64[0];
  daddr.as_u64[1] = ip6->dst_address.as_u64[1];

  if (proto == IP_PROTOCOL_ICMP6)
    {
      icmp46_header_t *icmp = ip6_next_header (ip6);
      /* Echo identifier at byte offset 4 of the inner ICMP header. */
      u16 in_id = ((u16 *) (icmp))[2];
      proto = IP_PROTOCOL_ICMP;

      /* Only echo request/reply can appear as a translatable inner
	 ICMP packet. */
      if (!
	  (icmp->type == ICMP4_echo_request
	   || icmp->type == ICMP4_echo_reply))
	return -1;

      ste =
	nat64_db_st_entry_find (db, &daddr, &saddr, in_id, 0, proto,
				fib_index, 1);
      if (!ste)
	return -1;

      bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
      if (!bibe)
	return -1;

      ip4->dst_address.as_u32 = bibe->out_addr.as_u32;
      ((u16 *) (icmp))[2] = bibe->out_port;	/* rewrite echo identifier */
      ip4->src_address.as_u32 = ste->out_r_addr.as_u32;
    }
  else
    {
      /* TCP/UDP inner packet: ports are at the same offsets in both. */
      udp_header_t *udp = ip6_next_header (ip6);
      tcp_header_t *tcp = ip6_next_header (ip6);
      u16 *checksum;
      ip_csum_t csum;

      u16 sport = udp->src_port;
      u16 dport = udp->dst_port;

      ste =
	nat64_db_st_entry_find (db, &daddr, &saddr, dport, sport, proto,
				fib_index, 1);
      if (!ste)
	return -1;

      bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
      if (!bibe)
	return -1;

      ip4->dst_address.as_u32 = bibe->out_addr.as_u32;
      udp->dst_port = bibe->out_port;
      ip4->src_address.as_u32 = ste->out_r_addr.as_u32;

      /* Only the destination port rewrite is folded into the inner L4
	 checksum here; address deltas are presumably handled elsewhere
	 in the translation path — NOTE(review): confirm. */
      if (proto == IP_PROTOCOL_TCP)
	checksum = &tcp->checksum;
      else
	checksum = &udp->checksum;
      csum = ip_csum_sub_even (*checksum, dport);
      csum = ip_csum_add_even (csum, udp->dst_port);
      *checksum = ip_csum_fold (csum);
    }

  return 0;
}
419
/* Context for unk_proto_st_walk(): search existing TCP/UDP sessions
   between the same two hosts so the same outside address can be reused
   for an unknown-protocol flow. */
typedef struct unk_proto_st_walk_ctx_t_
{
  ip6_address_t src_addr;	/* inside source of the new flow */
  ip6_address_t dst_addr;	/* inside destination of the new flow */
  ip4_address_t out_addr;	/* out: chosen outside address (0 = none) */
  u32 fib_index;
  u32 thread_index;
  u8 proto;			/* protocol of the new flow */
} unk_proto_st_walk_ctx_t;
429
430 static int
431 unk_proto_st_walk (nat64_db_st_entry_t * ste, void *arg)
432 {
433   nat64_main_t *nm = &nat64_main;
434   unk_proto_st_walk_ctx_t *ctx = arg;
435   nat64_db_bib_entry_t *bibe;
436   ip46_address_t saddr, daddr;
437   nat64_db_t *db = &nm->db[ctx->thread_index];
438
439   if (ip46_address_is_equal (&ste->in_r_addr, &ctx->dst_addr))
440     {
441       bibe = nat64_db_bib_entry_by_index (db, ste->proto, ste->bibe_index);
442       if (!bibe)
443         return -1;
444
445       if (ip46_address_is_equal (&bibe->in_addr, &ctx->src_addr)
446           && bibe->fib_index == ctx->fib_index)
447         {
448           memset (&saddr, 0, sizeof (saddr));
449           saddr.ip4.as_u32 = bibe->out_addr.as_u32;
450           memset (&daddr, 0, sizeof (daddr));
451           nat64_extract_ip4 (&ctx->dst_addr, &daddr.ip4, ctx->fib_index);
452
453           if (nat64_db_st_entry_find
454               (db, &daddr, &saddr, 0, 0, ctx->proto, ctx->fib_index, 0))
455             return -1;
456
457           ctx->out_addr.as_u32 = bibe->out_addr.as_u32;
458           return 1;
459         }
460     }
461
462   return 0;
463 }
464
465 static int
466 nat64_in2out_unk_proto_set_cb (ip6_header_t * ip6, ip4_header_t * ip4,
467                                void *arg)
468 {
469   nat64_main_t *nm = &nat64_main;
470   nat64_in2out_set_ctx_t *s_ctx = arg;
471   nat64_db_bib_entry_t *bibe;
472   nat64_db_st_entry_t *ste;
473   ip46_address_t saddr, daddr, addr;
474   u32 sw_if_index, fib_index;
475   u8 proto = ip6->protocol;
476   int i;
477   nat64_db_t *db = &nm->db[s_ctx->thread_index];
478
479   sw_if_index = vnet_buffer (s_ctx->b)->sw_if_index[VLIB_RX];
480   fib_index =
481     fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);
482
483   saddr.as_u64[0] = ip6->src_address.as_u64[0];
484   saddr.as_u64[1] = ip6->src_address.as_u64[1];
485   daddr.as_u64[0] = ip6->dst_address.as_u64[0];
486   daddr.as_u64[1] = ip6->dst_address.as_u64[1];
487
488   ste =
489     nat64_db_st_entry_find (db, &saddr, &daddr, 0, 0, proto, fib_index, 1);
490
491   if (ste)
492     {
493       bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
494       if (!bibe)
495         return -1;
496     }
497   else
498     {
499       bibe = nat64_db_bib_entry_find (db, &saddr, 0, proto, fib_index, 1);
500
501       if (!bibe)
502         {
503           /* Choose same out address as for TCP/UDP session to same dst */
504           unk_proto_st_walk_ctx_t ctx = {
505             .src_addr.as_u64[0] = ip6->src_address.as_u64[0],
506             .src_addr.as_u64[1] = ip6->src_address.as_u64[1],
507             .dst_addr.as_u64[0] = ip6->dst_address.as_u64[0],
508             .dst_addr.as_u64[1] = ip6->dst_address.as_u64[1],
509             .out_addr.as_u32 = 0,
510             .fib_index = fib_index,
511             .proto = proto,
512             .thread_index = s_ctx->thread_index,
513           };
514
515           nat64_db_st_walk (db, IP_PROTOCOL_TCP, unk_proto_st_walk, &ctx);
516
517           if (!ctx.out_addr.as_u32)
518             nat64_db_st_walk (db, IP_PROTOCOL_UDP, unk_proto_st_walk, &ctx);
519
520           /* Verify if out address is not already in use for protocol */
521           memset (&addr, 0, sizeof (addr));
522           addr.ip4.as_u32 = ctx.out_addr.as_u32;
523           if (nat64_db_bib_entry_find (db, &addr, 0, proto, 0, 0))
524             ctx.out_addr.as_u32 = 0;
525
526           if (!ctx.out_addr.as_u32)
527             {
528               for (i = 0; i < vec_len (nm->addr_pool); i++)
529                 {
530                   addr.ip4.as_u32 = nm->addr_pool[i].addr.as_u32;
531                   if (!nat64_db_bib_entry_find (db, &addr, 0, proto, 0, 0))
532                     break;
533                 }
534             }
535
536           if (!ctx.out_addr.as_u32)
537             return -1;
538
539           bibe =
540             nat64_db_bib_entry_create (db, &ip6->src_address,
541                                        &ctx.out_addr, 0, 0, fib_index, proto,
542                                        0);
543           if (!bibe)
544             return -1;
545         }
546
547       nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index);
548       ste =
549         nat64_db_st_entry_create (db, bibe, &ip6->dst_address, &daddr.ip4, 0);
550       if (!ste)
551         return -1;
552     }
553
554   nat64_session_reset_timeout (ste, s_ctx->vm);
555
556   ip4->src_address.as_u32 = bibe->out_addr.as_u32;
557   ip4->dst_address.as_u32 = ste->out_r_addr.as_u32;
558
559   return 0;
560 }
561
562
563
/**
 * @brief Translate a hairpinned TCP/UDP packet (IPv6 in, IPv6 out).
 *
 * The destination maps to one of our own NAT64 pool addresses, so the
 * packet never leaves on IPv4: the in2out leg creates/uses the sender's
 * BIB+session, then the destination BIB is looked up across all worker
 * dbs to rewrite the packet back to the inside destination host.  The
 * L4 checksum is incrementally updated for all address/port rewrites.
 *
 * @returns 0 on success, -1 when no translation can be made.
 */
static int
nat64_in2out_tcp_udp_hairpinning (vlib_main_t * vm, vlib_buffer_t * b,
				  ip6_header_t * ip6, u32 thread_index)
{
  nat64_main_t *nm = &nat64_main;
  nat64_db_bib_entry_t *bibe;
  nat64_db_st_entry_t *ste;
  ip46_address_t saddr, daddr;
  u32 sw_if_index, fib_index;
  /* TCP and UDP ports share offsets; udp view is used for ports, tcp
     view only for checksum/session state. */
  udp_header_t *udp = ip6_next_header (ip6);
  tcp_header_t *tcp = ip6_next_header (ip6);
  u8 proto = ip6->protocol;
  u16 sport = udp->src_port;
  u16 dport = udp->dst_port;
  u16 *checksum;
  ip_csum_t csum;
  nat64_db_t *db = &nm->db[thread_index];

  sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
  fib_index =
    fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);

  saddr.as_u64[0] = ip6->src_address.as_u64[0];
  saddr.as_u64[1] = ip6->src_address.as_u64[1];
  daddr.as_u64[0] = ip6->dst_address.as_u64[0];
  daddr.as_u64[1] = ip6->dst_address.as_u64[1];

  if (proto == IP_PROTOCOL_UDP)
    checksum = &udp->checksum;
  else
    checksum = &tcp->checksum;

  /* Remove original addresses and ports from the checksum; the
     rewritten values are added back at the end. */
  csum = ip_csum_sub_even (*checksum, ip6->src_address.as_u64[0]);
  csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[1]);
  csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[0]);
  csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[1]);
  csum = ip_csum_sub_even (csum, sport);
  csum = ip_csum_sub_even (csum, dport);

  ste =
    nat64_db_st_entry_find (db, &saddr, &daddr, sport, dport, proto,
			    fib_index, 1);

  if (ste)
    {
      bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
      if (!bibe)
	return -1;
    }
  else
    {
      bibe = nat64_db_bib_entry_find (db, &saddr, sport, proto, fib_index, 1);

      if (!bibe)
	{
	  /* New flow: allocate an outside address/port and create a
	     dynamic BIB entry. */
	  u16 out_port;
	  ip4_address_t out_addr;
	  if (nat64_alloc_out_addr_and_port
	      (fib_index, ip_proto_to_snat_proto (proto), &out_addr,
	       &out_port, thread_index))
	    return -1;

	  bibe =
	    nat64_db_bib_entry_create (db, &ip6->src_address, &out_addr,
				       sport, out_port, fib_index, proto, 0);
	  if (!bibe)
	    return -1;
	}

      nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index);
      ste =
	nat64_db_st_entry_create (db, bibe, &ip6->dst_address,
				  &daddr.ip4, dport);
      if (!ste)
	return -1;
    }

  if (proto == IP_PROTOCOL_TCP)
    nat64_tcp_session_set_state (ste, tcp, 1);

  nat64_session_reset_timeout (ste, vm);

  /* Rewrite the source leg to its translated outside address/port. */
  sport = udp->src_port = bibe->out_port;
  nat64_compose_ip6 (&ip6->src_address, &bibe->out_addr, fib_index);

  memset (&daddr, 0, sizeof (daddr));
  daddr.ip4.as_u32 = ste->out_r_addr.as_u32;

  /* The destination BIB may live in any worker's db. */
  bibe = 0;
  /* *INDENT-OFF* */
  vec_foreach (db, nm->db)
    {
      bibe = nat64_db_bib_entry_find (db, &daddr, dport, proto, 0, 0);

      if (bibe)
	break;
    }
  /* *INDENT-ON* */

  if (!bibe)
    return -1;

  ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
  ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];
  udp->dst_port = bibe->in_port;

  /* Fold the rewritten addresses and ports back into the checksum. */
  csum = ip_csum_add_even (csum, ip6->src_address.as_u64[0]);
  csum = ip_csum_add_even (csum, ip6->src_address.as_u64[1]);
  csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[0]);
  csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[1]);
  csum = ip_csum_add_even (csum, udp->src_port);
  csum = ip_csum_add_even (csum, udp->dst_port);
  *checksum = ip_csum_fold (csum);

  return 0;
}
680
/**
 * @brief Translate a hairpinned ICMPv6 error message (IPv6 in, IPv6 out).
 *
 * Only ICMP errors are handled here (echo request/reply are rejected).
 * The embedded inner packet's addresses/ports are rewritten from the
 * existing session and destination BIB, the inner L4 checksum is
 * incrementally updated, and the outer ICMPv6 checksum is recomputed
 * over the pseudo-header.
 *
 * @returns 0 on success, -1 when no translation can be made.
 */
static int
nat64_in2out_icmp_hairpinning (vlib_main_t * vm, vlib_buffer_t * b,
			       ip6_header_t * ip6, u32 thread_index)
{
  nat64_main_t *nm = &nat64_main;
  nat64_db_bib_entry_t *bibe;
  nat64_db_st_entry_t *ste;
  icmp46_header_t *icmp = ip6_next_header (ip6);
  ip6_header_t *inner_ip6;
  ip46_address_t saddr, daddr;
  u32 sw_if_index, fib_index;
  u8 proto;
  udp_header_t *udp;
  tcp_header_t *tcp;
  u16 *checksum, sport, dport;
  ip_csum_t csum;
  nat64_db_t *db = &nm->db[thread_index];

  /* Echo packets are not handled by this path. */
  if (icmp->type == ICMP6_echo_request || icmp->type == ICMP6_echo_reply)
    return -1;

  /* Offending packet starts after the 8-byte ICMPv6 header. */
  inner_ip6 = (ip6_header_t *) u8_ptr_add (icmp, 8);

  proto = inner_ip6->protocol;

  if (proto == IP_PROTOCOL_ICMP6)
    return -1;

  sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
  fib_index =
    fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);

  saddr.as_u64[0] = inner_ip6->src_address.as_u64[0];
  saddr.as_u64[1] = inner_ip6->src_address.as_u64[1];
  daddr.as_u64[0] = inner_ip6->dst_address.as_u64[0];
  daddr.as_u64[1] = inner_ip6->dst_address.as_u64[1];

  /* TCP and UDP ports share offsets; udp view used for ports. */
  udp = ip6_next_header (inner_ip6);
  tcp = ip6_next_header (inner_ip6);

  sport = udp->src_port;
  dport = udp->dst_port;

  if (proto == IP_PROTOCOL_UDP)
    checksum = &udp->checksum;
  else
    checksum = &tcp->checksum;

  /* Remove original inner addresses/ports from the inner checksum. */
  csum = ip_csum_sub_even (*checksum, inner_ip6->src_address.as_u64[0]);
  csum = ip_csum_sub_even (csum, inner_ip6->src_address.as_u64[1]);
  csum = ip_csum_sub_even (csum, inner_ip6->dst_address.as_u64[0]);
  csum = ip_csum_sub_even (csum, inner_ip6->dst_address.as_u64[1]);
  csum = ip_csum_sub_even (csum, sport);
  csum = ip_csum_sub_even (csum, dport);

  /* The inner packet travels in the opposite direction: look up with
     swapped addresses/ports. */
  ste =
    nat64_db_st_entry_find (db, &daddr, &saddr, dport, sport, proto,
			    fib_index, 1);
  if (!ste)
    return -1;

  bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
  if (!bibe)
    return -1;

  dport = udp->dst_port = bibe->out_port;
  nat64_compose_ip6 (&inner_ip6->dst_address, &bibe->out_addr, fib_index);

  memset (&saddr, 0, sizeof (saddr));
  memset (&daddr, 0, sizeof (daddr));
  saddr.ip4.as_u32 = ste->out_r_addr.as_u32;
  daddr.ip4.as_u32 = bibe->out_addr.as_u32;

  /* The reverse session may live in any worker's db. */
  ste = 0;
  /* *INDENT-OFF* */
  vec_foreach (db, nm->db)
    {
      ste = nat64_db_st_entry_find (db, &saddr, &daddr, sport, dport, proto,
				    0, 0);

      if (ste)
	break;
    }
  /* *INDENT-ON* */

  if (!ste)
    return -1;

  bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
  if (!bibe)
    return -1;

  inner_ip6->src_address.as_u64[0] = bibe->in_addr.as_u64[0];
  inner_ip6->src_address.as_u64[1] = bibe->in_addr.as_u64[1];
  udp->src_port = bibe->in_port;

  /* Fold the rewritten inner addresses/ports back into the checksum. */
  csum = ip_csum_add_even (csum, inner_ip6->src_address.as_u64[0]);
  csum = ip_csum_add_even (csum, inner_ip6->src_address.as_u64[1]);
  csum = ip_csum_add_even (csum, inner_ip6->dst_address.as_u64[0]);
  csum = ip_csum_add_even (csum, inner_ip6->dst_address.as_u64[1]);
  csum = ip_csum_add_even (csum, udp->src_port);
  csum = ip_csum_add_even (csum, udp->dst_port);
  *checksum = ip_csum_fold (csum);

  if (!vec_len (nm->addr_pool))
    return -1;

  /* Rewrite the outer header: source is a pool address, destination is
     the inner packet's (now translated) source host. */
  nat64_compose_ip6 (&ip6->src_address, &nm->addr_pool[0].addr, fib_index);
  ip6->dst_address.as_u64[0] = inner_ip6->src_address.as_u64[0];
  ip6->dst_address.as_u64[1] = inner_ip6->src_address.as_u64[1];

  /* Recompute the outer ICMPv6 checksum over the pseudo-header and the
     (modified) ICMP payload. */
  icmp->checksum = 0;
  csum = ip_csum_with_carry (0, ip6->payload_length);
  csum = ip_csum_with_carry (csum, clib_host_to_net_u16 (ip6->protocol));
  csum = ip_csum_with_carry (csum, ip6->src_address.as_u64[0]);
  csum = ip_csum_with_carry (csum, ip6->src_address.as_u64[1]);
  csum = ip_csum_with_carry (csum, ip6->dst_address.as_u64[0]);
  csum = ip_csum_with_carry (csum, ip6->dst_address.as_u64[1]);
  csum =
    ip_incremental_checksum (csum, icmp,
			     clib_net_to_host_u16 (ip6->payload_length));
  icmp->checksum = ~ip_csum_fold (csum);

  return 0;
}
806
807 static int
808 nat64_in2out_unk_proto_hairpinning (vlib_main_t * vm, vlib_buffer_t * b,
809                                     ip6_header_t * ip6, u32 thread_index)
810 {
811   nat64_main_t *nm = &nat64_main;
812   nat64_db_bib_entry_t *bibe;
813   nat64_db_st_entry_t *ste;
814   ip46_address_t saddr, daddr, addr;
815   u32 sw_if_index, fib_index;
816   u8 proto = ip6->protocol;
817   int i;
818   nat64_db_t *db = &nm->db[thread_index];
819
820   sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
821   fib_index =
822     fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index);
823
824   saddr.as_u64[0] = ip6->src_address.as_u64[0];
825   saddr.as_u64[1] = ip6->src_address.as_u64[1];
826   daddr.as_u64[0] = ip6->dst_address.as_u64[0];
827   daddr.as_u64[1] = ip6->dst_address.as_u64[1];
828
829   ste =
830     nat64_db_st_entry_find (db, &saddr, &daddr, 0, 0, proto, fib_index, 1);
831
832   if (ste)
833     {
834       bibe = nat64_db_bib_entry_by_index (db, proto, ste->bibe_index);
835       if (!bibe)
836         return -1;
837     }
838   else
839     {
840       bibe = nat64_db_bib_entry_find (db, &saddr, 0, proto, fib_index, 1);
841
842       if (!bibe)
843         {
844           /* Choose same out address as for TCP/UDP session to same dst */
845           unk_proto_st_walk_ctx_t ctx = {
846             .src_addr.as_u64[0] = ip6->src_address.as_u64[0],
847             .src_addr.as_u64[1] = ip6->src_address.as_u64[1],
848             .dst_addr.as_u64[0] = ip6->dst_address.as_u64[0],
849             .dst_addr.as_u64[1] = ip6->dst_address.as_u64[1],
850             .out_addr.as_u32 = 0,
851             .fib_index = fib_index,
852             .proto = proto,
853             .thread_index = thread_index,
854           };
855
856           nat64_db_st_walk (db, IP_PROTOCOL_TCP, unk_proto_st_walk, &ctx);
857
858           if (!ctx.out_addr.as_u32)
859             nat64_db_st_walk (db, IP_PROTOCOL_UDP, unk_proto_st_walk, &ctx);
860
861           /* Verify if out address is not already in use for protocol */
862           memset (&addr, 0, sizeof (addr));
863           addr.ip4.as_u32 = ctx.out_addr.as_u32;
864           if (nat64_db_bib_entry_find (db, &addr, 0, proto, 0, 0))
865             ctx.out_addr.as_u32 = 0;
866
867           if (!ctx.out_addr.as_u32)
868             {
869               for (i = 0; i < vec_len (nm->addr_pool); i++)
870                 {
871                   addr.ip4.as_u32 = nm->addr_pool[i].addr.as_u32;
872                   if (!nat64_db_bib_entry_find (db, &addr, 0, proto, 0, 0))
873                     break;
874                 }
875             }
876
877           if (!ctx.out_addr.as_u32)
878             return -1;
879
880           bibe =
881             nat64_db_bib_entry_create (db, &ip6->src_address,
882                                        &ctx.out_addr, 0, 0, fib_index, proto,
883                                        0);
884           if (!bibe)
885             return -1;
886         }
887
888       nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index);
889       ste =
890         nat64_db_st_entry_create (db, bibe, &ip6->dst_address, &daddr.ip4, 0);
891       if (!ste)
892         return -1;
893     }
894
895   nat64_session_reset_timeout (ste, vm);
896
897   nat64_compose_ip6 (&ip6->src_address, &bibe->out_addr, fib_index);
898
899   memset (&daddr, 0, sizeof (daddr));
900   daddr.ip4.as_u32 = ste->out_r_addr.as_u32;
901
902   bibe = 0;
903   /* *INDENT-OFF* */
904   vec_foreach (db, nm->db)
905     {
906       bibe = nat64_db_bib_entry_find (db, &daddr, 0, proto, 0, 0);
907
908       if (bibe)
909         break;
910     }
911   /* *INDENT-ON* */
912
913   if (!bibe)
914     return -1;
915
916   ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
917   ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];
918
919   return 0;
920 }
921
/**
 * @brief Shared worker for the NAT64 in2out fast-path and slow-path nodes.
 *
 * Processes one buffer at a time: parses the IPv6 header, then either
 * translates the packet to IPv4 in place (TCP/UDP/ICMP), redirects
 * fragments to the reassembly node, punts unknown L4 protocols to the
 * slow-path node, or sends hairpinned traffic back to ip6-lookup.
 *
 * @param vm           vlib main
 * @param node         node runtime
 * @param frame        frame of buffer indices to process
 * @param is_slow_path 1 when running as nat64-in2out-slowpath (handles the
 *                     unknown-protocol case), 0 for the fast path
 * @return frame->n_vectors (all buffers are dispatched to some next node)
 */
static inline uword
nat64_in2out_node_fn_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                             vlib_frame_t * frame, u8 is_slow_path)
{
  u32 n_left_from, *from, *to_next;
  nat64_in2out_next_t next_index;
  u32 pkts_processed = 0;
  u32 stats_node_index;
  u32 thread_index = vm->thread_index;

  /* Account packet counters to whichever node variant is running. */
  stats_node_index =
    is_slow_path ? nat64_in2out_slowpath_node.index : nat64_in2out_node.index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          ip6_header_t *ip60;
          u16 l4_offset0, frag_offset0;
          u8 l4_protocol0;
          u32 proto0;
          nat64_in2out_set_ctx_t ctx0;
          u32 sw_if_index0;

          /* speculatively enqueue b0 to the current next frame */
          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);
          ip60 = vlib_buffer_get_current (b0);

          /* Context passed down to the per-protocol translation callbacks. */
          ctx0.b = b0;
          ctx0.vm = vm;
          ctx0.thread_index = thread_index;

          next0 = NAT64_IN2OUT_NEXT_IP4_LOOKUP;

          /* Locate the L4 header and any fragmentation header; a parse
             failure (e.g. truncated packet) is dropped. */
          if (PREDICT_FALSE
              (ip6_parse
               (ip60, b0->current_length, &l4_protocol0, &l4_offset0,
                &frag_offset0)))
            {
              next0 = NAT64_IN2OUT_NEXT_DROP;
              b0->error = node->errors[NAT64_IN2OUT_ERROR_UNKNOWN];
              goto trace0;
            }

          sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];

          /* Destination not subject to NAT64 on this interface: forward
             as plain IPv6. */
          if (nat64_not_translate (sw_if_index0, ip60->dst_address))
            {
              next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP;
              goto trace0;
            }

          proto0 = ip_proto_to_snat_proto (l4_protocol0);

          if (is_slow_path)
            {
              /* Slow path only ever receives packets the fast path could
                 not classify, so unknown protocol is the likely case. */
              if (PREDICT_TRUE (proto0 == ~0))
                {
                  if (is_hairpinning (&ip60->dst_address))
                    {
                      /* Destination is one of our own translated
                         addresses: loop back via ip6-lookup. */
                      next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP;
                      if (nat64_in2out_unk_proto_hairpinning
                          (vm, b0, ip60, thread_index))
                        {
                          next0 = NAT64_IN2OUT_NEXT_DROP;
                          b0->error =
                            node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                        }
                      goto trace0;
                    }

                  /* Translate the packet headers for an unknown upper-layer
                     protocol. */
                  if (ip6_to_ip4 (b0, nat64_in2out_unk_proto_set_cb, &ctx0))
                    {
                      next0 = NAT64_IN2OUT_NEXT_DROP;
                      b0->error =
                        node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                      goto trace0;
                    }
                }
              goto trace0;
            }
          else
            {
              /* Fast path: punt unknown L4 protocols to the slow path. */
              if (PREDICT_FALSE (proto0 == ~0))
                {
                  next0 = NAT64_IN2OUT_NEXT_SLOWPATH;
                  goto trace0;
                }
            }

          /* Fragments need reassembly state; handled in a dedicated node.
             NOTE(review): this only matches a fragment header that is the
             first extension header (ip60->protocol), which mirrors what
             ip6_parse above handles — confirm if other extension-header
             layouts must be supported. */
          if (PREDICT_FALSE
              (ip60->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION))
            {
              next0 = NAT64_IN2OUT_NEXT_REASS;
              goto trace0;
            }

          if (proto0 == SNAT_PROTOCOL_ICMP)
            {
              if (is_hairpinning (&ip60->dst_address))
                {
                  next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP;
                  if (nat64_in2out_icmp_hairpinning
                      (vm, b0, ip60, thread_index))
                    {
                      next0 = NAT64_IN2OUT_NEXT_DROP;
                      b0->error =
                        node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                    }
                  goto trace0;
                }

              /* Translate ICMPv6 to ICMPv4, including any inner (quoted)
                 IP header in error messages. */
              if (icmp6_to_icmp
                  (b0, nat64_in2out_icmp_set_cb, &ctx0,
                   nat64_in2out_inner_icmp_set_cb, &ctx0))
                {
                  next0 = NAT64_IN2OUT_NEXT_DROP;
                  b0->error = node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                  goto trace0;
                }
            }
          else if (proto0 == SNAT_PROTOCOL_TCP || proto0 == SNAT_PROTOCOL_UDP)
            {
              if (is_hairpinning (&ip60->dst_address))
                {
                  next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP;
                  if (nat64_in2out_tcp_udp_hairpinning
                      (vm, b0, ip60, thread_index))
                    {
                      next0 = NAT64_IN2OUT_NEXT_DROP;
                      b0->error =
                        node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                    }
                  goto trace0;
                }

              if (ip6_to_ip4_tcp_udp
                  (b0, nat64_in2out_tcp_udp_set_cb, &ctx0, 0))
                {
                  next0 = NAT64_IN2OUT_NEXT_DROP;
                  b0->error = node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                  goto trace0;
                }
            }

        trace0:
          if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
                             && (b0->flags & VLIB_BUFFER_IS_TRACED)))
            {
              nat64_in2out_trace_t *t =
                vlib_add_trace (vm, node, b0, sizeof (*t));
              t->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
              t->next_index = next0;
              t->is_slow_path = is_slow_path;
            }

          /* Dropped packets are not counted as translated. */
          pkts_processed += next0 != NAT64_IN2OUT_NEXT_DROP;

          /* verify speculative enqueue, maybe switch current next frame */
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }
  vlib_node_increment_counter (vm, stats_node_index,
                               NAT64_IN2OUT_ERROR_IN2OUT_PACKETS,
                               pkts_processed);
  return frame->n_vectors;
}
1109
1110 static uword
1111 nat64_in2out_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
1112                       vlib_frame_t * frame)
1113 {
1114   return nat64_in2out_node_fn_inline (vm, node, frame, 0);
1115 }
1116
/* *INDENT-OFF* */
/** NAT64 in2out fast-path node registration: TCP/UDP/ICMP packets are
 *  translated inline; unknown L4 protocols go to the slow-path node and
 *  fragments to the reassembly node. */
VLIB_REGISTER_NODE (nat64_in2out_node) = {
  .function = nat64_in2out_node_fn,
  .name = "nat64-in2out",
  .vector_size = sizeof (u32),
  .format_trace = format_nat64_in2out_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .n_errors = ARRAY_LEN (nat64_in2out_error_strings),
  .error_strings = nat64_in2out_error_strings,
  .n_next_nodes = NAT64_IN2OUT_N_NEXT,
  /* edit / add dispositions here */
  .next_nodes = {
    [NAT64_IN2OUT_NEXT_DROP] = "error-drop",
    [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup",
    [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup",
    [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath",
    [NAT64_IN2OUT_NEXT_REASS] = "nat64-in2out-reass",
  },
};
/* *INDENT-ON* */

/* Emit per-CPU-architecture variants of the node function. */
VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_node, nat64_in2out_node_fn);
1139
1140 static uword
1141 nat64_in2out_slowpath_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
1142                                vlib_frame_t * frame)
1143 {
1144   return nat64_in2out_node_fn_inline (vm, node, frame, 1);
1145 }
1146
/* *INDENT-OFF* */
/** NAT64 in2out slow-path node registration: receives packets whose L4
 *  protocol the fast path could not classify. */
VLIB_REGISTER_NODE (nat64_in2out_slowpath_node) = {
  .function = nat64_in2out_slowpath_node_fn,
  .name = "nat64-in2out-slowpath",
  .vector_size = sizeof (u32),
  .format_trace = format_nat64_in2out_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .n_errors = ARRAY_LEN (nat64_in2out_error_strings),
  .error_strings = nat64_in2out_error_strings,
  .n_next_nodes = NAT64_IN2OUT_N_NEXT,
  /* edit / add dispositions here */
  .next_nodes = {
    [NAT64_IN2OUT_NEXT_DROP] = "error-drop",
    [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup",
    [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup",
    [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath",
    [NAT64_IN2OUT_NEXT_REASS] = "nat64-in2out-reass",
  },
};
/* *INDENT-ON* */

/* Emit per-CPU-architecture variants of the node function. */
VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_slowpath_node,
                              nat64_in2out_slowpath_node_fn);
1170
/** Per-packet context handed to the fragment translation callbacks. */
typedef struct nat64_in2out_frag_set_ctx_t_
{
  vlib_main_t *vm;		/**< vlib main; used to refresh session timeouts */
  u32 sess_index;		/**< NAT64 session table index for this flow */
  u32 thread_index;		/**< worker thread owning the session DB */
  u16 l4_offset;		/**< byte offset of the L4 header in the IPv6 packet */
  u8 proto;			/**< L4 protocol (IP_PROTOCOL_TCP or _UDP) */
  u8 first_frag;		/**< 1 if this is the first fragment (carries the L4 header) */
} nat64_in2out_frag_set_ctx_t;
1180
/**
 * @brief ip6_to_ip4_fragmented() callback: rewrite one in2out fragment.
 *
 * Looks up the session and its BIB entry from the per-thread DB, refreshes
 * the session timeout, and writes the translated outside source/destination
 * addresses into the IPv4 header.  Only the first fragment carries the L4
 * header, so the source-port rewrite and the incremental TCP checksum
 * adjustment happen only when ctx->first_frag is set.
 *
 * @return 0 on success, -1 if the session or BIB entry no longer exists.
 */
static int
nat64_in2out_frag_set_cb (ip6_header_t * ip6, ip4_header_t * ip4, void *arg)
{
  nat64_main_t *nm = &nat64_main;
  nat64_in2out_frag_set_ctx_t *ctx = arg;
  nat64_db_st_entry_t *ste;
  nat64_db_bib_entry_t *bibe;
  udp_header_t *udp;
  nat64_db_t *db = &nm->db[ctx->thread_index];

  ste = nat64_db_st_entry_by_index (db, ctx->proto, ctx->sess_index);
  if (!ste)
    return -1;

  bibe = nat64_db_bib_entry_by_index (db, ctx->proto, ste->bibe_index);
  if (!bibe)
    return -1;

  nat64_session_reset_timeout (ste, ctx->vm);

  if (ctx->first_frag)
    {
      udp = (udp_header_t *) u8_ptr_add (ip6, ctx->l4_offset);

      if (ctx->proto == IP_PROTOCOL_TCP)
        {
          u16 *checksum;
          ip_csum_t csum;
          tcp_header_t *tcp = (tcp_header_t *) udp;

          nat64_tcp_session_set_state (ste, tcp, 1);
          /* Incrementally patch the TCP checksum: subtract the inside
             port/addresses, add the outside ones. */
          checksum = &tcp->checksum;
          csum = ip_csum_sub_even (*checksum, tcp->src_port);
          csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[0]);
          csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[1]);
          csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[0]);
          csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[1]);
          csum = ip_csum_add_even (csum, bibe->out_port);
          csum = ip_csum_add_even (csum, bibe->out_addr.as_u32);
          csum = ip_csum_add_even (csum, ste->out_r_addr.as_u32);
          *checksum = ip_csum_fold (csum);
        }

      /* NOTE(review): for UDP the source port is rewritten but the UDP
         checksum is not adjusted here — presumably handled (or zeroed)
         elsewhere in the fragmented path; confirm. */
      udp->src_port = bibe->out_port;
    }

  ip4->src_address.as_u32 = bibe->out_addr.as_u32;
  ip4->dst_address.as_u32 = ste->out_r_addr.as_u32;

  return 0;
}
1232
/**
 * @brief Hairpin one in2out fragment back into the IPv6 network.
 *
 * The destination is a NAT64-translated address owned by this box, so the
 * packet stays IPv6: the source becomes the session's outside mapping
 * (embedded in the NAT64 prefix) and the destination becomes the inside
 * address of the peer's BIB entry (searched across all per-thread DBs).
 * Port rewrites and the incremental L4 checksum fixup apply only to the
 * first fragment, which is the one carrying the L4 header; `checksum` and
 * `csum` are written only under the same ctx->first_frag guard that
 * initializes them.
 *
 * @return 0 on success, -1 when session/BIB state cannot be found.
 */
static int
nat64_in2out_frag_hairpinning (vlib_buffer_t * b, ip6_header_t * ip6,
                               nat64_in2out_frag_set_ctx_t * ctx)
{
  nat64_main_t *nm = &nat64_main;
  nat64_db_st_entry_t *ste;
  nat64_db_bib_entry_t *bibe;
  udp_header_t *udp = (udp_header_t *) u8_ptr_add (ip6, ctx->l4_offset);
  tcp_header_t *tcp = (tcp_header_t *) udp;
  u16 sport = udp->src_port;
  u16 dport = udp->dst_port;
  u16 *checksum;
  ip_csum_t csum;
  ip46_address_t daddr;
  nat64_db_t *db = &nm->db[ctx->thread_index];

  if (ctx->first_frag)
    {
      if (ctx->proto == IP_PROTOCOL_UDP)
        checksum = &udp->checksum;
      else
        checksum = &tcp->checksum;

      /* Back out the old addresses and ports; the new values are folded
         back in after the rewrite below. */
      csum = ip_csum_sub_even (*checksum, ip6->src_address.as_u64[0]);
      csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[1]);
      csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[0]);
      csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[1]);
      csum = ip_csum_sub_even (csum, sport);
      csum = ip_csum_sub_even (csum, dport);
    }

  ste = nat64_db_st_entry_by_index (db, ctx->proto, ctx->sess_index);
  if (!ste)
    return -1;

  bibe = nat64_db_bib_entry_by_index (db, ctx->proto, ste->bibe_index);
  if (!bibe)
    return -1;

  if (ctx->proto == IP_PROTOCOL_TCP)
    nat64_tcp_session_set_state (ste, tcp, 1);

  nat64_session_reset_timeout (ste, ctx->vm);

  sport = bibe->out_port;
  dport = ste->r_port;

  /* New source: the session's outside IPv4 address embedded in the NAT64
     prefix of the BIB's FIB. */
  nat64_compose_ip6 (&ip6->src_address, &bibe->out_addr, bibe->fib_index);

  memset (&daddr, 0, sizeof (daddr));
  daddr.ip4.as_u32 = ste->out_r_addr.as_u32;

  /* Find the BIB entry of the hairpinned peer in any worker's DB. */
  bibe = 0;
  /* *INDENT-OFF* */
  vec_foreach (db, nm->db)
    {
      bibe = nat64_db_bib_entry_find (db, &daddr, dport, ctx->proto, 0, 0);

      if (bibe)
        break;
    }
  /* *INDENT-ON* */

  if (!bibe)
    return -1;

  ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
  ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];

  if (ctx->first_frag)
    {
      udp->dst_port = bibe->in_port;
      udp->src_port = sport;
      csum = ip_csum_add_even (csum, ip6->src_address.as_u64[0]);
      csum = ip_csum_add_even (csum, ip6->src_address.as_u64[1]);
      csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[0]);
      csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[1]);
      csum = ip_csum_add_even (csum, udp->src_port);
      csum = ip_csum_add_even (csum, udp->dst_port);
      *checksum = ip_csum_fold (csum);
    }

  return 0;
}
1317
/**
 * @brief NAT64 in2out fragment handling node.
 *
 * Tracks IPv6 fragments per (src, dst, id, proto) tuple.  The first
 * fragment creates (or finds) the session and publishes its index in the
 * reassembly context; non-first fragments arriving before the first are
 * cached and replayed ("looped back" into this node's own frame) once the
 * session is known.  Each translatable fragment is then rewritten to IPv4
 * via nat64_in2out_frag_set_cb, or hairpinned back to IPv6.
 */
static uword
nat64_in2out_reass_node_fn (vlib_main_t * vm,
                            vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 n_left_from, *from, *to_next;
  nat64_in2out_next_t next_index;
  u32 pkts_processed = 0;
  u32 *fragments_to_drop = 0;	/* buffer indices to send to error-drop at the end */
  u32 *fragments_to_loopback = 0;	/* cached fragments to replay through this node */
  nat64_main_t *nm = &nat64_main;
  u32 thread_index = vm->thread_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u32 next0;
          u8 cached0 = 0;
          ip6_header_t *ip60;
          u16 l4_offset0, frag_offset0;
          u8 l4_protocol0;
          nat_reass_ip6_t *reass0;
          ip6_frag_hdr_t *frag0;
          nat64_db_bib_entry_t *bibe0;
          nat64_db_st_entry_t *ste0;
          udp_header_t *udp0;
          snat_protocol_t proto0;
          u32 sw_if_index0, fib_index0;
          ip46_address_t saddr0, daddr0;
          nat64_in2out_frag_set_ctx_t ctx0;
          nat64_db_t *db = &nm->db[thread_index];

          /* speculatively enqueue b0 to the current next frame */
          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);
          next0 = NAT64_IN2OUT_NEXT_IP4_LOOKUP;

          sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
          fib_index0 =
            fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6,
                                                 sw_if_index0);

          ctx0.thread_index = thread_index;

          /* Operator may configure fragments to be dropped outright. */
          if (PREDICT_FALSE (nat_reass_is_drop_frag (1)))
            {
              next0 = NAT64_IN2OUT_NEXT_DROP;
              b0->error = node->errors[NAT64_IN2OUT_ERROR_DROP_FRAGMENT];
              goto trace0;
            }

          ip60 = (ip6_header_t *) vlib_buffer_get_current (b0);

          if (PREDICT_FALSE
              (ip6_parse
               (ip60, b0->current_length, &l4_protocol0, &l4_offset0,
                &frag_offset0)))
            {
              next0 = NAT64_IN2OUT_NEXT_DROP;
              b0->error = node->errors[NAT64_IN2OUT_ERROR_UNKNOWN];
              goto trace0;
            }

          /* Only TCP and UDP fragments are supported here. */
          if (PREDICT_FALSE
              (!(l4_protocol0 == IP_PROTOCOL_TCP
                 || l4_protocol0 == IP_PROTOCOL_UDP)))
            {
              next0 = NAT64_IN2OUT_NEXT_DROP;
              b0->error = node->errors[NAT64_IN2OUT_ERROR_DROP_FRAGMENT];
              goto trace0;
            }

          udp0 = (udp_header_t *) u8_ptr_add (ip60, l4_offset0);
          frag0 = (ip6_frag_hdr_t *) u8_ptr_add (ip60, frag_offset0);
          proto0 = ip_proto_to_snat_proto (l4_protocol0);

          /* Find or allocate the reassembly tracking entry for this
             fragment stream. */
          reass0 = nat_ip6_reass_find_or_create (ip60->src_address,
                                                 ip60->dst_address,
                                                 frag0->identification,
                                                 l4_protocol0,
                                                 1, &fragments_to_drop);

          if (PREDICT_FALSE (!reass0))
            {
              next0 = NAT64_IN2OUT_NEXT_DROP;
              b0->error = node->errors[NAT64_IN2OUT_ERROR_MAX_REASS];
              goto trace0;
            }

          /* Non-zero fragment offset => not the first fragment. */
          if (PREDICT_TRUE (ip6_frag_hdr_offset (frag0)))
            {
              ctx0.first_frag = 0;
              /* Session not yet known (first fragment hasn't arrived):
                 cache this buffer for later replay. */
              if (PREDICT_FALSE (reass0->sess_index == (u32) ~ 0))
                {
                  if (nat_ip6_reass_add_fragment
                      (reass0, bi0, &fragments_to_drop))
                    {
                      b0->error = node->errors[NAT64_IN2OUT_ERROR_MAX_FRAG];
                      next0 = NAT64_IN2OUT_NEXT_DROP;
                      goto trace0;
                    }
                  cached0 = 1;
                  goto trace0;
                }
            }
          else
            {
              /* First fragment: it carries the L4 ports, so the session
                 can be found or created now. */
              ctx0.first_frag = 1;

              saddr0.as_u64[0] = ip60->src_address.as_u64[0];
              saddr0.as_u64[1] = ip60->src_address.as_u64[1];
              daddr0.as_u64[0] = ip60->dst_address.as_u64[0];
              daddr0.as_u64[1] = ip60->dst_address.as_u64[1];

              ste0 =
                nat64_db_st_entry_find (db, &saddr0, &daddr0,
                                        udp0->src_port, udp0->dst_port,
                                        l4_protocol0, fib_index0, 1);
              if (!ste0)
                {
                  bibe0 =
                    nat64_db_bib_entry_find (db, &saddr0, udp0->src_port,
                                             l4_protocol0, fib_index0, 1);
                  if (!bibe0)
                    {
                      /* No BIB entry yet: allocate an outside
                         address/port pair and create one. */
                      u16 out_port0;
                      ip4_address_t out_addr0;
                      if (nat64_alloc_out_addr_and_port
                          (fib_index0, proto0, &out_addr0, &out_port0,
                           thread_index))
                        {
                          next0 = NAT64_IN2OUT_NEXT_DROP;
                          b0->error =
                            node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                          goto trace0;
                        }

                      bibe0 =
                        nat64_db_bib_entry_create (db,
                                                   &ip60->src_address,
                                                   &out_addr0, udp0->src_port,
                                                   out_port0, fib_index0,
                                                   l4_protocol0, 0);
                      if (!bibe0)
                        {
                          next0 = NAT64_IN2OUT_NEXT_DROP;
                          b0->error =
                            node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                          goto trace0;
                        }
                    }
                  /* Outside destination is the IPv4 address embedded in
                     the NAT64 prefix of the IPv6 destination. */
                  nat64_extract_ip4 (&ip60->dst_address, &daddr0.ip4,
                                     fib_index0);
                  ste0 =
                    nat64_db_st_entry_create (db, bibe0,
                                              &ip60->dst_address, &daddr0.ip4,
                                              udp0->dst_port);
                  if (!ste0)
                    {
                      next0 = NAT64_IN2OUT_NEXT_DROP;
                      b0->error =
                        node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                      goto trace0;
                    }
                }
              /* Publish the session so cached/subsequent fragments can
                 be translated, then release any cached fragments. */
              reass0->sess_index = nat64_db_st_entry_get_index (db, ste0);

              nat_ip6_reass_get_frags (reass0, &fragments_to_loopback);
            }

          ctx0.sess_index = reass0->sess_index;
          ctx0.proto = l4_protocol0;
          ctx0.vm = vm;
          ctx0.l4_offset = l4_offset0;

          if (PREDICT_FALSE (is_hairpinning (&ip60->dst_address)))
            {
              next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP;
              if (nat64_in2out_frag_hairpinning (b0, ip60, &ctx0))
                {
                  next0 = NAT64_IN2OUT_NEXT_DROP;
                  b0->error = node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION];
                }
              goto trace0;
            }
          else
            {
              if (ip6_to_ip4_fragmented (b0, nat64_in2out_frag_set_cb, &ctx0))
                {
                  next0 = NAT64_IN2OUT_NEXT_DROP;
                  b0->error = node->errors[NAT64_IN2OUT_ERROR_UNKNOWN];
                  goto trace0;
                }
            }

        trace0:
          if (PREDICT_FALSE
              ((node->flags & VLIB_NODE_FLAG_TRACE)
               && (b0->flags & VLIB_BUFFER_IS_TRACED)))
            {
              nat64_in2out_reass_trace_t *t =
                vlib_add_trace (vm, node, b0, sizeof (*t));
              t->cached = cached0;
              t->sw_if_index = sw_if_index0;
              t->next_index = next0;
            }

          if (cached0)
            {
              /* Buffer was cached for later replay: undo the speculative
                 enqueue done at the top of the loop. */
              n_left_to_next++;
              to_next--;
            }
          else
            {
              pkts_processed += next0 != NAT64_IN2OUT_NEXT_DROP;

              /* verify speculative enqueue, maybe switch current next frame */
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                               to_next, n_left_to_next,
                                               bi0, next0);
            }

          /* Input exhausted: refill the frame's vector with fragments
             released for loopback, at most VLIB_FRAME_SIZE at a time. */
          if (n_left_from == 0 && vec_len (fragments_to_loopback))
            {
              from = vlib_frame_vector_args (frame);
              u32 len = vec_len (fragments_to_loopback);
              if (len <= VLIB_FRAME_SIZE)
                {
                  clib_memcpy (from, fragments_to_loopback,
                               sizeof (u32) * len);
                  n_left_from = len;
                  vec_reset_length (fragments_to_loopback);
                }
              else
                {
                  clib_memcpy (from,
                               fragments_to_loopback + (len -
                                                        VLIB_FRAME_SIZE),
                               sizeof (u32) * VLIB_FRAME_SIZE);
                  n_left_from = VLIB_FRAME_SIZE;
                  _vec_len (fragments_to_loopback) = len - VLIB_FRAME_SIZE;
                }
            }
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  vlib_node_increment_counter (vm, nat64_in2out_reass_node.index,
                               NAT64_IN2OUT_ERROR_IN2OUT_PACKETS,
                               pkts_processed);

  /* Fragments evicted or rejected along the way all go to drop. */
  nat_send_all_to_node (vm, fragments_to_drop, node,
                        &node->errors[NAT64_IN2OUT_ERROR_DROP_FRAGMENT],
                        NAT64_IN2OUT_NEXT_DROP);

  vec_free (fragments_to_drop);
  vec_free (fragments_to_loopback);
  return frame->n_vectors;
}
1594
/* *INDENT-OFF* */
/** NAT64 in2out fragment-handling node registration. */
VLIB_REGISTER_NODE (nat64_in2out_reass_node) = {
  .function = nat64_in2out_reass_node_fn,
  .name = "nat64-in2out-reass",
  .vector_size = sizeof (u32),
  .format_trace = format_nat64_in2out_reass_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .n_errors = ARRAY_LEN (nat64_in2out_error_strings),
  .error_strings = nat64_in2out_error_strings,
  .n_next_nodes = NAT64_IN2OUT_N_NEXT,
  /* edit / add dispositions here */
  .next_nodes = {
    [NAT64_IN2OUT_NEXT_DROP] = "error-drop",
    [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup",
    [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup",
    [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath",
    [NAT64_IN2OUT_NEXT_REASS] = "nat64-in2out-reass",
  },
};
/* *INDENT-ON* */

/* Emit per-CPU-architecture variants of the node function. */
VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_reass_node,
                              nat64_in2out_reass_node_fn);
1618
/** Trace record for the in2out worker handoff node. */
typedef struct
{
  u32 next_worker_index;	/**< worker thread the packet was assigned to */
  u8 do_handoff;		/**< 1 if handed to another worker, 0 if kept local */
} nat64_in2out_handoff_trace_t;
1624
1625 static u8 *
1626 format_nat64_in2out_handoff_trace (u8 * s, va_list * args)
1627 {
1628   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1629   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1630   nat64_in2out_handoff_trace_t *t =
1631     va_arg (*args, nat64_in2out_handoff_trace_t *);
1632   char *m;
1633
1634   m = t->do_handoff ? "next worker" : "same worker";
1635   s = format (s, "NAT64-IN2OUT-HANDOFF: %s %d", m, t->next_worker_index);
1636
1637   return s;
1638 }
1639
/**
 * @brief Distribute in2out packets to the worker thread owning the flow.
 *
 * The owning worker is derived from the IPv6 source address.  Packets for
 * another worker are batched into per-worker handoff frame-queue elements
 * (dropped if that worker's queue is congested); packets already on the
 * right worker are fed straight into the nat64-in2out node.  The handoff
 * bookkeeping vectors are thread-local statics, allocated lazily on first
 * run.
 */
static inline uword
nat64_in2out_handoff_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
                              vlib_frame_t * frame)
{
  nat64_main_t *nm = &nat64_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  u32 n_left_from, *from, *to_next = 0, *to_next_drop = 0;
  static __thread vlib_frame_queue_elt_t **handoff_queue_elt_by_worker_index;
  static __thread vlib_frame_queue_t **congested_handoff_queue_by_worker_index
    = 0;
  vlib_frame_queue_elt_t *hf = 0;
  vlib_frame_queue_t *fq;
  vlib_frame_t *f = 0, *d = 0;
  int i;
  u32 n_left_to_next_worker = 0, *to_next_worker = 0;
  u32 next_worker_index = 0;
  u32 current_worker_index = ~0;
  u32 thread_index = vm->thread_index;
  u32 fq_index;
  u32 to_node_index;

  fq_index = nm->fq_in2out_index;
  to_node_index = nat64_in2out_node.index;

  /* Lazily size the per-worker bookkeeping on first invocation. */
  if (PREDICT_FALSE (handoff_queue_elt_by_worker_index == 0))
    {
      vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);

      vec_validate_init_empty (congested_handoff_queue_by_worker_index,
                               tm->n_vlib_mains - 1,
                               (vlib_frame_queue_t *) (~0));
    }

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  while (n_left_from > 0)
    {
      u32 bi0;
      vlib_buffer_t *b0;
      ip6_header_t *ip0;
      u8 do_handoff;

      bi0 = from[0];
      from += 1;
      n_left_from -= 1;

      b0 = vlib_get_buffer (vm, bi0);

      ip0 = vlib_buffer_get_current (b0);

      /* Flow ownership is keyed on the IPv6 source address. */
      next_worker_index = nat64_get_worker_in2out (&ip0->src_address);

      if (PREDICT_FALSE (next_worker_index != thread_index))
        {
          do_handoff = 1;

          if (next_worker_index != current_worker_index)
            {
              /* Target worker's queue congested: drop instead of
                 blocking the dataplane. */
              fq =
                is_vlib_frame_queue_congested (fq_index, next_worker_index,
                                               30,
                                               congested_handoff_queue_by_worker_index);

              if (fq)
                {
                  /* if this is 1st frame */
                  if (!d)
                    {
                      d = vlib_get_frame_to_node (vm, nm->error_node_index);
                      to_next_drop = vlib_frame_vector_args (d);
                    }

                  to_next_drop[0] = bi0;
                  to_next_drop += 1;
                  d->n_vectors++;
                  goto trace0;
                }

              /* Switching target worker: record how far the previous
                 element was filled before grabbing a new one. */
              if (hf)
                hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

              hf =
                vlib_get_worker_handoff_queue_elt (fq_index,
                                                   next_worker_index,
                                                   handoff_queue_elt_by_worker_index);
              n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
              to_next_worker = &hf->buffer_index[hf->n_vectors];
              current_worker_index = next_worker_index;
            }

          ASSERT (to_next_worker != 0);

          /* enqueue to correct worker thread */
          to_next_worker[0] = bi0;
          to_next_worker++;
          n_left_to_next_worker--;

          /* Element full: ship it immediately and force a re-grab. */
          if (n_left_to_next_worker == 0)
            {
              hf->n_vectors = VLIB_FRAME_SIZE;
              vlib_put_frame_queue_elt (hf);
              current_worker_index = ~0;
              handoff_queue_elt_by_worker_index[next_worker_index] = 0;
              hf = 0;
            }
        }
      else
        {
          do_handoff = 0;
          /* if this is 1st frame */
          if (!f)
            {
              f = vlib_get_frame_to_node (vm, to_node_index);
              to_next = vlib_frame_vector_args (f);
            }

          /* Already on the owning worker: feed nat64-in2out directly. */
          to_next[0] = bi0;
          to_next += 1;
          f->n_vectors++;
        }

    trace0:
      if (PREDICT_FALSE
          ((node->flags & VLIB_NODE_FLAG_TRACE)
           && (b0->flags & VLIB_BUFFER_IS_TRACED)))
        {
          nat64_in2out_handoff_trace_t *t =
            vlib_add_trace (vm, node, b0, sizeof (*t));
          t->next_worker_index = next_worker_index;
          t->do_handoff = do_handoff;
        }
    }

  /* Flush the locally-processed and dropped frames, if any. */
  if (f)
    vlib_put_frame_to_node (vm, to_node_index, f);

  if (d)
    vlib_put_frame_to_node (vm, nm->error_node_index, d);

  if (hf)
    hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

  /* Ship frames to the worker nodes */
  for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
    {
      if (handoff_queue_elt_by_worker_index[i])
        {
          hf = handoff_queue_elt_by_worker_index[i];
          /*
           * It works better to let the handoff node
           * rate-adapt, always ship the handoff queue element.
           */
          if (1 || hf->n_vectors == hf->last_n_vectors)
            {
              vlib_put_frame_queue_elt (hf);
              handoff_queue_elt_by_worker_index[i] = 0;
            }
          else
            hf->last_n_vectors = hf->n_vectors;
        }
      congested_handoff_queue_by_worker_index[i] =
        (vlib_frame_queue_t *) (~0);
    }
  hf = 0;
  current_worker_index = ~0;
  return frame->n_vectors;
}
1808
/* *INDENT-OFF* */
/** NAT64 in2out worker-handoff node registration; only next node is
 *  error-drop (translated traffic is sent via frame queues instead). */
VLIB_REGISTER_NODE (nat64_in2out_handoff_node) = {
  .function = nat64_in2out_handoff_node_fn,
  .name = "nat64-in2out-handoff",
  .vector_size = sizeof (u32),
  .format_trace = format_nat64_in2out_handoff_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */

/* Emit per-CPU-architecture variants of the node function. */
VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_handoff_node,
                              nat64_in2out_handoff_node_fn);
1827
1828 /*
1829  * fd.io coding-style-patch-verification: ON
1830  *
1831  * Local Variables:
1832  * eval: (c-set-style "gnu")
1833  * End:
1834  */