map: use SVR for MAP-E
[vpp.git] / src / plugins / map / ip6_map.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #include "map.h"
16
17 #include <vnet/ip/ip_frag.h>
18 #include <vnet/ip/ip4_to_ip6.h>
19 #include <vnet/ip/ip6_to_ip4.h>
20 #include <vnet/ip/reass/ip4_sv_reass.h>
21
/*
 * Next-node slots of the ip6-map node.
 * The order of the enumerators defines the slot numbers.
 */
enum ip6_map_next_e
{
  /* Default: decapsulated IPv4 packet goes to the IPv4 lookup. */
  IP6_MAP_NEXT_IP4_LOOKUP = 0,
#ifdef MAP_SKIP_IP6_LOOKUP
  /* Used when ip6_map_ip4_lookup_bypass () finds a pre-resolved entry. */
  IP6_MAP_NEXT_IP4_REWRITE,
#endif
  /* Outer IPv6 packet carries a fragment header. */
  IP6_MAP_NEXT_IP6_REASS,
  /* Inner IPv4 packet is a fragment that needs the security check. */
  IP6_MAP_NEXT_IP4_REASS,
  /* Inner IPv4 packet is larger than the domain MTU. */
  IP6_MAP_NEXT_IP4_FRAGMENT,
  /* ICMPv6 other than echo request/reply. */
  IP6_MAP_NEXT_IP6_ICMP_RELAY,
  /* ICMPv6 echo request/reply. */
  IP6_MAP_NEXT_IP6_LOCAL,
  /* Any packet that ended up with an error. */
  IP6_MAP_NEXT_DROP,
  /* An ICMPv6 destination-unreachable is to be generated. */
  IP6_MAP_NEXT_ICMP,
  /* Number of slots; keep last. */
  IP6_MAP_N_NEXT,
};
37
/* Next-node slots of the ip6-map-ip6-reass node. */
enum ip6_map_ip6_reass_next_e
{
  /* Fragments that are ready are handed back to ip6-map. */
  IP6_MAP_IP6_REASS_NEXT_IP6_MAP = 0,
  /* Malformed, unneeded or evicted fragments. */
  IP6_MAP_IP6_REASS_NEXT_DROP,
  /* Number of slots; keep last. */
  IP6_MAP_IP6_REASS_N_NEXT,
};
44
/* Next-node slots of the ip6-map post-IPv4-reassembly node. */
enum ip6_map_post_ip4_reass_next_e
{
  /* Default next for packets leaving the post-reassembly node. */
  IP6_MAP_POST_IP4_REASS_NEXT_IP4_LOOKUP = 0,
  IP6_MAP_POST_IP4_REASS_NEXT_IP4_FRAGMENT,
  IP6_MAP_POST_IP4_REASS_NEXT_DROP,
  /* Number of slots; keep last. */
  IP6_MAP_POST_IP4_REASS_N_NEXT,
};
52
/*
 * Next-node slots of the ICMP relay node
 * (the node itself is defined further down in the file).
 */
enum ip6_icmp_relay_next_e
{
  IP6_ICMP_RELAY_NEXT_IP4_LOOKUP = 0,
  IP6_ICMP_RELAY_NEXT_DROP,
  /* Number of slots; keep last. */
  IP6_ICMP_RELAY_N_NEXT,
};
59
/*
 * Node registrations that are referenced by the functions below
 * (through their .index member) before they are fully defined.
 */
vlib_node_registration_t ip6_map_post_ip4_reass_node;
vlib_node_registration_t ip6_map_ip6_reass_node;
static vlib_node_registration_t ip6_map_icmp_relay_node;
63
/* Trace record printed by format_ip6_map_post_ip4_reass_trace (). */
typedef struct
{
  /* Index of the MAP domain the packet was matched to. */
  u32 map_domain_index;
  /* L4 port; the formatter converts it with clib_net_to_host_u16 (). */
  u16 port;
  /* Non-zero is printed as "cached", zero as "forwarded". */
  u8 cached;
} map_ip6_map_ip4_reass_trace_t;
70
71 u8 *
72 format_ip6_map_post_ip4_reass_trace (u8 * s, va_list * args)
73 {
74   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
75   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
76   map_ip6_map_ip4_reass_trace_t *t =
77     va_arg (*args, map_ip6_map_ip4_reass_trace_t *);
78   return format (s, "MAP domain index: %d L4 port: %u Status: %s",
79                  t->map_domain_index, clib_net_to_host_u16 (t->port),
80                  t->cached ? "cached" : "forwarded");
81 }
82
/* Trace record printed by format_ip6_map_ip6_reass_trace (). */
typedef struct
{
  /* Fragment offset as recorded by the tracing site. */
  u16 offset;
  /* Length of the fragment payload as recorded by the tracing site. */
  u16 frag_len;
  /* Non-zero is printed as "out", zero as "in". */
  u8 out;
} map_ip6_map_ip6_reass_trace_t;
89
90 u8 *
91 format_ip6_map_ip6_reass_trace (u8 * s, va_list * args)
92 {
93   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
94   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
95   map_ip6_map_ip6_reass_trace_t *t =
96     va_arg (*args, map_ip6_map_ip6_reass_trace_t *);
97   return format (s, "Offset: %d Fragment length: %d Status: %s", t->offset,
98                  t->frag_len, t->out ? "out" : "in");
99 }
100
101 /*
102  * ip6_map_sec_check
103  */
104 static_always_inline bool
105 ip6_map_sec_check (map_domain_t * d, u16 port, ip4_header_t * ip4,
106                    ip6_header_t * ip6)
107 {
108   u16 sp4 = clib_net_to_host_u16 (port);
109   u32 sa4 = clib_net_to_host_u32 (ip4->src_address.as_u32);
110   u64 sal6 = map_get_pfx (d, sa4, sp4);
111   u64 sar6 = map_get_sfx (d, sa4, sp4);
112
113   if (PREDICT_FALSE
114       (sal6 != clib_net_to_host_u64 (ip6->src_address.as_u64[0])
115        || sar6 != clib_net_to_host_u64 (ip6->src_address.as_u64[1])))
116     return (false);
117   return (true);
118 }
119
120 static_always_inline void
121 ip6_map_security_check (map_domain_t * d, vlib_buffer_t * b0,
122                         ip4_header_t * ip4, ip6_header_t * ip6, u32 * next,
123                         u8 * error)
124 {
125   map_main_t *mm = &map_main;
126   if (d->ea_bits_len || d->rules)
127     {
128       if (d->psid_length > 0)
129         {
130           if (!ip4_is_fragment (ip4))
131             {
132               u16 port = ip4_get_port (ip4, 1);
133               if (port)
134                 {
135                   if (mm->sec_check)
136                     *error =
137                       ip6_map_sec_check (d, port, ip4,
138                                          ip6) ? MAP_ERROR_NONE :
139                       MAP_ERROR_DECAP_SEC_CHECK;
140                 }
141               else
142                 {
143                   *error = MAP_ERROR_BAD_PROTOCOL;
144                 }
145             }
146           else
147             {
148               if (mm->sec_check_frag)
149                 {
150                   vnet_buffer (b0)->ip.reass.next_index =
151                     map_main.ip4_sv_reass_custom_next_index;
152                   *next = IP6_MAP_NEXT_IP4_REASS;
153                 }
154             }
155         }
156     }
157 }
158
159 static_always_inline bool
160 ip6_map_ip4_lookup_bypass (vlib_buffer_t * p0, ip4_header_t * ip)
161 {
162 #ifdef MAP_SKIP_IP6_LOOKUP
163   if (FIB_NODE_INDEX_INVALID != pre_resolved[FIB_PROTOCOL_IP4].fei)
164     {
165       vnet_buffer (p0)->ip.adj_index[VLIB_TX] =
166         pre_resolved[FIB_PROTOCOL_IP4].dpo.dpoi_index;
167       return (true);
168     }
169 #endif
170   return (false);
171 }
172
173 /*
174  * ip6_map
175  */
176 static uword
177 ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
178 {
179   u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
180   vlib_node_runtime_t *error_node =
181     vlib_node_get_runtime (vm, ip6_map_node.index);
182   map_main_t *mm = &map_main;
183   vlib_combined_counter_main_t *cm = mm->domain_counters;
184   u32 thread_index = vm->thread_index;
185
186   from = vlib_frame_vector_args (frame);
187   n_left_from = frame->n_vectors;
188   next_index = node->cached_next_index;
189   while (n_left_from > 0)
190     {
191       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
192
193       /* Dual loop */
194       while (n_left_from >= 4 && n_left_to_next >= 2)
195         {
196           u32 pi0, pi1;
197           vlib_buffer_t *p0, *p1;
198           u8 error0 = MAP_ERROR_NONE;
199           u8 error1 = MAP_ERROR_NONE;
200           map_domain_t *d0 = 0, *d1 = 0;
201           ip4_header_t *ip40, *ip41;
202           ip6_header_t *ip60, *ip61;
203           u16 port0 = 0, port1 = 0;
204           u32 map_domain_index0 = ~0, map_domain_index1 = ~0;
205           u32 next0 = IP6_MAP_NEXT_IP4_LOOKUP;
206           u32 next1 = IP6_MAP_NEXT_IP4_LOOKUP;
207
208           /* Prefetch next iteration. */
209           {
210             vlib_buffer_t *p2, *p3;
211
212             p2 = vlib_get_buffer (vm, from[2]);
213             p3 = vlib_get_buffer (vm, from[3]);
214
215             vlib_prefetch_buffer_header (p2, LOAD);
216             vlib_prefetch_buffer_header (p3, LOAD);
217
218             /* IPv6 + IPv4 header + 8 bytes of ULP */
219             CLIB_PREFETCH (p2->data, 68, LOAD);
220             CLIB_PREFETCH (p3->data, 68, LOAD);
221           }
222
223           pi0 = to_next[0] = from[0];
224           pi1 = to_next[1] = from[1];
225           from += 2;
226           n_left_from -= 2;
227           to_next += 2;
228           n_left_to_next -= 2;
229
230           p0 = vlib_get_buffer (vm, pi0);
231           p1 = vlib_get_buffer (vm, pi1);
232           ip60 = vlib_buffer_get_current (p0);
233           ip61 = vlib_buffer_get_current (p1);
234           vlib_buffer_advance (p0, sizeof (ip6_header_t));
235           vlib_buffer_advance (p1, sizeof (ip6_header_t));
236           ip40 = vlib_buffer_get_current (p0);
237           ip41 = vlib_buffer_get_current (p1);
238
239           /*
240            * Encapsulated IPv4 packet
241            *   - IPv4 fragmented -> Pass to virtual reassembly unless security check disabled
242            *   - Lookup/Rewrite or Fragment node in case of packet > MTU
243            * Fragmented IPv6 packet
244            * ICMP IPv6 packet
245            *   - Error -> Pass to ICMPv6/ICMPv4 relay
246            *   - Info -> Pass to IPv6 local
247            * Anything else -> drop
248            */
249           if (PREDICT_TRUE
250               (ip60->protocol == IP_PROTOCOL_IP_IN_IP
251                && clib_net_to_host_u16 (ip60->payload_length) > 20))
252             {
253               d0 =
254                 ip4_map_get_domain ((ip4_address_t *) & ip40->
255                                     src_address.as_u32, &map_domain_index0,
256                                     &error0);
257             }
258           else if (ip60->protocol == IP_PROTOCOL_ICMP6 &&
259                    clib_net_to_host_u16 (ip60->payload_length) >
260                    sizeof (icmp46_header_t))
261             {
262               icmp46_header_t *icmp = (void *) (ip60 + 1);
263               next0 = (icmp->type == ICMP6_echo_request
264                        || icmp->type ==
265                        ICMP6_echo_reply) ? IP6_MAP_NEXT_IP6_LOCAL :
266                 IP6_MAP_NEXT_IP6_ICMP_RELAY;
267             }
268           else if (ip60->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION)
269             {
270               next0 = IP6_MAP_NEXT_IP6_REASS;
271             }
272           else
273             {
274               error0 = MAP_ERROR_BAD_PROTOCOL;
275             }
276           if (PREDICT_TRUE
277               (ip61->protocol == IP_PROTOCOL_IP_IN_IP
278                && clib_net_to_host_u16 (ip61->payload_length) > 20))
279             {
280               d1 =
281                 ip4_map_get_domain ((ip4_address_t *) & ip41->
282                                     src_address.as_u32, &map_domain_index1,
283                                     &error1);
284             }
285           else if (ip61->protocol == IP_PROTOCOL_ICMP6 &&
286                    clib_net_to_host_u16 (ip61->payload_length) >
287                    sizeof (icmp46_header_t))
288             {
289               icmp46_header_t *icmp = (void *) (ip61 + 1);
290               next1 = (icmp->type == ICMP6_echo_request
291                        || icmp->type ==
292                        ICMP6_echo_reply) ? IP6_MAP_NEXT_IP6_LOCAL :
293                 IP6_MAP_NEXT_IP6_ICMP_RELAY;
294             }
295           else if (ip61->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION)
296             {
297               next1 = IP6_MAP_NEXT_IP6_REASS;
298             }
299           else
300             {
301               error1 = MAP_ERROR_BAD_PROTOCOL;
302             }
303
304           if (d0)
305             {
306               /* MAP inbound security check */
307               ip6_map_security_check (d0, p0, ip40, ip60, &next0, &error0);
308
309               if (PREDICT_TRUE (error0 == MAP_ERROR_NONE &&
310                                 next0 == IP6_MAP_NEXT_IP4_LOOKUP))
311                 {
312                   if (PREDICT_FALSE
313                       (d0->mtu
314                        && (clib_host_to_net_u16 (ip40->length) > d0->mtu)))
315                     {
316                       vnet_buffer (p0)->ip_frag.flags = 0;
317                       vnet_buffer (p0)->ip_frag.next_index =
318                         IP4_FRAG_NEXT_IP4_LOOKUP;
319                       vnet_buffer (p0)->ip_frag.mtu = d0->mtu;
320                       next0 = IP6_MAP_NEXT_IP4_FRAGMENT;
321                     }
322                   else
323                     {
324                       next0 =
325                         ip6_map_ip4_lookup_bypass (p0,
326                                                    ip40) ?
327                         IP6_MAP_NEXT_IP4_REWRITE : next0;
328                     }
329                   vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
330                                                    thread_index,
331                                                    map_domain_index0, 1,
332                                                    clib_net_to_host_u16
333                                                    (ip40->length));
334                 }
335             }
336           if (d1)
337             {
338               /* MAP inbound security check */
339               ip6_map_security_check (d1, p1, ip41, ip61, &next1, &error1);
340
341               if (PREDICT_TRUE (error1 == MAP_ERROR_NONE &&
342                                 next1 == IP6_MAP_NEXT_IP4_LOOKUP))
343                 {
344                   if (PREDICT_FALSE
345                       (d1->mtu
346                        && (clib_host_to_net_u16 (ip41->length) > d1->mtu)))
347                     {
348                       vnet_buffer (p1)->ip_frag.flags = 0;
349                       vnet_buffer (p1)->ip_frag.next_index =
350                         IP4_FRAG_NEXT_IP4_LOOKUP;
351                       vnet_buffer (p1)->ip_frag.mtu = d1->mtu;
352                       next1 = IP6_MAP_NEXT_IP4_FRAGMENT;
353                     }
354                   else
355                     {
356                       next1 =
357                         ip6_map_ip4_lookup_bypass (p1,
358                                                    ip41) ?
359                         IP6_MAP_NEXT_IP4_REWRITE : next1;
360                     }
361                   vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
362                                                    thread_index,
363                                                    map_domain_index1, 1,
364                                                    clib_net_to_host_u16
365                                                    (ip41->length));
366                 }
367             }
368
369           if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
370             {
371               map_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr));
372               tr->map_domain_index = map_domain_index0;
373               tr->port = port0;
374             }
375
376           if (PREDICT_FALSE (p1->flags & VLIB_BUFFER_IS_TRACED))
377             {
378               map_trace_t *tr = vlib_add_trace (vm, node, p1, sizeof (*tr));
379               tr->map_domain_index = map_domain_index1;
380               tr->port = port1;
381             }
382
383           if (error0 == MAP_ERROR_DECAP_SEC_CHECK && mm->icmp6_enabled)
384             {
385               /* Set ICMP parameters */
386               vlib_buffer_advance (p0, -sizeof (ip6_header_t));
387               icmp6_error_set_vnet_buffer (p0, ICMP6_destination_unreachable,
388                                            ICMP6_destination_unreachable_source_address_failed_policy,
389                                            0);
390               next0 = IP6_MAP_NEXT_ICMP;
391             }
392           else
393             {
394               next0 = (error0 == MAP_ERROR_NONE) ? next0 : IP6_MAP_NEXT_DROP;
395             }
396
397           if (error1 == MAP_ERROR_DECAP_SEC_CHECK && mm->icmp6_enabled)
398             {
399               /* Set ICMP parameters */
400               vlib_buffer_advance (p1, -sizeof (ip6_header_t));
401               icmp6_error_set_vnet_buffer (p1, ICMP6_destination_unreachable,
402                                            ICMP6_destination_unreachable_source_address_failed_policy,
403                                            0);
404               next1 = IP6_MAP_NEXT_ICMP;
405             }
406           else
407             {
408               next1 = (error1 == MAP_ERROR_NONE) ? next1 : IP6_MAP_NEXT_DROP;
409             }
410
411           /* Reset packet */
412           if (next0 == IP6_MAP_NEXT_IP6_LOCAL)
413             vlib_buffer_advance (p0, -sizeof (ip6_header_t));
414           if (next1 == IP6_MAP_NEXT_IP6_LOCAL)
415             vlib_buffer_advance (p1, -sizeof (ip6_header_t));
416
417           p0->error = error_node->errors[error0];
418           p1->error = error_node->errors[error1];
419           vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
420                                            n_left_to_next, pi0, pi1, next0,
421                                            next1);
422         }
423
424       /* Single loop */
425       while (n_left_from > 0 && n_left_to_next > 0)
426         {
427           u32 pi0;
428           vlib_buffer_t *p0;
429           u8 error0 = MAP_ERROR_NONE;
430           map_domain_t *d0 = 0;
431           ip4_header_t *ip40;
432           ip6_header_t *ip60;
433           i32 port0 = 0;
434           u32 map_domain_index0 = ~0;
435           u32 next0 = IP6_MAP_NEXT_IP4_LOOKUP;
436
437           pi0 = to_next[0] = from[0];
438           from += 1;
439           n_left_from -= 1;
440           to_next += 1;
441           n_left_to_next -= 1;
442
443           p0 = vlib_get_buffer (vm, pi0);
444           ip60 = vlib_buffer_get_current (p0);
445           vlib_buffer_advance (p0, sizeof (ip6_header_t));
446           ip40 = vlib_buffer_get_current (p0);
447
448           /*
449            * Encapsulated IPv4 packet
450            *   - IPv4 fragmented -> Pass to virtual reassembly unless security check disabled
451            *   - Lookup/Rewrite or Fragment node in case of packet > MTU
452            * Fragmented IPv6 packet
453            * ICMP IPv6 packet
454            *   - Error -> Pass to ICMPv6/ICMPv4 relay
455            *   - Info -> Pass to IPv6 local
456            * Anything else -> drop
457            */
458           if (PREDICT_TRUE
459               (ip60->protocol == IP_PROTOCOL_IP_IN_IP
460                && clib_net_to_host_u16 (ip60->payload_length) > 20))
461             {
462               d0 =
463                 ip4_map_get_domain ((ip4_address_t *) & ip40->
464                                     src_address.as_u32, &map_domain_index0,
465                                     &error0);
466             }
467           else if (ip60->protocol == IP_PROTOCOL_ICMP6 &&
468                    clib_net_to_host_u16 (ip60->payload_length) >
469                    sizeof (icmp46_header_t))
470             {
471               icmp46_header_t *icmp = (void *) (ip60 + 1);
472               next0 = (icmp->type == ICMP6_echo_request
473                        || icmp->type ==
474                        ICMP6_echo_reply) ? IP6_MAP_NEXT_IP6_LOCAL :
475                 IP6_MAP_NEXT_IP6_ICMP_RELAY;
476             }
477           else if (ip60->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION &&
478                    (((ip6_frag_hdr_t *) (ip60 + 1))->next_hdr ==
479                     IP_PROTOCOL_IP_IN_IP))
480             {
481               next0 = IP6_MAP_NEXT_IP6_REASS;
482             }
483           else
484             {
485               /* XXX: Move get_domain to ip6_get_domain lookup on source */
486               //error0 = MAP_ERROR_BAD_PROTOCOL;
487               vlib_buffer_advance (p0, -sizeof (ip6_header_t));
488               vnet_feature_next (&next0, p0);
489             }
490
491           if (d0)
492             {
493               /* MAP inbound security check */
494               ip6_map_security_check (d0, p0, ip40, ip60, &next0, &error0);
495
496               if (PREDICT_TRUE (error0 == MAP_ERROR_NONE &&
497                                 next0 == IP6_MAP_NEXT_IP4_LOOKUP))
498                 {
499                   if (PREDICT_FALSE
500                       (d0->mtu
501                        && (clib_host_to_net_u16 (ip40->length) > d0->mtu)))
502                     {
503                       vnet_buffer (p0)->ip_frag.flags = 0;
504                       vnet_buffer (p0)->ip_frag.next_index =
505                         IP4_FRAG_NEXT_IP4_LOOKUP;
506                       vnet_buffer (p0)->ip_frag.mtu = d0->mtu;
507                       next0 = IP6_MAP_NEXT_IP4_FRAGMENT;
508                     }
509                   else
510                     {
511                       next0 =
512                         ip6_map_ip4_lookup_bypass (p0,
513                                                    ip40) ?
514                         IP6_MAP_NEXT_IP4_REWRITE : next0;
515                     }
516                   vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
517                                                    thread_index,
518                                                    map_domain_index0, 1,
519                                                    clib_net_to_host_u16
520                                                    (ip40->length));
521                 }
522             }
523
524           if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
525             {
526               map_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr));
527               tr->map_domain_index = map_domain_index0;
528               tr->port = (u16) port0;
529             }
530
531           if (mm->icmp6_enabled &&
532               (error0 == MAP_ERROR_DECAP_SEC_CHECK
533                || error0 == MAP_ERROR_NO_DOMAIN))
534             {
535               /* Set ICMP parameters */
536               vlib_buffer_advance (p0, -sizeof (ip6_header_t));
537               icmp6_error_set_vnet_buffer (p0, ICMP6_destination_unreachable,
538                                            ICMP6_destination_unreachable_source_address_failed_policy,
539                                            0);
540               next0 = IP6_MAP_NEXT_ICMP;
541             }
542           else
543             {
544               next0 = (error0 == MAP_ERROR_NONE) ? next0 : IP6_MAP_NEXT_DROP;
545             }
546
547           /* Reset packet */
548           if (next0 == IP6_MAP_NEXT_IP6_LOCAL)
549             vlib_buffer_advance (p0, -sizeof (ip6_header_t));
550
551           p0->error = error_node->errors[error0];
552           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
553                                            n_left_to_next, pi0, next0);
554         }
555       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
556     }
557
558   return frame->n_vectors;
559 }
560
561
562 static_always_inline void
563 ip6_map_ip6_reass_prepare (vlib_main_t * vm, vlib_node_runtime_t * node,
564                            map_ip6_reass_t * r, u32 ** fragments_ready,
565                            u32 ** fragments_to_drop)
566 {
567   ip4_header_t *ip40;
568   ip6_header_t *ip60;
569   ip6_frag_hdr_t *frag0;
570   vlib_buffer_t *p0;
571
572   if (!r->ip4_header.ip_version_and_header_length)
573     return;
574
575   //The IP header is here, we need to check for packets
576   //that can be forwarded
577   int i;
578   for (i = 0; i < MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++)
579     {
580       if (r->fragments[i].pi == ~0 ||
581           ((!r->fragments[i].next_data_len)
582            && (r->fragments[i].next_data_offset != (0xffff))))
583         continue;
584
585       p0 = vlib_get_buffer (vm, r->fragments[i].pi);
586       ip60 = vlib_buffer_get_current (p0);
587       frag0 = (ip6_frag_hdr_t *) (ip60 + 1);
588       ip40 = (ip4_header_t *) (frag0 + 1);
589
590       if (ip6_frag_hdr_offset (frag0))
591         {
592           //Not first fragment, add the IPv4 header
593           clib_memcpy_fast (ip40, &r->ip4_header, 20);
594         }
595
596 #ifdef MAP_IP6_REASS_COUNT_BYTES
597       r->forwarded +=
598         clib_net_to_host_u16 (ip60->payload_length) - sizeof (*frag0);
599 #endif
600
601       if (ip6_frag_hdr_more (frag0))
602         {
603           //Not last fragment, we copy end of next
604           clib_memcpy_fast (u8_ptr_add (ip60, p0->current_length),
605                             r->fragments[i].next_data, 20);
606           p0->current_length += 20;
607           ip60->payload_length = u16_net_add (ip60->payload_length, 20);
608         }
609
610       if (!ip4_is_fragment (ip40))
611         {
612           ip40->fragment_id = frag_id_6to4 (frag0->identification);
613           ip40->flags_and_fragment_offset =
614             clib_host_to_net_u16 (ip6_frag_hdr_offset (frag0));
615         }
616       else
617         {
618           ip40->flags_and_fragment_offset =
619             clib_host_to_net_u16 (ip4_get_fragment_offset (ip40) +
620                                   ip6_frag_hdr_offset (frag0));
621         }
622
623       if (ip6_frag_hdr_more (frag0))
624         ip40->flags_and_fragment_offset |=
625           clib_host_to_net_u16 (IP4_HEADER_FLAG_MORE_FRAGMENTS);
626
627       ip40->length =
628         clib_host_to_net_u16 (p0->current_length - sizeof (*ip60) -
629                               sizeof (*frag0));
630       ip40->checksum = ip4_header_checksum (ip40);
631
632       if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
633         {
634           map_ip6_map_ip6_reass_trace_t *tr =
635             vlib_add_trace (vm, node, p0, sizeof (*tr));
636           tr->offset = ip4_get_fragment_offset (ip40);
637           tr->frag_len = clib_net_to_host_u16 (ip40->length) - sizeof (*ip40);
638           tr->out = 1;
639         }
640
641       vec_add1 (*fragments_ready, r->fragments[i].pi);
642       r->fragments[i].pi = ~0;
643       r->fragments[i].next_data_len = 0;
644       r->fragments[i].next_data_offset = 0;
645       map_main.ip6_reass_buffered_counter--;
646
647       //TODO: Best solution would be that ip6_map handles extension headers
648       // and ignores atomic fragment. But in the meantime, let's just copy the header.
649
650       u8 protocol = frag0->next_hdr;
651       memmove (u8_ptr_add (ip40, -sizeof (*ip60)), ip60, sizeof (*ip60));
652       ((ip6_header_t *) u8_ptr_add (ip40, -sizeof (*ip60)))->protocol =
653         protocol;
654       vlib_buffer_advance (p0, sizeof (*frag0));
655     }
656 }
657
658 void
659 map_ip6_drop_pi (u32 pi)
660 {
661   vlib_main_t *vm = vlib_get_main ();
662   vlib_node_runtime_t *n =
663     vlib_node_get_runtime (vm, ip6_map_ip6_reass_node.index);
664   vlib_set_next_frame_buffer (vm, n, IP6_MAP_IP6_REASS_NEXT_DROP, pi);
665 }
666
667 /*
668  * ip6_reass
669  * TODO: We should count the number of successfully
670  * transmitted fragment bytes and compare that to the last fragment
671  * offset such that we can free the reassembly structure when all fragments
672  * have been forwarded.
673  */
674 static uword
675 ip6_map_ip6_reass (vlib_main_t * vm,
676                    vlib_node_runtime_t * node, vlib_frame_t * frame)
677 {
678   u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
679   vlib_node_runtime_t *error_node =
680     vlib_node_get_runtime (vm, ip6_map_ip6_reass_node.index);
681   u32 *fragments_to_drop = NULL;
682   u32 *fragments_ready = NULL;
683
684   from = vlib_frame_vector_args (frame);
685   n_left_from = frame->n_vectors;
686   next_index = node->cached_next_index;
687   while (n_left_from > 0)
688     {
689       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
690
691       /* Single loop */
692       while (n_left_from > 0 && n_left_to_next > 0)
693         {
694           u32 pi0;
695           vlib_buffer_t *p0;
696           u8 error0 = MAP_ERROR_NONE;
697           ip6_header_t *ip60;
698           ip6_frag_hdr_t *frag0;
699           u16 offset;
700           u16 next_offset;
701           u16 frag_len;
702
703           pi0 = to_next[0] = from[0];
704           from += 1;
705           n_left_from -= 1;
706           to_next += 1;
707           n_left_to_next -= 1;
708
709           p0 = vlib_get_buffer (vm, pi0);
710           ip60 = vlib_buffer_get_current (p0);
711           frag0 = (ip6_frag_hdr_t *) (ip60 + 1);
712           offset =
713             clib_host_to_net_u16 (frag0->fragment_offset_and_more) & (~7);
714           frag_len =
715             clib_net_to_host_u16 (ip60->payload_length) - sizeof (*frag0);
716           next_offset =
717             ip6_frag_hdr_more (frag0) ? (offset + frag_len) : (0xffff);
718
719           //FIXME: Support other extension headers, maybe
720
721           if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
722             {
723               map_ip6_map_ip6_reass_trace_t *tr =
724                 vlib_add_trace (vm, node, p0, sizeof (*tr));
725               tr->offset = offset;
726               tr->frag_len = frag_len;
727               tr->out = 0;
728             }
729
730           map_ip6_reass_lock ();
731           map_ip6_reass_t *r =
732             map_ip6_reass_get (&ip60->src_address, &ip60->dst_address,
733                                frag0->identification, frag0->next_hdr,
734                                &fragments_to_drop);
735           //FIXME: Use better error codes
736           if (PREDICT_FALSE (!r))
737             {
738               // Could not create a caching entry
739               error0 = MAP_ERROR_FRAGMENT_MEMORY;
740             }
741           else if (PREDICT_FALSE ((frag_len <= 20 &&
742                                    (ip6_frag_hdr_more (frag0) || (!offset)))))
743             {
744               //Very small fragment are restricted to the last one and
745               //can't be the first one
746               error0 = MAP_ERROR_FRAGMENT_MALFORMED;
747             }
748           else
749             if (map_ip6_reass_add_fragment
750                 (r, pi0, offset, next_offset, (u8 *) (frag0 + 1), frag_len))
751             {
752               map_ip6_reass_free (r, &fragments_to_drop);
753               error0 = MAP_ERROR_FRAGMENT_MEMORY;
754             }
755           else
756             {
757 #ifdef MAP_IP6_REASS_COUNT_BYTES
758               if (!ip6_frag_hdr_more (frag0))
759                 r->expected_total = offset + frag_len;
760 #endif
761               ip6_map_ip6_reass_prepare (vm, node, r, &fragments_ready,
762                                          &fragments_to_drop);
763 #ifdef MAP_IP6_REASS_COUNT_BYTES
764               if (r->forwarded >= r->expected_total)
765                 map_ip6_reass_free (r, &fragments_to_drop);
766 #endif
767             }
768           map_ip6_reass_unlock ();
769
770           if (error0 == MAP_ERROR_NONE)
771             {
772               if (frag_len > 20)
773                 {
774                   //Dequeue the packet
775                   n_left_to_next++;
776                   to_next--;
777                 }
778               else
779                 {
780                   //All data from that packet was copied no need to keep it, but this is not an error
781                   p0->error = error_node->errors[MAP_ERROR_NONE];
782                   vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
783                                                    to_next, n_left_to_next,
784                                                    pi0,
785                                                    IP6_MAP_IP6_REASS_NEXT_DROP);
786                 }
787             }
788           else
789             {
790               p0->error = error_node->errors[error0];
791               vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
792                                                n_left_to_next, pi0,
793                                                IP6_MAP_IP6_REASS_NEXT_DROP);
794             }
795         }
796       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
797     }
798
799   map_send_all_to_node (vm, fragments_ready, node,
800                         &error_node->errors[MAP_ERROR_NONE],
801                         IP6_MAP_IP6_REASS_NEXT_IP6_MAP);
802   map_send_all_to_node (vm, fragments_to_drop, node,
803                         &error_node->errors[MAP_ERROR_FRAGMENT_DROPPED],
804                         IP6_MAP_IP6_REASS_NEXT_DROP);
805
806   vec_free (fragments_to_drop);
807   vec_free (fragments_ready);
808   return frame->n_vectors;
809 }
810
811 /*
812  * ip6_map_post_ip4_reass
813  */
814 static uword
815 ip6_map_post_ip4_reass (vlib_main_t * vm,
816                         vlib_node_runtime_t * node, vlib_frame_t * frame)
817 {
818   u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
819   vlib_node_runtime_t *error_node =
820     vlib_node_get_runtime (vm, ip6_map_post_ip4_reass_node.index);
821   map_main_t *mm = &map_main;
822   vlib_combined_counter_main_t *cm = mm->domain_counters;
823   u32 thread_index = vm->thread_index;
824
825   from = vlib_frame_vector_args (frame);
826   n_left_from = frame->n_vectors;
827   next_index = node->cached_next_index;
828   while (n_left_from > 0)
829     {
830       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
831
832       /* Single loop */
833       while (n_left_from > 0 && n_left_to_next > 0)
834         {
835           u32 pi0;
836           vlib_buffer_t *p0;
837           u8 error0 = MAP_ERROR_NONE;
838           map_domain_t *d0;
839           ip4_header_t *ip40;
840           ip6_header_t *ip60;
841           i32 port0 = 0;
842           u32 map_domain_index0 = ~0;
843           u32 next0 = IP6_MAP_POST_IP4_REASS_NEXT_IP4_LOOKUP;
844
845           pi0 = to_next[0] = from[0];
846           from += 1;
847           n_left_from -= 1;
848           to_next += 1;
849           n_left_to_next -= 1;
850
851           p0 = vlib_get_buffer (vm, pi0);
852           ip40 = vlib_buffer_get_current (p0);
853           ip60 = ((ip6_header_t *) ip40) - 1;
854
855           d0 =
856             ip4_map_get_domain ((ip4_address_t *) & ip40->src_address.as_u32,
857                                 &map_domain_index0, &error0);
858
859           port0 = vnet_buffer (p0)->ip.reass.l4_src_port;
860
861           if (PREDICT_TRUE (error0 == MAP_ERROR_NONE))
862             error0 =
863               ip6_map_sec_check (d0, port0, ip40,
864                                  ip60) ? MAP_ERROR_NONE :
865               MAP_ERROR_DECAP_SEC_CHECK;
866
867           if (PREDICT_FALSE
868               (d0->mtu && (clib_host_to_net_u16 (ip40->length) > d0->mtu)
869                && error0 == MAP_ERROR_NONE))
870             {
871               vnet_buffer (p0)->ip_frag.flags = 0;
872               vnet_buffer (p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
873               vnet_buffer (p0)->ip_frag.mtu = d0->mtu;
874               next0 = IP6_MAP_POST_IP4_REASS_NEXT_IP4_FRAGMENT;
875             }
876
877           if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
878             {
879               map_ip6_map_ip4_reass_trace_t *tr =
880                 vlib_add_trace (vm, node, p0, sizeof (*tr));
881               tr->map_domain_index = map_domain_index0;
882               tr->port = port0;
883             }
884
885           if (error0 == MAP_ERROR_NONE)
886             vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
887                                              thread_index,
888                                              map_domain_index0, 1,
889                                              clib_net_to_host_u16
890                                              (ip40->length));
891           next0 =
892             (error0 ==
893              MAP_ERROR_NONE) ? next0 : IP6_MAP_POST_IP4_REASS_NEXT_DROP;
894           p0->error = error_node->errors[error0];
895           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
896                                            n_left_to_next, pi0, next0);
897
898         }
899       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
900     }
901   return frame->n_vectors;
902 }
903
904 /*
905  * ip6_icmp_relay
906  */
907 static uword
908 ip6_map_icmp_relay (vlib_main_t * vm,
909                     vlib_node_runtime_t * node, vlib_frame_t * frame)
910 {
911   u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
912   vlib_node_runtime_t *error_node =
913     vlib_node_get_runtime (vm, ip6_map_icmp_relay_node.index);
914   map_main_t *mm = &map_main;
915   u32 thread_index = vm->thread_index;
916   u16 *fragment_ids, *fid;
917
918   from = vlib_frame_vector_args (frame);
919   n_left_from = frame->n_vectors;
920   next_index = node->cached_next_index;
921
922   /* Get random fragment IDs for replies. */
923   fid = fragment_ids =
924     clib_random_buffer_get_data (&vm->random_buffer,
925                                  n_left_from * sizeof (fragment_ids[0]));
926
927   while (n_left_from > 0)
928     {
929       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
930
931       /* Single loop */
932       while (n_left_from > 0 && n_left_to_next > 0)
933         {
934           u32 pi0;
935           vlib_buffer_t *p0;
936           u8 error0 = MAP_ERROR_NONE;
937           ip6_header_t *ip60;
938           u32 next0 = IP6_ICMP_RELAY_NEXT_IP4_LOOKUP;
939           u32 mtu;
940
941           pi0 = to_next[0] = from[0];
942           from += 1;
943           n_left_from -= 1;
944           to_next += 1;
945           n_left_to_next -= 1;
946
947           p0 = vlib_get_buffer (vm, pi0);
948           ip60 = vlib_buffer_get_current (p0);
949           u16 tlen = clib_net_to_host_u16 (ip60->payload_length);
950
951           /*
952            * In:
953            *  IPv6 header           (40)
954            *  ICMPv6 header          (8)
955            *  IPv6 header           (40)
956            *  Original IPv4 header / packet
957            * Out:
958            *  New IPv4 header
959            *  New ICMP header
960            *  Original IPv4 header / packet
961            */
962
963           /* Need at least ICMP(8) + IPv6(40) + IPv4(20) + L4 header(8) */
964           if (tlen < 76)
965             {
966               error0 = MAP_ERROR_ICMP_RELAY;
967               goto error;
968             }
969
970           icmp46_header_t *icmp60 = (icmp46_header_t *) (ip60 + 1);
971           ip6_header_t *inner_ip60 = (ip6_header_t *) (icmp60 + 2);
972
973           if (inner_ip60->protocol != IP_PROTOCOL_IP_IN_IP)
974             {
975               error0 = MAP_ERROR_ICMP_RELAY;
976               goto error;
977             }
978
979           ip4_header_t *inner_ip40 = (ip4_header_t *) (inner_ip60 + 1);
980           vlib_buffer_advance (p0, 60); /* sizeof ( IPv6 + ICMP + IPv6 - IPv4 - ICMP ) */
981           ip4_header_t *new_ip40 = vlib_buffer_get_current (p0);
982           icmp46_header_t *new_icmp40 = (icmp46_header_t *) (new_ip40 + 1);
983
984           /*
985            * Relay according to RFC2473, section 8.3
986            */
987           switch (icmp60->type)
988             {
989             case ICMP6_destination_unreachable:
990             case ICMP6_time_exceeded:
991             case ICMP6_parameter_problem:
992               /* Type 3 - destination unreachable, Code 1 - host unreachable */
993               new_icmp40->type = ICMP4_destination_unreachable;
994               new_icmp40->code =
995                 ICMP4_destination_unreachable_destination_unreachable_host;
996               break;
997
998             case ICMP6_packet_too_big:
999               /* Type 3 - destination unreachable, Code 4 - packet too big */
1000               /* Potential TODO: Adjust domain tunnel MTU based on the value received here */
1001               mtu = clib_net_to_host_u32 (*((u32 *) (icmp60 + 1)));
1002
1003               /* Check DF flag */
1004               if (!
1005                   (inner_ip40->flags_and_fragment_offset &
1006                    clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT)))
1007                 {
1008                   error0 = MAP_ERROR_ICMP_RELAY;
1009                   goto error;
1010                 }
1011
1012               new_icmp40->type = ICMP4_destination_unreachable;
1013               new_icmp40->code =
1014                 ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set;
1015               *((u32 *) (new_icmp40 + 1)) =
1016                 clib_host_to_net_u32 (mtu < 1280 ? 1280 : mtu);
1017               break;
1018
1019             default:
1020               error0 = MAP_ERROR_ICMP_RELAY;
1021               break;
1022             }
1023
1024           /*
1025            * Ensure the total ICMP packet is no longer than 576 bytes (RFC1812)
1026            */
1027           new_ip40->ip_version_and_header_length = 0x45;
1028           new_ip40->tos = 0;
1029           u16 nlen = (tlen - 20) > 576 ? 576 : tlen - 20;
1030           new_ip40->length = clib_host_to_net_u16 (nlen);
1031           new_ip40->fragment_id = fid[0];
1032           fid++;
1033           new_ip40->ttl = 64;
1034           new_ip40->protocol = IP_PROTOCOL_ICMP;
1035           new_ip40->src_address = mm->icmp4_src_address;
1036           new_ip40->dst_address = inner_ip40->src_address;
1037           new_ip40->checksum = ip4_header_checksum (new_ip40);
1038
1039           new_icmp40->checksum = 0;
1040           ip_csum_t sum = ip_incremental_checksum (0, new_icmp40, nlen - 20);
1041           new_icmp40->checksum = ~ip_csum_fold (sum);
1042
1043           vlib_increment_simple_counter (&mm->icmp_relayed, thread_index, 0,
1044                                          1);
1045
1046         error:
1047           if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
1048             {
1049               map_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr));
1050               tr->map_domain_index = 0;
1051               tr->port = 0;
1052             }
1053
1054           next0 =
1055             (error0 == MAP_ERROR_NONE) ? next0 : IP6_ICMP_RELAY_NEXT_DROP;
1056           p0->error = error_node->errors[error0];
1057           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1058                                            n_left_to_next, pi0, next0);
1059         }
1060       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1061     }
1062
1063   return frame->n_vectors;
1064
1065 }
1066
1067 static char *map_error_strings[] = {
1068 #define _(sym,string) string,
1069   foreach_map_error
1070 #undef _
1071 };
1072
1073 /* *INDENT-OFF* */
1074 VNET_FEATURE_INIT (ip6_map_feature, static) =
1075 {
1076   .arc_name = "ip6-unicast",
1077   .node_name = "ip6-map",
1078   .runs_before = VNET_FEATURES ("ip6-flow-classify"),
1079 };
1080
1081 VLIB_REGISTER_NODE(ip6_map_node) = {
1082   .function = ip6_map,
1083   .name = "ip6-map",
1084   .vector_size = sizeof(u32),
1085   .format_trace = format_map_trace,
1086   .type = VLIB_NODE_TYPE_INTERNAL,
1087
1088   .n_errors = MAP_N_ERROR,
1089   .error_strings = map_error_strings,
1090
1091   .n_next_nodes = IP6_MAP_N_NEXT,
1092   .next_nodes = {
1093     [IP6_MAP_NEXT_IP4_LOOKUP] = "ip4-lookup",
1094 #ifdef MAP_SKIP_IP6_LOOKUP
1095     [IP6_MAP_NEXT_IP4_REWRITE] = "ip4-load-balance",
1096 #endif
1097     [IP6_MAP_NEXT_IP6_REASS] = "ip6-map-ip6-reass",
1098     [IP6_MAP_NEXT_IP4_REASS] = "ip4-sv-reassembly-custom-next",
1099     [IP6_MAP_NEXT_IP4_FRAGMENT] = "ip4-frag",
1100     [IP6_MAP_NEXT_IP6_ICMP_RELAY] = "ip6-map-icmp-relay",
1101     [IP6_MAP_NEXT_IP6_LOCAL] = "ip6-local",
1102     [IP6_MAP_NEXT_DROP] = "error-drop",
1103     [IP6_MAP_NEXT_ICMP] = "ip6-icmp-error",
1104   },
1105 };
1106 /* *INDENT-ON* */
1107
1108 /* *INDENT-OFF* */
1109 VLIB_REGISTER_NODE(ip6_map_ip6_reass_node) = {
1110   .function = ip6_map_ip6_reass,
1111   .name = "ip6-map-ip6-reass",
1112   .vector_size = sizeof(u32),
1113   .format_trace = format_ip6_map_ip6_reass_trace,
1114   .type = VLIB_NODE_TYPE_INTERNAL,
1115   .n_errors = MAP_N_ERROR,
1116   .error_strings = map_error_strings,
1117   .n_next_nodes = IP6_MAP_IP6_REASS_N_NEXT,
1118   .next_nodes = {
1119     [IP6_MAP_IP6_REASS_NEXT_IP6_MAP] = "ip6-map",
1120     [IP6_MAP_IP6_REASS_NEXT_DROP] = "error-drop",
1121   },
1122 };
1123 /* *INDENT-ON* */
1124
1125 /* *INDENT-OFF* */
1126 VLIB_REGISTER_NODE(ip6_map_post_ip4_reass_node) = {
1127   .function = ip6_map_post_ip4_reass,
1128   .name = "ip6-map-post-ip4-reass",
1129   .vector_size = sizeof(u32),
1130   .format_trace = format_ip6_map_post_ip4_reass_trace,
1131   .type = VLIB_NODE_TYPE_INTERNAL,
1132   .n_errors = MAP_N_ERROR,
1133   .error_strings = map_error_strings,
1134   .n_next_nodes = IP6_MAP_POST_IP4_REASS_N_NEXT,
1135   .next_nodes = {
1136     [IP6_MAP_POST_IP4_REASS_NEXT_IP4_LOOKUP] = "ip4-lookup",
1137     [IP6_MAP_POST_IP4_REASS_NEXT_IP4_FRAGMENT] = "ip4-frag",
1138     [IP6_MAP_POST_IP4_REASS_NEXT_DROP] = "error-drop",
1139   },
1140 };
1141 /* *INDENT-ON* */
1142
1143 /* *INDENT-OFF* */
1144 VLIB_REGISTER_NODE(ip6_map_icmp_relay_node, static) = {
1145   .function = ip6_map_icmp_relay,
1146   .name = "ip6-map-icmp-relay",
1147   .vector_size = sizeof(u32),
1148   .format_trace = format_map_trace, //FIXME
1149   .type = VLIB_NODE_TYPE_INTERNAL,
1150   .n_errors = MAP_N_ERROR,
1151   .error_strings = map_error_strings,
1152   .n_next_nodes = IP6_ICMP_RELAY_N_NEXT,
1153   .next_nodes = {
1154     [IP6_ICMP_RELAY_NEXT_IP4_LOOKUP] = "ip4-lookup",
1155     [IP6_ICMP_RELAY_NEXT_DROP] = "error-drop",
1156   },
1157 };
1158 /* *INDENT-ON* */
1159
1160 clib_error_t *
1161 ip6_map_init (vlib_main_t * vm)
1162 {
1163   map_main.ip4_sv_reass_custom_next_index =
1164     ip4_sv_reass_custom_register_next_node
1165     (ip6_map_post_ip4_reass_node.index);
1166   return 0;
1167 }
1168
1169 VLIB_INIT_FUNCTION (ip6_map_init) =
1170 {
1171 .runs_after = VLIB_INITS ("map_init"),};
1172
1173 /*
1174  * fd.io coding-style-patch-verification: ON
1175  *
1176  * Local Variables:
1177  * eval: (c-set-style "gnu")
1178  * End:
1179  */