fa5a68c31c2e8423a56740a93e7b83170bc0bef1
[vpp.git] / src / vnet / sr / sr_replicate.c
1 /*
2  * sr_replicate.c: ipv6 segment routing replicator for multicast
3  *
4  * Copyright (c) 2016 Cisco and/or its affiliates.
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 /**
18  *  @file
19  *  @brief Functions for replicating packets across SR tunnels.
20  *
21  *  Leverages rte_pktmbuf_clone() so there is no memcpy for
22  *  invariant parts of the packet.
23  *
24  *  @note Currently requires DPDK
25 */
26
27 #if DPDK > 0                    /* Cannot run replicate without DPDK */
28 #include <vlib/vlib.h>
29 #include <vnet/vnet.h>
30 #include <vnet/pg/pg.h>
31 #include <vnet/sr/sr.h>
32 #include <vnet/devices/dpdk/dpdk.h>
33 #include <vnet/devices/dpdk/dpdk_priv.h>
34 #include <vnet/ip/ip.h>
35 #include <vnet/fib/ip6_fib.h>
36
37 #include <vppinfra/hash.h>
38 #include <vppinfra/error.h>
39 #include <vppinfra/elog.h>
40
41 /**
42  *   @brief sr_replicate state.
43  *
44 */
45 typedef struct
46 {
47   /* convenience */
48   vlib_main_t *vlib_main;
49   vnet_main_t *vnet_main;
50 } sr_replicate_main_t;
51
52 sr_replicate_main_t sr_replicate_main;
53
54 /**
55  *    @brief Information to display in packet trace.
56  *
57 */
58 typedef struct
59 {
60   ip6_address_t src, dst;
61   u16 length;
62   u32 next_index;
63   u32 tunnel_index;
64   u8 sr[256];
65 } sr_replicate_trace_t;
66
67 /**
68  *  @brief packet trace format function.
69  *
70  *  @param *s u8 used for string output
71  *  @param *args va_list  structured input to va_arg to output @ref sr_replicate_trace_t
72  *  @return *s u8 - formatted trace output
73 */
74 static u8 *
75 format_sr_replicate_trace (u8 * s, va_list * args)
76 {
77   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
78   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
79   sr_replicate_trace_t *t = va_arg (*args, sr_replicate_trace_t *);
80   ip6_sr_main_t *sm = &sr_main;
81   ip6_sr_tunnel_t *tun = pool_elt_at_index (sm->tunnels, t->tunnel_index);
82   ip6_fib_t *rx_fib, *tx_fib;
83
84   rx_fib = ip6_fib_get (tun->rx_fib_index);
85   tx_fib = ip6_fib_get (tun->tx_fib_index);
86
87   s = format
88     (s, "SR-REPLICATE: next %s ip6 src %U dst %U len %u\n"
89      "           rx-fib-id %d tx-fib-id %d\n%U",
90      "ip6-lookup",
91      format_ip6_address, &t->src,
92      format_ip6_address, &t->dst, t->length,
93      rx_fib->table_id, tx_fib->table_id,
94      format_ip6_sr_header, t->sr, 0 /* print_hmac */ );
95   return s;
96
97 }
98
/**
 * @brief X-macro table of sr-replicate counters: _(symbol, description).
 *
 * Expanded once to build the error enum and once to build the
 * matching string table, so both always stay in the same order.
 */
#define foreach_sr_replicate_error                              \
  _(REPLICATED, "sr packets replicated")                        \
  _(NO_BUFFERS, "error allocating buffers for replicas")        \
  _(NO_REPLICAS, "no replicas were needed")                     \
  _(NO_BUFFER_DROPS, "sr no buffer drops")

/**
 * @brief Enum of SR replicate error/counter codes
 *
 * SR_REPLICATE_N_ERROR is the number of codes, not a code itself.
 */
typedef enum
{
#define _(name, text) SR_REPLICATE_ERROR_##name,
  foreach_sr_replicate_error
#undef _
    SR_REPLICATE_N_ERROR,
} sr_replicate_error_t;

/**
 * @brief Human readable descriptions, indexed by sr_replicate_error_t
 */
static char *sr_replicate_error_strings[] = {
#define _(name, text) text,
  foreach_sr_replicate_error
#undef _
};
124
/**
 * @brief Next-node indices used by sr-replicate.
 *
 * SR_REPLICATE_N_NEXT is the number of next nodes, not a next node.
 */
typedef enum
{
  SR_REPLICATE_NEXT_IP6_LOOKUP = 0,
  SR_REPLICATE_N_NEXT,
} sr_replicate_next_t;
134
135 /**
136  *   @brief Single loop packet replicator.
137  *
138  *   @node sr-replicate
139  *   @param vm vlib_main_t
140  *   @return frame->n_vectors uword
141 */
142 static uword
143 sr_replicate_node_fn (vlib_main_t * vm,
144                       vlib_node_runtime_t * node, vlib_frame_t * frame)
145 {
146   dpdk_main_t *dm = &dpdk_main;
147   u32 n_left_from, *from, *to_next;
148   sr_replicate_next_t next_index;
149   int pkts_replicated = 0;
150   ip6_sr_main_t *sm = &sr_main;
151   int no_buffer_drops = 0;
152   vlib_buffer_free_list_t *fl;
153   unsigned socket_id = rte_socket_id ();
154
155   from = vlib_frame_vector_args (frame);
156   n_left_from = frame->n_vectors;
157   next_index = node->cached_next_index;
158
159   fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
160
161   while (n_left_from > 0)
162     {
163       u32 n_left_to_next;
164
165       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
166
167       while (n_left_from > 0 && n_left_to_next > 0)
168         {
169           u32 bi0, hdr_bi0;
170           vlib_buffer_t *b0, *orig_b0;
171           struct rte_mbuf *orig_mb0 = 0, *hdr_mb0 = 0, *clone0 = 0;
172           struct rte_mbuf **hdr_vec = 0, **rte_mbuf_vec = 0;
173           ip6_sr_policy_t *pol0 = 0;
174           ip6_sr_tunnel_t *t0 = 0;
175           ip6_sr_header_t *hdr_sr0 = 0;
176           ip6_header_t *ip0 = 0, *hdr_ip0 = 0;
177           int num_replicas = 0;
178           int i;
179           u32 len_bytes = sizeof (ip6_header_t);
180           u8 next_hdr, ip_next_hdr = IPPROTO_IPV6_ROUTE;
181
182           bi0 = from[0];
183
184           b0 = vlib_get_buffer (vm, bi0);
185           orig_b0 = b0;
186
187           pol0 = pool_elt_at_index (sm->policies,
188                                     vnet_buffer (b0)->ip.save_protocol);
189
190           ip0 = vlib_buffer_get_current (b0);
191           /* Skip forward to the punch-in point */
192           vlib_buffer_advance (b0, sizeof (*ip0));
193           next_hdr = ip0->protocol;
194
195           /* HBH must immediately follow ipv6 header */
196           if (PREDICT_FALSE
197               (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS))
198             {
199               ip6_hop_by_hop_ext_t *ext_hdr =
200                 (ip6_hop_by_hop_ext_t *) ip6_next_header (ip0);
201               u32 ext_hdr_len = 0;
202               ext_hdr_len = ip6_ext_header_len ((ip6_ext_header_t *) ext_hdr);
203               len_bytes += ext_hdr_len;
204               next_hdr = ext_hdr->next_hdr;
205               ext_hdr->next_hdr = IPPROTO_IPV6_ROUTE;
206               ip_next_hdr = IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS;
207               /* Skip forward to the punch-in point */
208               vlib_buffer_advance (b0, ext_hdr_len);
209
210             }
211
212           orig_mb0 = rte_mbuf_from_vlib_buffer (b0);
213
214           i16 delta0 = vlib_buffer_length_in_chain (vm, orig_b0)
215             - (i16) orig_mb0->pkt_len;
216
217           u16 new_data_len0 = (u16) ((i16) orig_mb0->data_len + delta0);
218           u16 new_pkt_len0 = (u16) ((i16) orig_mb0->pkt_len + delta0);
219
220           orig_mb0->data_len = new_data_len0;
221           orig_mb0->pkt_len = new_pkt_len0;
222           orig_mb0->data_off += (u16) (b0->current_data);
223
224           /*
225              Before entering loop determine if we can allocate:
226              - all the new HEADER RTE_MBUFs and assign them to a vector
227              - all the clones
228
229              if successful, then iterate over vectors of resources
230
231            */
232           num_replicas = vec_len (pol0->tunnel_indices);
233
234           if (PREDICT_FALSE (num_replicas == 0))
235             {
236               b0->error = node->errors[SR_REPLICATE_ERROR_NO_REPLICAS];
237               goto do_trace0;
238             }
239
240           vec_reset_length (hdr_vec);
241           vec_reset_length (rte_mbuf_vec);
242
243           for (i = 0; i < num_replicas; i++)
244             {
245               uint8_t nb_seg;
246               struct rte_mbuf *clone0i;
247               vlib_buffer_t *clone0_c, *clone_b0;
248
249               t0 = vec_elt_at_index (sm->tunnels, pol0->tunnel_indices[i]);
250               hdr_mb0 = rte_pktmbuf_alloc (dm->pktmbuf_pools[socket_id]);
251
252               if (i < (num_replicas - 1))
253                 {
254                   /* Not the last tunnel to process */
255                   clone0 = rte_pktmbuf_clone
256                     (orig_mb0, dm->pktmbuf_pools[socket_id]);
257                   if (clone0 == 0)
258                     goto clone_fail;
259                   nb_seg = 0;
260                   clone0i = clone0;
261                   clone0_c = NULL;
262                   while ((clone0->nb_segs >= 1) && (nb_seg < clone0->nb_segs))
263                     {
264
265                       clone_b0 = vlib_buffer_from_rte_mbuf (clone0i);
266                       vlib_buffer_init_for_free_list (clone_b0, fl);
267
268                       ASSERT ((clone_b0->flags & VLIB_BUFFER_NEXT_PRESENT) ==
269                               0);
270                       ASSERT (clone_b0->current_data == 0);
271
272                       clone_b0->current_data =
273                         (clone0i->buf_addr + clone0i->data_off) -
274                         (void *) clone_b0->data;
275
276                       clone_b0->current_length = clone0i->data_len;
277                       if (PREDICT_FALSE (clone0_c != NULL))
278                         {
279                           clone0_c->flags |= VLIB_BUFFER_NEXT_PRESENT;
280                           clone0_c->next_buffer =
281                             vlib_get_buffer_index (vm, clone_b0);
282                         }
283                       clone0_c = clone_b0;
284                       clone0i = clone0i->next;
285                       nb_seg++;
286                     }
287                 }
288               else
289                 /* First tunnel to process, use original MB */
290                 clone0 = orig_mb0;
291
292
293               if (PREDICT_FALSE (!clone0 || !hdr_mb0))
294                 {
295                 clone_fail:
296                   b0->error = node->errors[SR_REPLICATE_ERROR_NO_BUFFERS];
297
298                   vec_foreach_index (i, rte_mbuf_vec)
299                   {
300                     rte_pktmbuf_free (rte_mbuf_vec[i]);
301                   }
302                   vec_free (rte_mbuf_vec);
303
304                   vec_foreach_index (i, hdr_vec)
305                   {
306                     rte_pktmbuf_free (hdr_vec[i]);
307                   }
308                   vec_free (hdr_vec);
309
310                   goto do_trace0;
311                 }
312
313               vec_add1 (hdr_vec, hdr_mb0);
314               vec_add1 (rte_mbuf_vec, clone0);
315
316             }
317
318           for (i = 0; i < num_replicas; i++)
319             {
320               vlib_buffer_t *hdr_b0;
321               u16 new_l0 = 0;
322
323               t0 = vec_elt_at_index (sm->tunnels, pol0->tunnel_indices[i]);
324               /* Our replicas */
325               hdr_mb0 = hdr_vec[i];
326               clone0 = rte_mbuf_vec[i];
327
328               hdr_mb0->data_len = len_bytes + vec_len (t0->rewrite);
329               hdr_mb0->pkt_len = hdr_mb0->data_len +
330                 vlib_buffer_length_in_chain (vm, orig_b0);
331
332               hdr_b0 = vlib_buffer_from_rte_mbuf (hdr_mb0);
333
334               vlib_buffer_init_for_free_list (hdr_b0, fl);
335
336               memcpy (hdr_b0->data, ip0, len_bytes);
337               memcpy (hdr_b0->data + len_bytes, t0->rewrite,
338                       vec_len (t0->rewrite));
339
340               hdr_b0->current_data = 0;
341               hdr_b0->current_length = len_bytes + vec_len (t0->rewrite);
342               hdr_b0->flags = orig_b0->flags | VLIB_BUFFER_NEXT_PRESENT;
343               hdr_b0->trace_index = orig_b0->trace_index;
344               vnet_buffer (hdr_b0)->l2_classify.opaque_index = 0;
345
346               hdr_b0->total_length_not_including_first_buffer =
347                 hdr_mb0->pkt_len - hdr_b0->current_length;
348               vnet_buffer (hdr_b0)->sw_if_index[VLIB_TX] = t0->tx_fib_index;
349
350               hdr_ip0 = (ip6_header_t *) hdr_b0->data;
351               new_l0 = clib_net_to_host_u16 (ip0->payload_length) +
352                 vec_len (t0->rewrite);
353               hdr_ip0->payload_length = clib_host_to_net_u16 (new_l0);
354               hdr_sr0 = (ip6_sr_header_t *) ((u8 *) hdr_ip0 + len_bytes);
355               /* $$$ tune */
356               clib_memcpy (hdr_sr0, t0->rewrite, vec_len (t0->rewrite));
357               hdr_sr0->protocol = next_hdr;
358               hdr_ip0->protocol = ip_next_hdr;
359
360               /* Copy dst address into the DA slot in the segment list */
361               clib_memcpy (hdr_sr0->segments, ip0->dst_address.as_u64,
362                            sizeof (ip6_address_t));
363
364               /* Rewrite the ip6 dst address */
365               hdr_ip0->dst_address.as_u64[0] = t0->first_hop.as_u64[0];
366               hdr_ip0->dst_address.as_u64[1] = t0->first_hop.as_u64[1];
367
368               sr_fix_hmac (sm, hdr_ip0, hdr_sr0);
369
370               /* prepend new header to invariant piece */
371               hdr_mb0->next = clone0;
372               hdr_b0->next_buffer =
373                 vlib_get_buffer_index (vm,
374                                        vlib_buffer_from_rte_mbuf (clone0));
375
376               /* update header's fields */
377               hdr_mb0->pkt_len =
378                 (uint16_t) (hdr_mb0->data_len + clone0->pkt_len);
379               hdr_mb0->nb_segs = (uint8_t) (clone0->nb_segs + 1);
380
381               /* copy metadata from source packet */
382               hdr_mb0->port = clone0->port;
383               hdr_mb0->vlan_tci = clone0->vlan_tci;
384               hdr_mb0->vlan_tci_outer = clone0->vlan_tci_outer;
385               hdr_mb0->tx_offload = clone0->tx_offload;
386               hdr_mb0->hash = clone0->hash;
387
388               hdr_mb0->ol_flags = clone0->ol_flags & ~(IND_ATTACHED_MBUF);
389
390               __rte_mbuf_sanity_check (hdr_mb0, 1);
391
392               hdr_bi0 = vlib_get_buffer_index (vm, hdr_b0);
393
394               to_next[0] = hdr_bi0;
395               to_next += 1;
396               n_left_to_next -= 1;
397
398               if (n_left_to_next == 0)
399                 {
400                   vlib_put_next_frame (vm, node, next_index, n_left_to_next);
401                   vlib_get_next_frame (vm, node, next_index,
402                                        to_next, n_left_to_next);
403
404                 }
405               pkts_replicated++;
406             }
407
408           from += 1;
409           n_left_from -= 1;
410
411         do_trace0:
412           if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
413             {
414               sr_replicate_trace_t *tr = vlib_add_trace (vm, node,
415                                                          b0, sizeof (*tr));
416               tr->tunnel_index = t0 - sm->tunnels;
417               tr->length = 0;
418               if (hdr_ip0)
419                 {
420                   memcpy (tr->src.as_u8, hdr_ip0->src_address.as_u8,
421                           sizeof (tr->src.as_u8));
422                   memcpy (tr->dst.as_u8, hdr_ip0->dst_address.as_u8,
423                           sizeof (tr->dst.as_u8));
424                   if (hdr_ip0->payload_length)
425                     tr->length = clib_net_to_host_u16
426                       (hdr_ip0->payload_length);
427                 }
428               tr->next_index = next_index;
429               if (hdr_sr0)
430                 memcpy (tr->sr, hdr_sr0, sizeof (tr->sr));
431             }
432
433         }
434
435       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
436     }
437
438   vlib_node_increment_counter (vm, sr_replicate_node.index,
439                                SR_REPLICATE_ERROR_REPLICATED,
440                                pkts_replicated);
441
442   vlib_node_increment_counter (vm, sr_replicate_node.index,
443                                SR_REPLICATE_ERROR_NO_BUFFER_DROPS,
444                                no_buffer_drops);
445
446   return frame->n_vectors;
447 }
448
449 /* *INDENT-OFF* */
450 VLIB_REGISTER_NODE (sr_replicate_node) = {
451   .function = sr_replicate_node_fn,
452   .name = "sr-replicate",
453   .vector_size = sizeof (u32),
454   .format_trace = format_sr_replicate_trace,
455   .type = VLIB_NODE_TYPE_INTERNAL,
456
457   .n_errors = ARRAY_LEN(sr_replicate_error_strings),
458   .error_strings = sr_replicate_error_strings,
459
460   .n_next_nodes = SR_REPLICATE_N_NEXT,
461
462   .next_nodes = {
463         [SR_REPLICATE_NEXT_IP6_LOOKUP] = "ip6-lookup",
464   },
465 };
466
467 VLIB_NODE_FUNCTION_MULTIARCH (sr_replicate_node, sr_replicate_node_fn)
468 /* *INDENT-ON* */
469
470 clib_error_t *
471 sr_replicate_init (vlib_main_t * vm)
472 {
473   sr_replicate_main_t *msm = &sr_replicate_main;
474
475   msm->vlib_main = vm;
476   msm->vnet_main = vnet_get_main ();
477
478   return 0;
479 }
480
481 VLIB_INIT_FUNCTION (sr_replicate_init);
482
483 #endif /* DPDK */
484
485 /*
486  * fd.io coding-style-patch-verification: ON
487  *
488  * Local Variables:
489  * eval: (c-set-style "gnu")
490  * End:
491  */