2 * Copyright (c) 2020 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
16 #include <vnet/fib/fib_source.h>
17 #include <vnet/fib/fib_table.h>
18 #include <vnet/fib/fib_entry_track.h>
19 #include <vnet/dpo/load_balance.h>
20 #include <vnet/dpo/drop_dpo.h>
21 #include <vnet/dpo/dpo.h>
23 #include <cnat/cnat_translation.h>
24 #include <cnat/cnat_maglev.h>
25 #include <cnat/cnat_session.h>
26 #include <cnat/cnat_client.h>
28 cnat_translation_t *cnat_translation_pool;
29 clib_bihash_8_8_t cnat_translation_db;
30 addr_resolution_t *tr_resolutions;
31 cnat_if_addr_add_cb_t *cnat_if_addr_add_cbs;
33 static fib_node_type_t cnat_translation_fib_node_type;
35 vlib_combined_counter_main_t cnat_translation_counters = {
36 .name = "cnat-translation",
37 .stat_segment_name = "/net/cnat-translation",
41 cnat_translation_watch_addr (index_t cti, u64 opaque, cnat_endpoint_t * ep,
42 cnat_addr_resol_type_t type)
44 addr_resolution_t *ar;
46 if (INDEX_INVALID == ep->ce_sw_if_index)
49 pool_get (tr_resolutions, ar);
50 ar->af = ep->ce_ip.version;
51 ar->sw_if_index = ep->ce_sw_if_index;
58 cnat_resolve_ep_tuple (cnat_endpoint_tuple_t * path)
60 cnat_resolve_ep (&path->src_ep);
61 cnat_resolve_ep (&path->dst_ep);
65 cnat_translation_unwatch_addr (u32 cti, cnat_addr_resol_type_t type)
67 /* Delete tr resolution entries matching translation index */
68 addr_resolution_t *ar;
69 index_t *indexes = 0, *ari;
70 pool_foreach (ar, tr_resolutions)
72 if ((cti == INDEX_INVALID || ar->cti == cti) &&
73 (ar->type == type || CNAT_RESOLV_ADDR_ANY == type))
74 vec_add1 (indexes, ar - tr_resolutions);
76 vec_foreach (ari, indexes) pool_put_index (tr_resolutions, *ari);
82 cnat_tracker_release (cnat_ep_trk_t * trk)
84 /* We only track fully resolved endpoints */
85 if (!(trk->ct_flags & CNAT_TRK_ACTIVE))
87 dpo_reset (&trk->ct_dpo); // undo fib_entry_contribute_forwarding
88 fib_entry_untrack (trk->ct_fei, trk->ct_sibling);
92 cnat_tracker_track (index_t cti, cnat_ep_trk_t * trk)
95 /* We only track fully resolved endpoints */
96 if (trk->ct_ep[VLIB_TX].ce_flags & CNAT_EP_FLAG_RESOLVED &&
97 trk->ct_ep[VLIB_RX].ce_flags & CNAT_EP_FLAG_RESOLVED)
98 trk->ct_flags |= CNAT_TRK_ACTIVE;
101 trk->ct_flags &= ~CNAT_TRK_ACTIVE;
105 ip_address_to_fib_prefix (&trk->ct_ep[VLIB_TX].ce_ip, &pfx);
106 trk->ct_fei = fib_entry_track (CNAT_FIB_TABLE,
108 cnat_translation_fib_node_type,
109 cti, &trk->ct_sibling);
111 fib_entry_contribute_forwarding (trk->ct_fei,
112 fib_forw_chain_type_from_fib_proto
113 (pfx.fp_proto), &trk->ct_dpo);
117 format_cnat_lb_type (u8 *s, va_list *args)
119 cnat_lb_type_t lb_type = va_arg (*args, int);
120 if (CNAT_LB_DEFAULT == lb_type)
121 s = format (s, "default");
122 else if (CNAT_LB_MAGLEV == lb_type)
123 s = format (s, "maglev");
125 s = format (s, "unknown");
130 unformat_cnat_lb_type (unformat_input_t *input, va_list *args)
132 cnat_lb_type_t *a = va_arg (*args, cnat_lb_type_t *);
133 if (unformat (input, "default"))
134 *a = CNAT_LB_DEFAULT;
135 else if (unformat (input, "maglev"))
143 * Add a translation to the bihash
145 * @param cci the ID of the parent client (invalid if vip not resolved)
146 * @param vip the translation endpoint
147 * @param proto the translation proto
148 * @param cti the translation index to be used as value
151 cnat_add_translation_to_db (index_t cci, cnat_endpoint_t * vip,
152 ip_protocol_t proto, index_t cti)
154 clib_bihash_kv_8_8_t bkey;
156 if (INDEX_INVALID == cci)
158 key = proto << 8 | 0x80 | vip->ce_ip.version;
159 key = key << 16 | vip->ce_port;
160 key = key << 32 | (u32) vip->ce_sw_if_index;
165 key = key << 16 | vip->ce_port;
166 key = key << 32 | (u32) cci;
172 clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 1);
176 * Remove a translation from the bihash
178 * @param cci the ID of the parent client
179 * @param vip the translation endpoint
180 * @param proto the translation proto
183 cnat_remove_translation_from_db (index_t cci, cnat_endpoint_t * vip,
186 clib_bihash_kv_8_8_t bkey;
188 if (INDEX_INVALID == cci)
190 key = proto << 8 | 0x80 | vip->ce_ip.version;
191 key = key << 16 | vip->ce_port;
192 key = key << 32 | (u32) vip->ce_sw_if_index;
197 key = key << 16 | vip->ce_port;
198 key = key << 32 | (u32) cci;
203 clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 0);
209 cnat_translation_stack (cnat_translation_t * ct)
211 fib_protocol_t fproto;
217 fproto = ip_address_family_to_fib_proto (ct->ct_vip.ce_ip.version);
218 dproto = fib_proto_to_dpo (fproto);
220 vec_reset_length (ct->ct_active_paths);
222 vec_foreach (trk, ct->ct_paths)
223 if (trk->ct_flags & CNAT_TRK_ACTIVE)
224 vec_add1 (ct->ct_active_paths, *trk);
226 flow_hash_config_t fhc = IP_FLOW_HASH_DEFAULT;
229 lbi = load_balance_create (vec_len (ct->ct_active_paths),
230 fib_proto_to_dpo (fproto), fhc);
233 vec_foreach (trk, ct->ct_active_paths)
234 load_balance_set_bucket (lbi, ep_idx++, &trk->ct_dpo);
236 if (ep_idx > 0 && CNAT_LB_MAGLEV == ct->lb_type)
237 cnat_translation_init_maglev (ct);
239 dpo_set (&ct->ct_lb, DPO_LOAD_BALANCE, dproto, lbi);
240 dpo_stack (cnat_client_dpo, dproto, &ct->ct_lb, &ct->ct_lb);
241 ct->flags |= CNAT_TR_FLAG_STACKED;
245 cnat_translation_delete (u32 id)
247 cnat_translation_t *ct;
250 if (pool_is_free_index (cnat_translation_pool, id))
251 return (VNET_API_ERROR_NO_SUCH_ENTRY);
253 ct = pool_elt_at_index (cnat_translation_pool, id);
255 dpo_reset (&ct->ct_lb);
257 vec_foreach (trk, ct->ct_active_paths)
258 cnat_tracker_release (trk);
260 cnat_remove_translation_from_db (ct->ct_cci, &ct->ct_vip, ct->ct_proto);
261 cnat_client_translation_deleted (ct->ct_cci);
262 cnat_translation_unwatch_addr (id, CNAT_RESOLV_ADDR_ANY);
263 pool_put (cnat_translation_pool, ct);
269 cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
270 cnat_endpoint_tuple_t *paths, u8 flags,
271 cnat_lb_type_t lb_type, flow_hash_config_t fhc)
273 const dpo_id_t tmp = DPO_INVALID;
274 cnat_endpoint_tuple_t *path;
275 const cnat_client_t *cc;
276 cnat_translation_t *ct;
281 if (cnat_resolve_ep (vip))
283 /* vip only contains a sw_if_index for now */
284 ct = cnat_find_translation (vip->ce_sw_if_index, vip->ce_port, proto);
289 /* do we know of this ep's vip */
290 cci = cnat_client_add (&vip->ce_ip, flags);
291 cc = cnat_client_get (cci);
293 ct = cnat_find_translation (cc->parent_cci, vip->ce_port, proto);
298 pool_get_zero (cnat_translation_pool, ct);
300 clib_memcpy (&ct->ct_vip, vip, sizeof (*vip));
301 ct->ct_proto = proto;
303 ct->index = ct - cnat_translation_pool;
304 ct->lb_type = lb_type;
307 cnat_add_translation_to_db (cci, vip, proto, ct->index);
308 cnat_client_translation_added (cci);
310 vlib_validate_combined_counter (&cnat_translation_counters, ct->index);
311 vlib_zero_combined_counter (&cnat_translation_counters, ct->index);
315 cnat_translation_unwatch_addr (ct->index, CNAT_RESOLV_ADDR_ANY);
316 cnat_translation_watch_addr (ct->index, 0, vip,
317 CNAT_RESOLV_ADDR_TRANSLATION);
319 vec_foreach (trk, ct->ct_paths)
321 cnat_tracker_release (trk);
324 vec_reset_length (ct->ct_paths);
325 ct->flags &= ~CNAT_TR_FLAG_STACKED;
328 vec_foreach (path, paths)
330 cnat_resolve_ep_tuple (path);
331 cnat_translation_watch_addr (ct->index,
332 path_idx << 32 | VLIB_RX, &path->src_ep,
333 CNAT_RESOLV_ADDR_BACKEND);
334 cnat_translation_watch_addr (ct->index,
335 path_idx << 32 | VLIB_TX, &path->dst_ep,
336 CNAT_RESOLV_ADDR_BACKEND);
339 vec_add2 (ct->ct_paths, trk, 1);
341 clib_memcpy (&trk->ct_ep[VLIB_TX], &path->dst_ep,
342 sizeof (trk->ct_ep[VLIB_TX]));
343 clib_memcpy (&trk->ct_ep[VLIB_RX], &path->src_ep,
344 sizeof (trk->ct_ep[VLIB_RX]));
345 trk->ct_flags = path->ep_flags;
348 cnat_tracker_track (ct->index, trk);
351 cnat_translation_stack (ct);
357 cnat_translation_walk (cnat_translation_walk_cb_t cb, void *ctx)
361 pool_foreach_index (api, cnat_translation_pool)
369 format_cnat_ep_trk (u8 * s, va_list * args)
371 cnat_ep_trk_t *ck = va_arg (*args, cnat_ep_trk_t *);
372 u32 indent = va_arg (*args, u32);
374 s = format (s, "%U->%U", format_cnat_endpoint, &ck->ct_ep[VLIB_RX],
375 format_cnat_endpoint, &ck->ct_ep[VLIB_TX]);
376 s = format (s, "\n%Ufib-entry:%d", format_white_space, indent, ck->ct_fei);
377 s = format (s, "\n%U%U",
378 format_white_space, indent, format_dpo_id, &ck->ct_dpo, 6);
384 format_cnat_translation (u8 * s, va_list * args)
386 cnat_translation_t *ct = va_arg (*args, cnat_translation_t *);
387 cnat_main_t *cm = &cnat_main;
390 s = format (s, "[%d] ", ct->index);
391 s = format (s, "%U %U ", format_cnat_endpoint, &ct->ct_vip,
392 format_ip_protocol, ct->ct_proto);
393 s = format (s, "lb:%U ", format_cnat_lb_type, ct->lb_type);
395 if ((ct->fhc == 0) || (ct->fhc == IP_FLOW_HASH_DEFAULT))
396 s = format (s, "fhc:0x%x(default)", IP_FLOW_HASH_DEFAULT);
398 s = format (s, "fhc:0x%x", ct->fhc);
400 vec_foreach (ck, ct->ct_paths)
401 s = format (s, "\n%U", format_cnat_ep_trk, ck, 2);
403 /* If printing a trace, the LB object might be deleted */
404 if (!pool_is_free_index (load_balance_pool, ct->ct_lb.dpoi_index))
406 s = format (s, "\n via:");
407 s = format (s, "\n%U%U",
408 format_white_space, 2, format_dpo_id, &ct->ct_lb, 2);
412 if (CNAT_LB_MAGLEV == ct->lb_type)
414 s = format (s, "\nmaglev backends map");
415 uword *bitmap = NULL;
416 clib_bitmap_alloc (bitmap, cm->maglev_len);
417 vec_foreach (ck, ct->ct_paths)
419 clib_bitmap_zero (bitmap);
420 for (u32 i = 0; i < vec_len (ct->lb_maglev); i++)
421 if (ct->lb_maglev[i] == bid)
422 clib_bitmap_set (bitmap, i, 1);
423 s = format (s, "\n backend#%d: %U", bid, format_bitmap_hex, bitmap);
427 clib_bitmap_free (bitmap);
433 static clib_error_t *
434 cnat_translation_show (vlib_main_t * vm,
435 unformat_input_t * input, vlib_cli_command_t * cmd)
438 cnat_translation_t *ct;
442 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
444 if (unformat (input, "%d", &cti))
447 return (clib_error_return (0, "unknown input '%U'",
448 format_unformat_error, input));
451 if (INDEX_INVALID == cti)
453 pool_foreach_index (cti, cnat_translation_pool)
455 ct = pool_elt_at_index (cnat_translation_pool, cti);
456 vlib_cli_output(vm, "%U", format_cnat_translation, ct);
461 vlib_cli_output (vm, "Invalid policy ID:%d", cti);
468 cnat_translation_purge (void)
470 /* purge all the translations */
471 index_t tri, *trp, *trs = NULL;
473 pool_foreach_index (tri, cnat_translation_pool)
478 vec_foreach (trp, trs) cnat_translation_delete (*trp);
480 ASSERT (0 == pool_elts (cnat_translation_pool));
487 VLIB_CLI_COMMAND (cnat_translation_show_cmd_node, static) = {
488 .path = "show cnat translation",
489 .function = cnat_translation_show,
490 .short_help = "show cnat translation <VIP>",
495 cnat_translation_get_node (fib_node_index_t index)
497 cnat_translation_t *ct = cnat_translation_get (index);
498 return (&(ct->ct_node));
501 static cnat_translation_t *
502 cnat_translation_get_from_node (fib_node_t * node)
504 return ((cnat_translation_t *) (((char *) node) -
505 STRUCT_OFFSET_OF (cnat_translation_t,
510 cnat_translation_last_lock_gone (fib_node_t * node)
515 * A back walk has reached this ABF policy
517 static fib_node_back_walk_rc_t
518 cnat_translation_back_walk_notify (fib_node_t * node,
519 fib_node_back_walk_ctx_t * ctx)
522 * re-stack the fmask on the n-eos of the via
524 cnat_translation_t *ct = cnat_translation_get_from_node (node);
526 /* If we have more than FIB_PATH_LIST_POPULAR paths
527 * we might get called during path tracking
528 * (cnat_tracker_track) */
529 if (!(ct->flags & CNAT_TR_FLAG_STACKED))
530 return (FIB_NODE_BACK_WALK_CONTINUE);
532 cnat_translation_stack (ct);
534 return (FIB_NODE_BACK_WALK_CONTINUE);
538 * The translation's graph node virtual function table
540 static const fib_node_vft_t cnat_translation_vft = {
541 .fnv_get = cnat_translation_get_node,
542 .fnv_last_lock = cnat_translation_last_lock_gone,
543 .fnv_back_walk = cnat_translation_back_walk_notify,
546 static clib_error_t *
547 cnat_translation_cli_add_del (vlib_main_t * vm,
548 unformat_input_t * input,
549 vlib_cli_command_t * cmd)
551 u32 del_index = INDEX_INVALID;
552 ip_protocol_t proto = IP_PROTOCOL_TCP;
554 u8 flags = CNAT_FLAG_EXCLUSIVE;
555 cnat_endpoint_tuple_t tmp, *paths = NULL, *path;
556 unformat_input_t _line_input, *line_input = &_line_input;
558 cnat_lb_type_t lb_type;
560 /* Get a line of input. */
561 if (!unformat_user (input, unformat_line_input, line_input))
564 while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
566 if (unformat (line_input, "add"))
567 del_index = INDEX_INVALID;
568 else if (unformat (line_input, "del %d", &del_index))
571 if (unformat (line_input, "proto %U", unformat_ip_protocol, &proto))
573 else if (unformat (line_input, "vip %U", unformat_cnat_ep, &vip))
574 flags = CNAT_FLAG_EXCLUSIVE;
575 else if (unformat (line_input, "real %U", unformat_cnat_ep, &vip))
577 else if (unformat (line_input, "to %U", unformat_cnat_ep_tuple, &tmp))
579 vec_add2 (paths, path, 1);
580 clib_memcpy (path, &tmp, sizeof (cnat_endpoint_tuple_t));
582 else if (unformat (line_input, "%U", unformat_cnat_lb_type, &lb_type))
586 e = clib_error_return (0, "unknown input '%U'",
587 format_unformat_error, line_input);
592 flow_hash_config_t fhc = 0;
593 if (INDEX_INVALID == del_index)
594 cnat_translation_update (&vip, proto, paths, flags, lb_type, fhc);
596 cnat_translation_delete (del_index);
600 unformat_free (line_input);
604 VLIB_CLI_COMMAND (cnat_translation_cli_add_del_command, static) =
606 .path = "cnat translation",
607 .short_help = "cnat translation [add|del] proto [TCP|UDP] [vip|real] [ip|sw_if_index [v6]] [port] [to [ip|sw_if_index [v6]] [port]->[ip|sw_if_index [v6]] [port]]",
608 .function = cnat_translation_cli_add_del,
612 cnat_if_addr_add_del_translation_cb (addr_resolution_t * ar,
613 ip_address_t * address, u8 is_del)
615 cnat_translation_t *ct;
616 ct = cnat_translation_get (ar->cti);
617 if (!is_del && ct->ct_vip.ce_flags & CNAT_EP_FLAG_RESOLVED)
620 cnat_remove_translation_from_db (ct->ct_cci, &ct->ct_vip, ct->ct_proto);
624 ct->ct_vip.ce_flags &= ~CNAT_EP_FLAG_RESOLVED;
625 ct->ct_cci = INDEX_INVALID;
626 cnat_client_translation_deleted (ct->ct_cci);
627 /* Are there remaining addresses ? */
628 if (0 == cnat_resolve_addr (ar->sw_if_index, ar->af, address))
634 ct->ct_cci = cnat_client_add (address, ct->flags);
635 cnat_client_translation_added (ct->ct_cci);
636 ip_address_copy (&ct->ct_vip.ce_ip, address);
637 ct->ct_vip.ce_flags |= CNAT_EP_FLAG_RESOLVED;
640 cnat_add_translation_to_db (ct->ct_cci, &ct->ct_vip, ct->ct_proto,
645 cnat_if_addr_add_del_backend_cb (addr_resolution_t * ar,
646 ip_address_t * address, u8 is_del)
648 cnat_translation_t *ct;
652 u8 direction = ar->opaque & 0xf;
653 u32 path_idx = ar->opaque >> 32;
655 ct = cnat_translation_get (ar->cti);
657 trk = &ct->ct_paths[path_idx];
658 ep = &trk->ct_ep[direction];
660 if (!is_del && ep->ce_flags & CNAT_EP_FLAG_RESOLVED)
663 ASSERT (ep->ce_sw_if_index == ar->sw_if_index);
667 ep->ce_flags &= ~CNAT_EP_FLAG_RESOLVED;
668 /* Are there remaining addresses ? */
669 if (0 == cnat_resolve_addr (ar->sw_if_index, ar->af, address))
675 ip_address_copy (&ep->ce_ip, address);
676 ep->ce_flags |= CNAT_EP_FLAG_RESOLVED;
679 ct->flags &= ~CNAT_TR_FLAG_STACKED;
680 cnat_tracker_track (ar->cti, trk);
682 cnat_translation_stack (ct);
683 ct->flags |= CNAT_TR_FLAG_STACKED;
687 cnat_if_addr_add_del_callback (u32 sw_if_index, ip_address_t * address,
690 addr_resolution_t *ar;
691 pool_foreach (ar, tr_resolutions)
693 if (ar->sw_if_index != sw_if_index)
695 if (ar->af != ip_addr_version (address))
697 cnat_if_addr_add_cbs[ar->type](ar, address, is_del);
702 cnat_ip6_if_addr_add_del_callback (struct ip6_main_t *im,
703 uword opaque, u32 sw_if_index,
704 ip6_address_t * address,
705 u32 address_length, u32 if_address_index,
709 ip_address_set (&addr, address, AF_IP6);
710 cnat_if_addr_add_del_callback (sw_if_index, &addr, is_del);
714 cnat_ip4_if_addr_add_del_callback (struct ip4_main_t *im,
715 uword opaque, u32 sw_if_index,
716 ip4_address_t * address,
717 u32 address_length, u32 if_address_index,
721 ip_address_set (&addr, address, AF_IP4);
722 cnat_if_addr_add_del_callback (sw_if_index, &addr, is_del);
726 cnat_translation_register_addr_add_cb (cnat_addr_resol_type_t typ,
727 cnat_if_addr_add_cb_t fn)
729 vec_validate (cnat_if_addr_add_cbs, CNAT_ADDR_N_RESOLUTIONS);
730 cnat_if_addr_add_cbs[typ] = fn;
733 static clib_error_t *
734 cnat_translation_init (vlib_main_t * vm)
736 ip4_main_t *i4m = &ip4_main;
737 ip6_main_t *i6m = &ip6_main;
738 cnat_main_t *cm = &cnat_main;
739 cnat_translation_fib_node_type =
740 fib_node_register_new_type ("cnat-translation", &cnat_translation_vft);
742 clib_bihash_init_8_8 (&cnat_translation_db, "CNat translation DB",
743 cm->translation_hash_buckets,
744 cm->translation_hash_memory);
746 ip4_add_del_interface_address_callback_t cb4 = { 0 };
747 cb4.function = cnat_ip4_if_addr_add_del_callback;
748 vec_add1 (i4m->add_del_interface_address_callbacks, cb4);
750 ip6_add_del_interface_address_callback_t cb6 = { 0 };
751 cb6.function = cnat_ip6_if_addr_add_del_callback;
752 vec_add1 (i6m->add_del_interface_address_callbacks, cb6);
754 cnat_translation_register_addr_add_cb (CNAT_RESOLV_ADDR_BACKEND,
755 cnat_if_addr_add_del_backend_cb);
756 cnat_translation_register_addr_add_cb (CNAT_RESOLV_ADDR_TRANSLATION,
757 cnat_if_addr_add_del_translation_cb);
762 VLIB_INIT_FUNCTION (cnat_translation_init);
765 * fd.io coding-style-patch-verification: ON
768 * eval: (c-set-style "gnu")