From 4d237874e5c9922330c62ac1b003a9a171c1bc3b Mon Sep 17 00:00:00 2001
From: Nathan Skrzypczak
Date: Thu, 25 Feb 2021 11:14:53 +0100
Subject: [PATCH] cnat: Add maglev support

* Backend choice in translations is controlled by lb_type
  switch allowing to enable Maglev.
* Size of pool is set with cnat { maglev-len 1009 }

Type: feature

Change-Id: I956e19d70bc9f3b997b4f8042831164e4b559d17
Signed-off-by: Nathan Skrzypczak
---
 src/plugins/cnat/cnat.api           |  18 +++-
 src/plugins/cnat/cnat_api.c         |   8 +-
 src/plugins/cnat/cnat_node.h        |  36 ++++++++
 src/plugins/cnat/cnat_node_vip.c    |  19 +---
 src/plugins/cnat/cnat_session.h     |   2 +
 src/plugins/cnat/cnat_translation.c | 179 ++++++++++++++++++++++++++++++++++--
 src/plugins/cnat/cnat_translation.h |  23 ++++-
 src/plugins/cnat/cnat_types.c       |   3 +
 src/plugins/cnat/cnat_types.h       |   8 ++
 9 files changed, 271 insertions(+), 25 deletions(-)

diff --git a/src/plugins/cnat/cnat.api b/src/plugins/cnat/cnat.api
index 685f9e17146..2b79e0d1b8b 100644
--- a/src/plugins/cnat/cnat.api
+++ b/src/plugins/cnat/cnat.api
@@ -19,7 +19,7 @@
     used to control the ABF plugin
 */
 
-option version = "0.1.0";
+option version = "0.2.0";
 import "vnet/ip/ip_types.api";
 import "vnet/fib/fib_types.api";
 import "vnet/interface_types.api";
@@ -29,6 +29,20 @@ enum cnat_translation_flags:u8
   CNAT_TRANSLATION_ALLOC_PORT = 1,
 };
 
+enum cnat_endpoint_tuple_flags:u8
+{
+  /* Dont translate said endpoint tuple but
+   * still forward */
+  CNAT_EPT_NO_NAT = 1,
+};
+
+
+enum cnat_lb_type:u8
+{
+  CNAT_LB_TYPE_DEFAULT = 0,
+  CNAT_LB_TYPE_MAGLEV = 1,
+};
+
 /* An enpoint is either
  * An IP & a port
  * An interface, an address familiy and a port */
@@ -44,6 +58,7 @@ typedef cnat_endpoint_tuple
 {
   vl_api_cnat_endpoint_t dst_ep;
   vl_api_cnat_endpoint_t src_ep;
+  u8 flags;
 };
 
 typedef cnat_translation
@@ -53,6 +68,7 @@ typedef cnat_translation
   vl_api_ip_proto_t ip_proto;
   u8 is_real_ip;
   u8 flags;
+  vl_api_cnat_lb_type_t lb_type;
   u32 n_paths;
   vl_api_cnat_endpoint_tuple_t paths[n_paths];
 };
diff --git a/src/plugins/cnat/cnat_api.c b/src/plugins/cnat/cnat_api.c
index 1c6ef7b6cf4..99d9c729282 100644
--- a/src/plugins/cnat/cnat_api.c
+++ b/src/plugins/cnat/cnat_api.c
@@ -67,6 +67,7 @@ cnat_endpoint_tuple_decode (const vl_api_cnat_endpoint_tuple_t * in,
   if (rv)
     return rv;
   rv = cnat_endpoint_decode (&in->dst_ep, &out->dst_ep);
+  out->ep_flags = in->flags;
   return rv;
 }
 
@@ -95,6 +96,7 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t
   u8 flags;
   int rv = 0;
   u32 pi, n_paths;
+  cnat_lb_type_t lb_type;
 
   rv = ip_proto_decode (mp->translation.ip_proto, &ip_proto);
 
@@ -119,7 +121,9 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t
   flags = mp->translation.flags;
   if (!mp->translation.is_real_ip)
     flags |= CNAT_FLAG_EXCLUSIVE;
-  id = cnat_translation_update (&vip, ip_proto, paths, flags);
+
+  lb_type = (cnat_lb_type_t) mp->translation.lb_type;
+  id = cnat_translation_update (&vip, ip_proto, paths, flags, lb_type);
 
   vec_free (paths);
 
@@ -172,12 +176,14 @@ cnat_translation_send_details (u32 cti, void *args)
   mp->translation.id = clib_host_to_net_u32 (cti);
   cnat_endpoint_encode (&ct->ct_vip, &mp->translation.vip);
   mp->translation.ip_proto = ip_proto_encode (ct->ct_proto);
+  mp->translation.lb_type = (vl_api_cnat_lb_type_t) ct->lb_type;
 
   path = mp->translation.paths;
   vec_foreach (trk, ct->ct_paths)
   {
     cnat_endpoint_encode (&trk->ct_ep[VLIB_TX], &path->dst_ep);
     cnat_endpoint_encode (&trk->ct_ep[VLIB_RX], &path->src_ep);
+    path->flags = trk->ct_flags;
     path++;
   }
 
diff --git a/src/plugins/cnat/cnat_node.h b/src/plugins/cnat/cnat_node.h
index 56a6c612e1b..157287b0cab 100644
--- a/src/plugins/cnat/cnat_node.h
+++ b/src/plugins/cnat/cnat_node.h
@@ -803,6 +803,42 @@ error:
   return;
 }
 
+/**
+ * Select the backend of translation ct for a new flow.
+ * The flow hash of the ip4/ip6 header (chosen by af) picks either a
+ * maglev table entry or a load-balance bucket, depending on ct->lb_type.
+ * Returns NULL if the load-balance has no buckets; otherwise returns
+ * the chosen tracker and stores the bucket's DPO index in *dpoi_index.
+ */
+static_always_inline cnat_ep_trk_t *
+cnat_load_balance (const cnat_translation_t *ct, ip_address_family_t af,
+                   ip4_header_t *ip4, ip6_header_t *ip6, u32 *dpoi_index)
+{
+  cnat_main_t *cm = &cnat_main;
+  const load_balance_t *lb0;
+  const dpo_id_t *dpo0;
+  u32 hash_c0, bucket0;
+
+  lb0 = load_balance_get (ct->ct_lb.dpoi_index);
+  if (PREDICT_FALSE (!lb0->lb_n_buckets))
+    return (NULL);
+
+  /* session table miss */
+  hash_c0 = (AF_IP4 == af ? ip4_compute_flow_hash (ip4, lb0->lb_hash_config) :
+                            ip6_compute_flow_hash (ip6, lb0->lb_hash_config));
+
+  if (PREDICT_FALSE (ct->lb_type == CNAT_LB_MAGLEV))
+    bucket0 = ct->lb_maglev[hash_c0 % cm->maglev_len];
+  else
+    bucket0 = hash_c0 % lb0->lb_n_buckets;
+
+  dpo0 = load_balance_get_fwd_bucket (lb0, bucket0);
+
+  *dpoi_index = dpo0->dpoi_index;
+
+  return &ct->ct_active_paths[bucket0];
+}
+
 /**
  * Create NAT sessions
  * rsession_location is the location the (return) session will be
diff --git a/src/plugins/cnat/cnat_node_vip.c b/src/plugins/cnat/cnat_node_vip.c
index f653aa1e430..bc7d30369ab 100644
--- a/src/plugins/cnat/cnat_node_vip.c
+++ b/src/plugins/cnat/cnat_node_vip.c
@@ -109,14 +109,12 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
     }
 
   /* New flow, create the sessions */
-  const load_balance_t *lb0;
   cnat_ep_trk_t *trk0;
-  u32 hash_c0, bucket0;
   u32 rsession_flags = 0;
-  const dpo_id_t *dpo0;
+  u32 dpoi_index = -1;
 
-  lb0 = load_balance_get (ct->ct_lb.dpoi_index);
-  if (!lb0->lb_n_buckets)
+  trk0 = cnat_load_balance (ct, ctx->af, ip4, ip6, &dpoi_index);
+  if (PREDICT_FALSE (NULL == trk0))
     {
       /* Dont translate & Follow the fib programming */
       vnet_buffer (b)->ip.adj_index[VLIB_TX] = cc->cc_parent.dpoi_index;
@@ -124,16 +122,7 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
       goto trace;
     }
 
-  /* session table miss */
-  hash_c0 = (AF_IP4 == ctx->af ?
-             ip4_compute_flow_hash (ip4, lb0->lb_hash_config) :
-             ip6_compute_flow_hash (ip6, lb0->lb_hash_config));
-  bucket0 = hash_c0 % lb0->lb_n_buckets;
-  dpo0 = load_balance_get_fwd_bucket (lb0, bucket0);
-
   /* add the session */
-  trk0 = &ct->ct_paths[bucket0];
-
   ip46_address_copy (&session->value.cs_ip[VLIB_TX],
                      &trk0->ct_ep[VLIB_TX].ce_ip.ip);
   if (ip_address_is_zero (&trk0->ct_ep[VLIB_RX].ce_ip))
@@ -158,7 +147,7 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
     clib_host_to_net_u16 (trk0->ct_ep[VLIB_RX].ce_port);
 
   session->value.dpoi_next_node = ct->ct_lb.dpoi_next_node;
-  session->value.cs_lbi = dpo0->dpoi_index;
+  session->value.cs_lbi = dpoi_index;
 
   rv = cspm->vip_policy (vm, b, session, &rsession_flags, ct, ctx);
   if (CNAT_SOURCE_ERROR_USE_DEFAULT == rv)
diff --git a/src/plugins/cnat/cnat_session.h b/src/plugins/cnat/cnat_session.h
index 540a2f29409..51764504bca 100644
--- a/src/plugins/cnat/cnat_session.h
+++ b/src/plugins/cnat/cnat_session.h
@@ -122,6 +122,8 @@ typedef enum cnat_session_flag_t_
    * This session doesn't have a client, do not attempt to free it
    */
   CNAT_SESSION_FLAG_NO_CLIENT = (1 << 2),
+
+  CNAT_SESSION_FLAG_NO_NAT = (1 << 3),
 } cnat_session_flag_t;
 
 typedef enum cnat_session_location_t_
diff --git a/src/plugins/cnat/cnat_translation.c b/src/plugins/cnat/cnat_translation.c
index 65c44d80b19..8b7cf2451b7 100644
--- a/src/plugins/cnat/cnat_translation.c
+++ b/src/plugins/cnat/cnat_translation.c
@@ -113,6 +113,32 @@ cnat_tracker_track (index_t cti, cnat_ep_trk_t * trk)
                               (pfx.fp_proto), &trk->ct_dpo);
 }
 
+u8 *
+format_cnat_lb_type (u8 *s, va_list *args)
+{
+  cnat_lb_type_t lb_type = va_arg (*args, int);
+  if (CNAT_LB_DEFAULT == lb_type)
+    s = format (s, "default");
+  else if (CNAT_LB_MAGLEV == lb_type)
+    s = format (s, "maglev");
+  else
+    s = format (s, "unknown");
+  return (s);
+}
+
+uword
+unformat_cnat_lb_type (unformat_input_t *input, va_list *args)
+{
+  cnat_lb_type_t *a = va_arg (*args, cnat_lb_type_t *);
+  if (unformat (input, "default"))
+    *a = CNAT_LB_DEFAULT;
+  else if (unformat (input, "maglev"))
+    *a = CNAT_LB_MAGLEV;
+  else
+    return 0;
+  return 1;
+}
+
 /**
  * Add a translation to the bihash
  *
@@ -177,6 +203,120 @@ cnat_remove_translation_from_db (index_t cci, cnat_endpoint_t * vip,
   clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 0);
 }
 
+typedef struct
+{
+  cnat_ep_trk_t *trk;
+  u32 index;
+  u32 offset;
+  u32 skip;
+} cnat_maglev_entry_t;
+
+/**
+ * vec_sort_with_function() comparator for maglev entries: orders them by
+ * TX ip, TX port, RX ip, then RX port of their tracker.
+ */
+static int
+cnat_maglev_entry_compare (void *_a, void *_b)
+{
+  cnat_ep_trk_t *a = ((cnat_maglev_entry_t *) _a)->trk;
+  cnat_ep_trk_t *b = ((cnat_maglev_entry_t *) _b)->trk;
+  int rv = 0;
+  if ((rv =
+         ip_address_cmp (&a->ct_ep[VLIB_TX].ce_ip, &b->ct_ep[VLIB_TX].ce_ip)))
+    return rv;
+  if ((rv = a->ct_ep[VLIB_TX].ce_port - b->ct_ep[VLIB_TX].ce_port))
+    return rv;
+  if ((rv =
+         ip_address_cmp (&a->ct_ep[VLIB_RX].ce_ip, &b->ct_ep[VLIB_RX].ce_ip)))
+    return rv;
+  if ((rv = a->ct_ep[VLIB_RX].ce_port - b->ct_ep[VLIB_RX].ce_port))
+    return rv;
+  return 0;
+}
+
+/**
+ * Build ct->lb_maglev: cm->maglev_len slots are filled, each with an
+ * index into ct->ct_active_paths. Every active path gets an (offset, skip)
+ * pair hashed from its endpoints; backends then take turns claiming their
+ * next free slot (offset + n * skip) until all slots are taken.
+ * Does nothing when there is no active path.
+ */
+static void
+cnat_translation_init_maglev (cnat_translation_t *ct)
+{
+  cnat_maglev_entry_t *backends = NULL, *bk;
+  cnat_main_t *cm = &cnat_main;
+  u32 done = 0;
+  cnat_ep_trk_t *trk;
+  int ep_idx = 0;
+
+  vec_foreach (trk, ct->ct_active_paths)
+    {
+      cnat_maglev_entry_t bk;
+      u32 h1, h2;
+
+      if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip))
+        {
+          u32 a, b, c;
+          a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32;
+          b = trk->ct_ep[VLIB_TX].ce_port << 16 | trk->ct_ep[VLIB_RX].ce_port;
+          c = ip_addr_v4 (&trk->ct_ep[VLIB_RX].ce_ip).data_u32;
+          hash_v3_mix32 (a, b, c);
+          hash_v3_finalize32 (a, b, c);
+          h1 = c;
+          h2 = b;
+        }
+      else
+        {
+          u64 a, b, c;
+          a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0] ^
+              ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1];
+          b = trk->ct_ep[VLIB_TX].ce_port << 16 | trk->ct_ep[VLIB_RX].ce_port;
+          c = ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[0] ^
+              ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[1];
+          hash_mix64 (a, b, c);
+          h1 = c;
+          h2 = b;
+        }
+
+      bk.offset = h1 % cm->maglev_len;
+      bk.skip = h2 % (cm->maglev_len - 1) + 1;
+      bk.index = ep_idx++;
+      bk.trk = trk;
+      vec_add1 (backends, bk);
+    }
+
+  if (0 == ep_idx)
+    return;
+
+  vec_sort_with_function (backends, cnat_maglev_entry_compare);
+
+  /* Don't free if previous vector exists, just zero */
+  vec_validate (ct->lb_maglev, cm->maglev_len);
+  vec_set (ct->lb_maglev, -1);
+
+  while (1)
+    {
+      vec_foreach (bk, backends)
+        {
+          u32 next = 0;
+          u32 c = (bk->offset + next * bk->skip) % cm->maglev_len;
+          while (ct->lb_maglev[c] != (u32) -1)
+            {
+              next++;
+              c = (bk->offset + next * bk->skip) % cm->maglev_len;
+            }
+          ct->lb_maglev[c] = bk->index;
+          done++;
+          if (done == cm->maglev_len)
+            goto finished;
+        }
+    }
+
+finished:
+  vec_free (backends);
+}
+
 static void
 cnat_translation_stack (cnat_translation_t * ct)
 {
@@ -202,6 +342,9 @@ cnat_translation_stack (cnat_translation_t * ct)
   vec_foreach (trk, ct->ct_active_paths)
     load_balance_set_bucket (lbi, ep_idx++, &trk->ct_dpo);
 
+  if (ep_idx > 0 && CNAT_LB_MAGLEV == ct->lb_type)
+    cnat_translation_init_maglev (ct);
+
   dpo_set (&ct->ct_lb, DPO_LOAD_BALANCE, dproto, lbi);
   dpo_stack (cnat_client_dpo, dproto, &ct->ct_lb, &ct->ct_lb);
   ct->flags |= CNAT_TRANSLATION_STACKED;
@@ -232,9 +375,9 @@ cnat_translation_delete (u32 id)
 }
 
 u32
-cnat_translation_update (cnat_endpoint_t * vip,
-                         ip_protocol_t proto,
-                         cnat_endpoint_tuple_t * paths, u8 flags)
+cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
+                         cnat_endpoint_tuple_t *paths, u8 flags,
+                         cnat_lb_type_t lb_type)
 {
   cnat_endpoint_tuple_t *path;
   const cnat_client_t *cc;
@@ -266,6 +409,7 @@ cnat_translation_update (cnat_endpoint_t * vip,
       ct->ct_proto = proto;
       ct->ct_cci = cci;
       ct->index = ct - cnat_translation_pool;
+      ct->lb_type = lb_type;
 
       cnat_add_translation_to_db (cci, vip, proto, ct->index);
       cnat_client_translation_added (cci);
@@ -305,6 +449,7 @@ cnat_translation_update (cnat_endpoint_t * vip,
                  sizeof (trk->ct_ep[VLIB_TX]));
     clib_memcpy (&trk->ct_ep[VLIB_RX], &path->src_ep,
                  sizeof (trk->ct_ep[VLIB_RX]));
+    trk->ct_flags = path->ep_flags;
 
     cnat_tracker_track (ct->index, trk);
   }
@@ -345,11 +490,13 @@ u8 *
 format_cnat_translation (u8 * s, va_list * args)
 {
   cnat_translation_t *ct = va_arg (*args, cnat_translation_t *);
+  cnat_main_t *cm = &cnat_main;
   cnat_ep_trk_t *ck;
 
   s = format (s, "[%d] ", ct->index);
-  s = format (s, "%U %U", format_cnat_endpoint, &ct->ct_vip,
+  s = format (s, "%U %U ", format_cnat_endpoint, &ct->ct_vip,
               format_ip_protocol, ct->ct_proto);
+  s = format (s, "lb:%U ", format_cnat_lb_type, ct->lb_type);
 
   vec_foreach (ck, ct->ct_paths)
     s = format (s, "\n%U", format_cnat_ep_trk, ck, 2);
@@ -362,6 +509,25 @@ format_cnat_translation (u8 * s, va_list * args)
                   format_white_space, 2, format_dpo_id, &ct->ct_lb, 2);
     }
 
+  u32 bid = 0;
+  if (CNAT_LB_MAGLEV == ct->lb_type)
+    {
+      s = format (s, "\nmaglev backends map");
+      uword *bitmap = NULL;
+      clib_bitmap_alloc (bitmap, cm->maglev_len);
+      vec_foreach (ck, ct->ct_paths)
+        {
+          clib_bitmap_zero (bitmap);
+          for (u32 i = 0; i < vec_len (ct->lb_maglev); i++)
+            if (ct->lb_maglev[i] == bid)
+              clib_bitmap_set (bitmap, i, 1);
+          s = format (s, "\n backend#%d: %U", bid, format_bitmap_hex, bitmap);
+
+          bid++;
+        }
+      clib_bitmap_free (bitmap);
+    }
+
   return (s);
 }
 
@@ -490,6 +656,7 @@ cnat_translation_cli_add_del (vlib_main_t * vm,
   cnat_endpoint_tuple_t tmp, *paths = NULL, *path;
   unformat_input_t _line_input, *line_input = &_line_input;
   clib_error_t *e = 0;
+  cnat_lb_type_t lb_type = CNAT_LB_DEFAULT;
 
   /* Get a line of input. */
   if (!unformat_user (input, unformat_line_input, line_input))
@@ -513,6 +680,8 @@ cnat_translation_cli_add_del (vlib_main_t * vm,
           vec_add2 (paths, path, 1);
           clib_memcpy (path, &tmp, sizeof (cnat_endpoint_tuple_t));
         }
+      else if (unformat (line_input, "%U", unformat_cnat_lb_type, &lb_type))
+        ;
       else
         {
           e = clib_error_return (0, "unknown input '%U'",
@@ -522,7 +691,7 @@ cnat_translation_cli_add_del (vlib_main_t * vm,
     }
 
   if (INDEX_INVALID == del_index)
-    cnat_translation_update (&vip, proto, paths, flags);
+    cnat_translation_update (&vip, proto, paths, flags, lb_type);
   else
     cnat_translation_delete (del_index);
 
diff --git a/src/plugins/cnat/cnat_translation.h b/src/plugins/cnat/cnat_translation.h
index 8bec7396050..af0b94867af 100644
--- a/src/plugins/cnat/cnat_translation.h
+++ b/src/plugins/cnat/cnat_translation.h
@@ -28,6 +28,7 @@ extern vlib_combined_counter_main_t cnat_translation_counters;
 typedef enum cnat_trk_flag_t_
 {
   CNAT_TRK_ACTIVE = (1 << 0),
+  CNAT_TRK_FLAG_NO_NAT = (1 << 1),
 } cnat_trk_flag_t;
 
 /**
@@ -80,6 +81,12 @@ typedef enum
   CNAT_ADDR_N_RESOLUTIONS,
 } cnat_addr_resol_type_t;
 
+typedef enum __attribute__ ((__packed__))
+{
+  CNAT_LB_DEFAULT,
+  CNAT_LB_MAGLEV,
+} cnat_lb_type_t;
+
 /**
  * Entry used to account for a translation's backend
  * waiting for address resolution
@@ -159,6 +166,16 @@ typedef struct cnat_translation_t_
    * Translation flags
    */
   u8 flags;
+
+  /**
+   * Type of load balancing
+   */
+  cnat_lb_type_t lb_type;
+
+  union
+  {
+    u32 *lb_maglev;
+  };
 } cnat_translation_t;
 
 extern cnat_translation_t *cnat_translation_pool;
@@ -174,10 +191,10 @@ extern u8 *format_cnat_translation (u8 * s, va_list * args);
  *
  * @return the ID of the translation. used to delete and gather stats
  */
-extern u32 cnat_translation_update (cnat_endpoint_t * vip,
+extern u32 cnat_translation_update (cnat_endpoint_t *vip,
                                     ip_protocol_t ip_proto,
-                                    cnat_endpoint_tuple_t *
-                                    backends, u8 flags);
+                                    cnat_endpoint_tuple_t *backends, u8 flags,
+                                    cnat_lb_type_t lb_type);
 
 /**
  * Delete a translation
diff --git a/src/plugins/cnat/cnat_types.c b/src/plugins/cnat/cnat_types.c
index b6c6628961c..74c1c24389f 100644
--- a/src/plugins/cnat/cnat_types.c
+++ b/src/plugins/cnat/cnat_types.c
@@ -185,6 +185,7 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input)
   cm->session_max_age = CNAT_DEFAULT_SESSION_MAX_AGE;
   cm->tcp_max_age = CNAT_DEFAULT_TCP_MAX_AGE;
   cm->default_scanner_state = CNAT_SCANNER_ON;
+  cm->maglev_len = CNAT_DEFAULT_MAGLEV_LEN;
   cm->lazy_init_done = 0;
 
   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
@@ -217,6 +218,8 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input)
         ;
       else if (unformat (input, "tcp-max-age %u", &cm->tcp_max_age))
         ;
+      else if (unformat (input, "maglev-len %u", &cm->maglev_len))
+        ;
       else
         return clib_error_return (0, "unknown input '%U'",
                                   format_unformat_error, input);
diff --git a/src/plugins/cnat/cnat_types.h b/src/plugins/cnat/cnat_types.h
index 2c1b7f9be50..f0911d22d75 100644
--- a/src/plugins/cnat/cnat_types.h
+++ b/src/plugins/cnat/cnat_types.h
@@ -42,6 +42,9 @@
 #define CNAT_DEFAULT_TRANSLATION_MEMORY (256 << 10)
 #define CNAT_DEFAULT_SNAT_MEMORY (64 << 20)
 
+/* Should be prime >~ 100 * numBackends */
+#define CNAT_DEFAULT_MAGLEV_LEN 1009
+
 /* This should be strictly lower than FIB_SOURCE_INTERFACE
  * from fib_source.h */
 #define CNAT_FIB_SOURCE_PRIORITY 0x02
@@ -69,6 +72,7 @@ typedef struct cnat_endpoint_tuple_t_
 {
   cnat_endpoint_t dst_ep;
   cnat_endpoint_t src_ep;
+  u8 ep_flags; /* cnat_trk_flag_t */
 } cnat_endpoint_tuple_t;
 
 typedef struct
@@ -144,6 +148,10 @@ typedef struct cnat_main_
 
   /* Enable or Disable the scanner on startup */
   u8 default_scanner_state;
+
+  /* Number of buckets for maglev, should be a
+   * prime >= 100 * max num backends */
+  u32 maglev_len;
 } cnat_main_t;
 
 typedef struct cnat_timestamp_t_
-- 
2.16.6