From 647f609a11e2afb91a5216ca99d0705a3e1212a7 Mon Sep 17 00:00:00 2001 From: Hongjun Ni Date: Tue, 23 Jan 2018 19:17:23 +0800 Subject: [PATCH] Add L3DSR feature in LB plugin L3DSR is used to overcome Layer 2 limitations of Direct Server Return Load Balancing. It maps VIP to DSCP bits, and reuse TOS bits to transfer it to server, and then server will get VIP from DSCP-to-VIP mapping. Please refer to https://www.nanog.org/meetings/nanog51/presentations/Monday/NANOG51.Talk45.nanog51-Schaumann.pdf Change-Id: I403ffeadfb04ed0265086eb2dc41f2e17f8f34cb Signed-off-by: Hongjun Ni --- src/plugins/lb/api.c | 21 +++++-- src/plugins/lb/cli.c | 42 +++++++++++--- src/plugins/lb/lb.api | 14 +++-- src/plugins/lb/lb.c | 60 +++++++++++++++++--- src/plugins/lb/lb.h | 38 +++++++++++-- src/plugins/lb/lb_plugin_doc.md | 33 +++++++---- src/plugins/lb/lb_test.c | 8 ++- src/plugins/lb/node.c | 118 ++++++++++++++++++++++++++-------------- test/test_lb.py | 44 ++++++++++++--- 9 files changed, 282 insertions(+), 96 deletions(-) diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c index 7eb49ff6c26..28af6daa421 100644 --- a/src/plugins/lb/api.c +++ b/src/plugins/lb/api.c @@ -116,14 +116,23 @@ vl_api_lb_add_del_vip_t_handler rv = lb_vip_del(vip_index); } else { u32 vip_index; - lb_vip_type_t type; + lb_vip_type_t type = 0; + if (ip46_prefix_is_ip4(&prefix, mp->prefix_length)) { - type = mp->is_gre4?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6; + if (mp->encap == LB_ENCAP_TYPE_GRE4) + type = LB_VIP_TYPE_IP4_GRE4; + else if (mp->encap == LB_ENCAP_TYPE_GRE6) + type = LB_VIP_TYPE_IP4_GRE6; + else if (mp->encap == LB_ENCAP_TYPE_L3DSR) + type = LB_VIP_TYPE_IP4_L3DSR; } else { - type = mp->is_gre4?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6; + if (mp->encap == LB_ENCAP_TYPE_GRE4) + type = LB_VIP_TYPE_IP6_GRE4; + else if (mp->encap == LB_ENCAP_TYPE_GRE6) + type = LB_VIP_TYPE_IP6_GRE6; } - rv = lb_vip_add(&prefix, mp->prefix_length, type, + rv = lb_vip_add(&prefix, mp->prefix_length, type, mp->dscp, 
mp->new_flows_table_length, &vip_index); } REPLY_MACRO (VL_API_LB_CONF_REPLY); @@ -136,7 +145,9 @@ static void *vl_api_lb_add_del_vip_t_print s = format (0, "SCRIPT: lb_add_del_vip "); s = format (s, "%U ", format_ip46_prefix, (ip46_address_t *)mp->ip_prefix, mp->prefix_length, IP46_TYPE_ANY); - s = format (s, "%s ", mp->is_gre4?"gre4":"gre6"); + + s = format (s, "%s ", (mp->encap==LB_ENCAP_TYPE_GRE4)? + "gre4":(mp->encap==LB_ENCAP_TYPE_GRE6)?"gre6":"l3dsr"); s = format (s, "%u ", mp->new_flows_table_length); s = format (s, "%s ", mp->is_del?"del":"add"); FINISH; diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c index a5a87fccc5f..b29605af984 100644 --- a/src/plugins/lb/cli.c +++ b/src/plugins/lb/cli.c @@ -26,8 +26,9 @@ lb_vip_command_fn (vlib_main_t * vm, u32 new_len = 1024; u8 del = 0; int ret; - u32 gre4 = 0; - lb_vip_type_t type; + u32 encap = 0; + u32 dscp = ~0; + lb_vip_type_t type = 0; clib_error_t *error = 0; if (!unformat_user (input, unformat_line_input, line_input)) @@ -46,9 +47,13 @@ lb_vip_command_fn (vlib_main_t * vm, else if (unformat(line_input, "del")) del = 1; else if (unformat(line_input, "encap gre4")) - gre4 = 1; + encap = LB_ENCAP_TYPE_GRE4; else if (unformat(line_input, "encap gre6")) - gre4 = 0; + encap = LB_ENCAP_TYPE_GRE6; + else if (unformat(line_input, "encap l3dsr")) + encap = LB_ENCAP_TYPE_L3DSR; + else if (unformat(line_input, "dscp %d", &dscp)) + ; else { error = clib_error_return (0, "parse error: '%U'", format_unformat_error, line_input); @@ -56,18 +61,39 @@ lb_vip_command_fn (vlib_main_t * vm, } } + if ((encap != LB_ENCAP_TYPE_L3DSR) && (dscp != ~0) ) + { + error = clib_error_return (0, "lb_vip_add error: " + "should not configure dscp for none L3DSR."); + goto done; + } + + if ((encap == LB_ENCAP_TYPE_L3DSR) && (dscp >= 64 ) ) + { + error = clib_error_return (0, "lb_vip_add error: " + "dscp for L3DSR should be less than 64."); + goto done; + } if (ip46_prefix_is_ip4(&prefix, plen)) { - type = 
(gre4)?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6; + if (encap == LB_ENCAP_TYPE_GRE4) + type = LB_VIP_TYPE_IP4_GRE4; + else if (encap == LB_ENCAP_TYPE_GRE6) + type = LB_VIP_TYPE_IP4_GRE6; + else if (encap == LB_ENCAP_TYPE_L3DSR) + type = LB_VIP_TYPE_IP4_L3DSR; } else { - type = (gre4)?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6; + if (encap == LB_ENCAP_TYPE_GRE4) + type = LB_VIP_TYPE_IP6_GRE4; + else if (encap == LB_ENCAP_TYPE_GRE6) + type = LB_VIP_TYPE_IP6_GRE6; } lb_garbage_collection(); u32 index; if (!del) { - if ((ret = lb_vip_add(&prefix, plen, type, new_len, &index))) { + if ((ret = lb_vip_add(&prefix, plen, type, (u8)(dscp & 0x3F), new_len, &index))) { error = clib_error_return (0, "lb_vip_add error %d", ret); goto done; } else { @@ -92,7 +118,7 @@ done: VLIB_CLI_COMMAND (lb_vip_command, static) = { .path = "lb vip", - .short_help = "lb vip [encap (gre6|gre4)] [new_len ] [del]", + .short_help = "lb vip [encap (gre6|gre4|l3dsr)] [dscp ] [new_len ] [del]", .function = lb_vip_command_fn, }; diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api index f5036edf79d..101cee88ded 100644 --- a/src/plugins/lb/lb.api +++ b/src/plugins/lb/lb.api @@ -23,9 +23,10 @@ autoreply define lb_conf /** \brief Add a virtual address (or prefix) @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request - @param ip_prefix - IP address (IPv4 in lower order 32 bits). - @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4). - @param is_gre4 - Encap is ip4 GRE (ip6 GRE otherwise). + @param ip_prefix - IP address (IPv4 in lower order 32 bits). + @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2). + @param dscp - DSCP bit corresponding to VIP(applicable in L3DSR mode only). @param new_flows_table_length - Size of the new connections flow table used for this VIP (must be power of 2). 
@param is_del - The VIP should be removed. @@ -35,7 +36,8 @@ autoreply define lb_add_del_vip { u32 context; u8 ip_prefix[16]; u8 prefix_length; - u8 is_gre4; + u8 encap; + u8 dscp; u32 new_flows_table_length; u8 is_del; }; @@ -43,8 +45,8 @@ autoreply define lb_add_del_vip { /** \brief Add an application server for a given VIP @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request - @param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits). - @param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits). + @param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4). @param as_address - The application server address (IPv4 in lower order 32 bits). @param is_del - The AS should be removed. */ diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index fee88056eb4..06953a45aaa 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -48,6 +48,12 @@ const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, }; +const static char * const lb_dpo_l3dsr_ip4[] = { "lb4-l3dsr" , NULL }; +const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4, + }; + u32 lb_hash_time_now(vlib_main_t * vm) { return (u32) (vlib_time_now(vm) + 10000); @@ -81,6 +87,7 @@ static char *lb_vip_type_strings[] = { [LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4", [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6", [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4", + [LB_VIP_TYPE_IP4_L3DSR] = "ip4-l3dsr", }; u8 *format_lb_vip_type (u8 * s, va_list * args) @@ -140,6 +147,13 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) format_white_space, indent, vip->new_flow_table_mask + 1); + if (vip->type == LB_VIP_TYPE_IP4_L3DSR) + { + s = format(s, "%U dscp:%u\n", + format_white_space, indent, + vip->dscp); + } + //Print counters s = format(s, "%U counters:\n", 
format_white_space, indent); @@ -434,7 +448,7 @@ int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n) return VNET_API_ERROR_NO_SUCH_ENTRY; } - ip46_type_t type = lb_vip_is_gre4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6; + ip46_type_t type = lb_encap_is_ip4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6; u32 *to_be_added = 0; u32 *to_be_updated = 0; u32 i; @@ -497,7 +511,7 @@ next: * so we are informed when its forwarding changes */ fib_prefix_t nh = {}; - if (lb_vip_is_gre4(vip)) { + if (lb_encap_is_ip4(vip)) { nh.fp_addr.ip4 = as->address.ip4; nh.fp_len = 32; nh.fp_proto = FIB_PROTOCOL_IP4; @@ -595,6 +609,8 @@ int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n) static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) { dpo_proto_t proto = 0; + dpo_type_t dpo_type = 0; + dpo_id_t dpo = DPO_INVALID; fib_prefix_t pfx = {}; if (lb_vip_is_ip4(vip)) { @@ -608,8 +624,15 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) pfx.fp_proto = FIB_PROTOCOL_IP6; proto = DPO_PROTO_IP6; } - dpo_set(&dpo, lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, - proto, vip - lbm->vips); + + if(lb_vip_is_gre4(vip)) + dpo_type = lbm->dpo_gre4_type; + else if (lb_vip_is_gre6(vip)) + dpo_type = lbm->dpo_gre6_type; + else if (lb_vip_is_l3dsr(vip)) + dpo_type = lbm->dpo_l3dsr_type; + + dpo_set(&dpo, dpo_type, proto, vip - lbm->vips); fib_table_entry_special_dpo_add(0, &pfx, FIB_SOURCE_PLUGIN_HI, @@ -636,10 +659,12 @@ static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip) fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI); } -int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_length, u32 *vip_index) +int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, + u32 new_length, u32 *vip_index) { lb_main_t *lbm = &lb_main; lb_vip_t *vip; + lb_get_writer_lock(); ip46_prefix_normalize(prefix, plen); @@ -655,9 +680,19 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_leng 
if (ip46_prefix_is_ip4(prefix, plen) && (type != LB_VIP_TYPE_IP4_GRE4) && - (type != LB_VIP_TYPE_IP4_GRE6)) + (type != LB_VIP_TYPE_IP4_GRE6) && + (type != LB_VIP_TYPE_IP4_L3DSR)) + return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + + if ((!ip46_prefix_is_ip4(prefix, plen)) && + (type != LB_VIP_TYPE_IP6_GRE4) && + (type != LB_VIP_TYPE_IP6_GRE6)) return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + if ((type == LB_VIP_TYPE_IP4_L3DSR) && (dscp >= 64 ) ) + { + return VNET_API_ERROR_VALUE_EXIST; + } //Allocate pool_get(lbm->vips, vip); @@ -667,6 +702,7 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_leng vip->plen = plen; vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main()); vip->type = type; + vip->dscp = dscp; vip->flags = LB_VIP_FLAGS_USED; vip->as_indexes = 0; @@ -775,7 +811,16 @@ lb_as_stack (lb_as_t *as) { lb_main_t *lbm = &lb_main; lb_vip_t *vip = &lbm->vips[as->vip_index]; - dpo_stack(lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, + dpo_type_t dpo_type = 0; + + if(lb_vip_is_gre4(vip)) + dpo_type = lbm->dpo_gre4_type; + else if (lb_vip_is_gre6(vip)) + dpo_type = lbm->dpo_gre6_type; + else if (lb_vip_is_l3dsr(vip)) + dpo_type = lbm->dpo_l3dsr_type; + + dpo_stack(dpo_type, lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, &as->dpo, fib_entry_contribute_ip_forwarding( @@ -819,6 +864,7 @@ lb_init (vlib_main_t * vm) lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL; lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes); lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes); + lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, lb_dpo_l3dsr_nodes); lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); //Init AS reference counters diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h index fa0b5d48b07..8db0394075c 100644 --- a/src/plugins/lb/lb.h +++ b/src/plugins/lb/lb.h @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -128,18 +129,27 @@ typedef enum { 
LB_N_VIP_COUNTERS } lb_vip_counter_t; +typedef enum { + LB_ENCAP_TYPE_GRE4, + LB_ENCAP_TYPE_GRE6, + LB_ENCAP_TYPE_L3DSR, + LB_ENCAP_N_TYPES, +} lb_encap_type_t; + /** * The load balancer supports IPv4 and IPv6 traffic - * and GRE4 and GRE6 encap. + * and GRE4, GRE6 and L3DSR encap. */ typedef enum { LB_VIP_TYPE_IP6_GRE6, LB_VIP_TYPE_IP6_GRE4, LB_VIP_TYPE_IP4_GRE6, LB_VIP_TYPE_IP4_GRE4, + LB_VIP_TYPE_IP4_L3DSR, LB_VIP_N_TYPES, } lb_vip_type_t; + format_function_t format_lb_vip_type; unformat_function_t unformat_lb_vip_type; @@ -195,6 +205,11 @@ typedef struct { */ lb_vip_type_t type; + /** + * DSCP bits for L3DSR + */ + u8 dscp; + /** * Flags related to this VIP. * LB_VIP_FLAGS_USED means the VIP is active. @@ -212,8 +227,20 @@ typedef struct { u32 *as_indexes; } lb_vip_t; -#define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 || (vip)->type == LB_VIP_TYPE_IP4_GRE4) -#define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 || (vip)->type == LB_VIP_TYPE_IP4_GRE4) +#define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \ + || (vip)->type == LB_VIP_TYPE_IP4_L3DSR ) + +#define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE4) +#define lb_vip_is_gre6(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE6 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE6) +#define lb_vip_is_l3dsr(vip) ((vip)->type == LB_VIP_TYPE_IP4_L3DSR) + +#define lb_encap_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \ + || (vip)->type == LB_VIP_TYPE_IP4_L3DSR) + format_function_t format_lb_vip; format_function_t format_lb_vip_detailed; @@ -286,6 +313,7 @@ typedef struct { */ dpo_type_t dpo_gre4_type; dpo_type_t dpo_gre6_type; + dpo_type_t dpo_l3dsr_type; /** * Node type for registering to fib changes. 
@@ -313,8 +341,8 @@ extern vlib_node_registration_t lb4_node; int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, u32 sticky_buckets, u32 flow_timeout); -int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, - u32 new_length, u32 *vip_index); +int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, + u32 new_length, u32 *vip_index); int lb_vip_del(u32 vip_index); int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index); diff --git a/src/plugins/lb/lb_plugin_doc.md b/src/plugins/lb/lb_plugin_doc.md index c7885ffb837..7672b1e88d7 100644 --- a/src/plugins/lb/lb_plugin_doc.md +++ b/src/plugins/lb/lb_plugin_doc.md @@ -8,19 +8,26 @@ Wich also means feedback is really welcome regarding features, apis, etc... ## Overview -This plugin provides load balancing for VPP in a way that is largely inspired +This plugin provides load balancing for VPP in a way that is largely inspired from Google's MagLev: http://research.google.com/pubs/pub44824.html -The load balancer is configured with a set of Virtual IPs (VIP, which can be +The load balancer is configured with a set of Virtual IPs (VIP, which can be prefixes), and for each VIP, with a set of Application Server addresses (ASs). +There are three encap types to steer traffic to different ASs: +1). IPv4+GRE and IPv6+GRE encap types: Traffic received for a given VIP (or VIP prefix) is tunneled using GRE towards -the different ASs in a way that (tries to) ensure that a given session will +the different ASs in a way that (tries to) ensure that a given session will always be tunneled to the same AS. +2). IPv4+L3DSR encap type: +L3DSR is used to overcome Layer 2 limitations of Direct Server Return Load Balancing. +It maps the VIP to DSCP bits, and reuses the TOS bits to transfer the DSCP bits +to the server, and then the server will get the VIP from the DSCP-to-VIP mapping. + Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must be using -the same encap. type (i.e. 
IPv4+GRE or IPv6+GRE). Meaning that for a given VIP, -all AS addresses must be of the same family. +the same encap. type (i.e. IPv4+GRE or IPv6+GRE or IPv4+L3DSR). +Meaning that for a given VIP, all AS addresses must be of the same family. ## Performances @@ -35,34 +42,36 @@ in next versions. The load balancer needs to be configured with some parameters: - lb conf [ip4-src-address ] [ip6-src-address ] + lb conf [ip4-src-address ] [ip6-src-address ] [buckets ] [timeout ] - + ip4-src-address: the source address used to send encap. packets using IPv4. ip6-src-address: the source address used to send encap. packets using IPv6. buckets: the *per-thread* established-connexions-table number of buckets. -timeout: the number of seconds a connection will remain in the +timeout: the number of seconds a connection will remain in the established-connexions-table while no packet for this flow is received. - ### Configure the VIPs - lb vip [encap (gre6|gre4)] [new_len ] [del] - + lb vip [encap (gre6|gre4|l3dsr)] [dscp ] [new_len ] [del] + new_len is the size of the new-connection-table. It should be 1 or 2 orders of magnitude bigger than the number of ASs for the VIP in order to ensure a good load balancing. +Encap l3dsr and dscp are used to map the VIP to DSCP bits and rewrite the DSCP bits in packets. +So the selected server can get the VIP from the DSCP bits in the packet and perform DSR. 
Examples: - + lb vip 2002::/16 encap gre6 new_len 1024 lb vip 2003::/16 encap gre4 new_len 2048 lb vip 80.0.0.0/8 encap gre6 new_len 16 lb vip 90.0.0.0/8 encap gre4 new_len 1024 + lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32 ### Configure the ASs (for each VIP) diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c index 35bda262fee..b02793944c5 100644 --- a/src/plugins/lb/lb_test.c +++ b/src/plugins/lb/lb_test.c @@ -157,7 +157,7 @@ static int api_lb_add_del_vip (vat_main_t * vam) vl_api_lb_add_del_vip_t mps, *mp; int ret; mps.is_del = 0; - mps.is_gre4 = 0; + mps.encap = LB_ENCAP_TYPE_GRE4; if (!unformat(i, "%U", unformat_ip46_prefix, mps.ip_prefix, &mps.prefix_length, IP46_TYPE_ANY)) { @@ -166,9 +166,11 @@ static int api_lb_add_del_vip (vat_main_t * vam) } if (unformat(i, "gre4")) { - mps.is_gre4 = 1; + mps.encap = LB_ENCAP_TYPE_GRE4; } else if (unformat(i, "gre6")) { - mps.is_gre4 = 0; + mps.encap = LB_ENCAP_TYPE_GRE6; + } else if (unformat(i, "l3dsr")) { + mps.encap = LB_ENCAP_TYPE_L3DSR; } else { errmsg ("no encap\n"); return -99; diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c index 4a7485eb835..22ba3104f92 100644 --- a/src/plugins/lb/node.c +++ b/src/plugins/lb/node.c @@ -149,7 +149,7 @@ static_always_inline uword lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6) - u8 is_encap_v4) //Compile-time parameter stating that is GRE encap is v4 (or v6) + lb_encap_type_t encap_type) //Compile-time parameter stating that is GRE4 or GRE6 or L3DSR { lb_main_t *lbm = &lb_main; u32 n_left_from, *from, next_index, *to_next, n_left_to_next; @@ -265,43 +265,54 @@ lb_node_fn (vlib_main_t * vm, 1); //Now let's encap - { - gre_header_t *gre0; - if (is_encap_v4) - { - ip4_header_t *ip40; - vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t)); - ip40 = vlib_buffer_get_current(p0); - gre0 = (gre_header_t *)(ip40 + 1); - 
ip40->src_address = lbm->ip4_src_address; - ip40->dst_address = lbm->ass[asindex0].address.ip4; - ip40->ip_version_and_header_length = 0x45; - ip40->ttl = 128; - ip40->fragment_id = 0; - ip40->flags_and_fragment_offset = 0; - ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); - ip40->protocol = IP_PROTOCOL_GRE; - ip40->checksum = ip4_header_checksum (ip40); - } - else - { - ip6_header_t *ip60; - vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t)); - ip60 = vlib_buffer_get_current(p0); - gre0 = (gre_header_t *)(ip60 + 1); - ip60->dst_address = lbm->ass[asindex0].address.ip6; - ip60->src_address = lbm->ip6_src_address; - ip60->hop_limit = 128; - ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28); - ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t)); - ip60->protocol = IP_PROTOCOL_GRE; - } - - gre0->flags_and_version = 0; - gre0->protocol = (is_input_v4)? - clib_host_to_net_u16(0x0800): - clib_host_to_net_u16(0x86DD); - } + if ( (encap_type == LB_ENCAP_TYPE_GRE4) + || (encap_type == LB_ENCAP_TYPE_GRE6) ) + { + gre_header_t *gre0; + if (encap_type == LB_ENCAP_TYPE_GRE4) /* encap GRE4*/ + { + ip4_header_t *ip40; + vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t)); + ip40 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip40 + 1); + ip40->src_address = lbm->ip4_src_address; + ip40->dst_address = lbm->ass[asindex0].address.ip4; + ip40->ip_version_and_header_length = 0x45; + ip40->ttl = 128; + ip40->fragment_id = 0; + ip40->flags_and_fragment_offset = 0; + ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); + ip40->protocol = IP_PROTOCOL_GRE; + ip40->checksum = ip4_header_checksum (ip40); + } + else /* encap GRE6*/ + { + ip6_header_t *ip60; + vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t)); + ip60 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip60 + 1); + 
ip60->dst_address = lbm->ass[asindex0].address.ip6; + ip60->src_address = lbm->ip6_src_address; + ip60->hop_limit = 128; + ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28); + ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t)); + ip60->protocol = IP_PROTOCOL_GRE; + } + + gre0->flags_and_version = 0; + gre0->protocol = (is_input_v4)? + clib_host_to_net_u16(0x0800): + clib_host_to_net_u16(0x86DD); + } else if (encap_type == LB_ENCAP_TYPE_L3DSR) /* encap L3DSR*/ + { + ip4_header_t *ip40; + + ip40 = vlib_buffer_get_current(p0); + ip40->dst_address = lbm->ass[asindex0].address.ip4; + /* Get and rewrite DSCP bit */ + ip40->tos = (u8)((vip0->dscp & 0x3F)<<2); + ip40->checksum = ip4_header_checksum (ip40); + } if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -327,28 +338,35 @@ static uword lb6_gre6_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 0, 0); + return lb_node_fn(vm, node, frame, 0, LB_ENCAP_TYPE_GRE6); } static uword lb6_gre4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 0, 1); + return lb_node_fn(vm, node, frame, 0, LB_ENCAP_TYPE_GRE4); } static uword lb4_gre6_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 1, 0); + return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_GRE6); } static uword lb4_gre4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 1, 1); + return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_GRE4); +} + +static uword +lb4_l3dsr_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_L3DSR); } VLIB_REGISTER_NODE (lb6_gre6_node) = @@ -419,3 +437,19 @@ VLIB_REGISTER_NODE (lb4_gre4_node) = }, }; +VLIB_REGISTER_NODE (lb4_l3dsr_node) = +{ + 
.function = lb4_l3dsr_node_fn, + .name = "lb4-l3dsr", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; diff --git a/test/test_lb.py b/test/test_lb.py index e653b60b0ab..731790bce72 100644 --- a/test/test_lb.py +++ b/test/test_lb.py @@ -15,6 +15,7 @@ from util import ppp - IP4 to GRE6 encap - IP6 to GRE4 encap - IP6 to GRE6 encap + - IP4 to L3DSR encap As stated in comments below, GRE has issues with IPv6. All test cases involving IPv6 are executed, but @@ -94,7 +95,7 @@ class TestLB(VppTestCase): self.assertEqual(payload_info.src, self.pg0.sw_if_index) self.assertEqual(str(inner), str(self.info.data[IPver])) - def checkCapture(self, gre4, isv4): + def checkCapture(self, encap, isv4): self.pg0.assert_nothing_captured() out = self.pg1.get_capture(len(self.packets)) @@ -104,7 +105,7 @@ class TestLB(VppTestCase): try: asid = 0 gre = None - if gre4: + if (encap == 'gre4'): ip = p[IP] asid = int(ip.dst.split(".")[3]) self.assertEqual(ip.version, 4) @@ -115,7 +116,8 @@ class TestLB(VppTestCase): self.assertEqual(len(ip.options), 0) self.assertGreaterEqual(ip.ttl, 64) gre = p[GRE] - else: + self.checkInner(gre, isv4) + elif (encap == 'gre6'): ip = p[IPv6] asid = ip.dst.split(":") asid = asid[len(asid) - 1] @@ -132,7 +134,15 @@ class TestLB(VppTestCase): self.assertGreaterEqual(ip.hlim, 64) # self.assertEqual(len(ip.options), 0) gre = GRE(str(p[IPv6].payload)) - self.checkInner(gre, isv4) + self.checkInner(gre, isv4) + if (encap == 'l3dsr'): + ip = p[IP] + asid = int(ip.dst.split(".")[3]) + self.assertEqual(ip.version, 4) + self.assertEqual(ip.flags, 0) + self.assertEqual(ip.dst, "10.0.0.%u" % asid) + self.assertEqual(ip.tos, 0x1c) + self.assertEqual(len(ip.options), 0) load[asid] += 1 except: self.logger.error(ppp("Unexpected or invalid packet:", p)) @@ -156,7 +166,7 @@ class TestLB(VppTestCase): 
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=True)) self.pg_enable_capture(self.pg_interfaces) self.pg_start() - self.checkCapture(gre4=True, isv4=True) + self.checkCapture(encap='gre4', isv4=True) finally: for asid in self.ass: @@ -176,7 +186,7 @@ class TestLB(VppTestCase): self.pg_enable_capture(self.pg_interfaces) self.pg_start() - self.checkCapture(gre4=True, isv4=False) + self.checkCapture(encap='gre4', isv4=False) finally: for asid in self.ass: self.vapi.cli("lb as 2001::/16 10.0.0.%u del" % (asid)) @@ -194,7 +204,7 @@ class TestLB(VppTestCase): self.pg_enable_capture(self.pg_interfaces) self.pg_start() - self.checkCapture(gre4=False, isv4=True) + self.checkCapture(encap='gre6', isv4=True) finally: for asid in self.ass: self.vapi.cli("lb as 90.0.0.0/8 2002::%u del" % (asid)) @@ -212,9 +222,27 @@ class TestLB(VppTestCase): self.pg_enable_capture(self.pg_interfaces) self.pg_start() - self.checkCapture(gre4=False, isv4=False) + self.checkCapture(encap='gre6', isv4=False) finally: for asid in self.ass: self.vapi.cli("lb as 2001::/16 2002::%u del" % (asid)) self.vapi.cli("lb vip 2001::/16 encap gre6 del") self.vapi.cli("test lb flowtable flush") + + def test_lb_ip4_l3dsr(self): + """ Load Balancer IP4 L3DSR """ + try: + self.vapi.cli("lb vip 90.0.0.0/8 encap l3dsr dscp 7") + for asid in self.ass: + self.vapi.cli("lb as 90.0.0.0/8 10.0.0.%u" % (asid)) + + self.pg0.add_stream(self.generatePackets(self.pg0, isv4=True)) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + self.checkCapture(encap='l3dsr', isv4=True) + + finally: + for asid in self.ass: + self.vapi.cli("lb as 90.0.0.0/8 10.0.0.%u del" % (asid)) + self.vapi.cli("lb vip 90.0.0.0/8 encap l3dsr dscp 7 del") + self.vapi.cli("test lb flowtable flush") -- 2.16.6