X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fplugins%2Flb%2Flb.c;h=5326433981157654d9c5a78869d19e8a295b51ab;hb=b7b929931a07fbb27b43d5cd105f366c3e29807e;hp=06953a45aaa8d326156c23c3cb9bdf96c30acc49;hpb=647f609a11e2afb91a5216ca99d0705a3e1212a7;p=vpp.git diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index 06953a45aaa..53264339811 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -17,6 +17,7 @@ #include #include #include +#include //GC runs at most once every so many seconds #define LB_GARBAGE_RUN 60 @@ -26,8 +27,8 @@ lb_main_t lb_main; -#define lb_get_writer_lock() do {} while(__sync_lock_test_and_set (lb_main.writer_lock, 1)) -#define lb_put_writer_lock() lb_main.writer_lock[0] = 0 +#define lb_get_writer_lock() do {} while(clib_atomic_test_and_set (lb_main.writer_lock)) +#define lb_put_writer_lock() clib_atomic_release (lb_main.writer_lock) static void lb_as_stack (lb_as_t *as); @@ -36,22 +37,56 @@ const static char * const lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL }; const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL }; const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, - [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, + [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, }; const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL }; const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL }; const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, - [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, + [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, }; -const static char * const lb_dpo_l3dsr_ip4[] = { "lb4-l3dsr" , NULL }; +const static char * const lb_dpo_gre4_ip4_port[] = { "lb4-gre4-port" , NULL }; +const static char * const lb_dpo_gre4_ip6_port[] = { "lb6-gre4-port" , NULL }; +const static char* const * const lb_dpo_gre4_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre4_ip4_port, + [DPO_PROTO_IP6] = lb_dpo_gre4_ip6_port, + }; + +const static char * const lb_dpo_gre6_ip4_port[] = { "lb4-gre6-port" , NULL }; +const static char * const lb_dpo_gre6_ip6_port[] = { "lb6-gre6-port" , NULL }; +const static char* const * const lb_dpo_gre6_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre6_ip4_port, + [DPO_PROTO_IP6] = lb_dpo_gre6_ip6_port, + }; + +const static char * const lb_dpo_l3dsr_ip4[] = {"lb4-l3dsr" , NULL}; const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4, + [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4, + }; + +const static char * const lb_dpo_l3dsr_ip4_port[] = {"lb4-l3dsr-port" , NULL}; +const static char* const * const lb_dpo_l3dsr_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4_port, + }; + +const static char * const lb_dpo_nat4_ip4_port[] = { "lb4-nat4-port" , NULL }; +const static char* const * const lb_dpo_nat4_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_nat4_ip4_port, + }; + +const static char * const lb_dpo_nat6_ip6_port[] = { "lb6-nat6-port" , NULL }; +const static char* const * const lb_dpo_nat6_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP6] = lb_dpo_nat6_ip6_port, }; u32 lb_hash_time_now(vlib_main_t * vm) @@ -88,6 +123,8 @@ static char *lb_vip_type_strings[] = { [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6", [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4", [LB_VIP_TYPE_IP4_L3DSR] = "ip4-l3dsr", + [LB_VIP_TYPE_IP4_NAT4] = "ip4-nat4", + [LB_VIP_TYPE_IP6_NAT6] = "ip6-nat6", }; u8 *format_lb_vip_type (u8 * s, va_list * args) @@ -115,20 +152,40 @@ uword unformat_lb_vip_type (unformat_input_t * input, va_list * args) u8 *format_lb_vip (u8 * s, va_list * args) { lb_vip_t *vip = va_arg (*args, lb_vip_t *); - return format(s, "%U %U new_size:%u #as:%u%s", + s = format(s, "%U %U new_size:%u #as:%u%s", format_lb_vip_type, vip->type, format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY, vip->new_flow_table_mask + 1, pool_elts(vip->as_indexes), (vip->flags & LB_VIP_FLAGS_USED)?"":" removed"); + + if (vip->port != 0) + { + s = format(s, " protocol:%u port:%u ", vip->protocol, vip->port); + } + + if (vip->type == LB_VIP_TYPE_IP4_L3DSR) + { + s = format(s, " dscp:%u", vip->encap_args.dscp); + } + else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) + || (vip->type == LB_VIP_TYPE_IP6_NAT6)) + { + s = format (s, " type:%s port:%u target_port:%u", + (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip": + "nodeport", + ntohs(vip->port), ntohs(vip->encap_args.target_port)); + } + + return s; } u8 *format_lb_as (u8 * s, va_list * args) { lb_as_t *as = va_arg (*args, lb_as_t *); return format(s, "%U %s", format_ip46_address, - &as->address, IP46_TYPE_ANY, - (as->flags & LB_AS_FLAGS_USED)?"used":"removed"); + &as->address, IP46_TYPE_ANY, + (as->flags & LB_AS_FLAGS_USED)?"used":"removed"); } u8 *format_lb_vip_detailed (u8 * s, va_list * args) @@ -147,11 +204,27 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) format_white_space, indent, vip->new_flow_table_mask + 1); + if (vip->port != 0) + { + s = format(s, "%U protocol:%u port:%u\n", + format_white_space, indent, + vip->protocol, vip->port); + } + if (vip->type == LB_VIP_TYPE_IP4_L3DSR) { s = format(s, "%U dscp:%u\n", format_white_space, indent, - vip->dscp); + vip->encap_args.dscp); + } + else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) + || (vip->type == LB_VIP_TYPE_IP6_NAT6)) + { + s = format (s, "%U type:%s port:%u target_port:%u", + format_white_space, indent, + (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip": + "nodeport", + ntohs(vip->port), ntohs(vip->encap_args.target_port)); } //Print counters @@ -159,7 +232,7 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) format_white_space, indent); u32 i; for (i=0; ivip_counters[i].name, vlib_get_simple_counter(&lbm->vip_counters[i], vip - lbm->vips)); @@ -180,7 +253,7 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) u32 *as_index; pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; - s = format(s, "%U %U %d buckets %d flows dpo:%u %s\n", + s = format(s, "%U %U %u buckets %Lu flows dpo:%u %s\n", format_white_space, indent, format_ip46_address, &as->address, IP46_TYPE_ANY, count[as - lbm->ass], @@ -190,14 +263,6 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) }); vec_free(count); - - /* - s = format(s, "%U new flows table:\n", format_white_space, indent); - lb_new_flow_entry_t *nfe; - vec_foreach(nfe, vip->new_flow_table) { - s = format(s, "%U %d: %d\n", format_white_space, indent, nfe - vip->new_flow_table, nfe->as_index); - } - */ return s; } @@ -219,6 +284,11 @@ static int lb_pseudorand_compare(void *a, void *b) static void lb_vip_garbage_collection(lb_vip_t *vip) { lb_main_t *lbm = &lb_main; + lb_snat4_key_t m_key4; + clib_bihash_kv_8_8_t kv4, value4; + lb_snat6_key_t m_key6; + clib_bihash_kv_24_8_t kv6, value6; + lb_snat_mapping_t *m = 0; ASSERT (lbm->writer_lock[0]); u32 now = (u32) vlib_time_now(vlib_get_main()); @@ -231,18 +301,52 @@ static void lb_vip_garbage_collection(lb_vip_t *vip) pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; if (!(as->flags & LB_AS_FLAGS_USED) && //Not used - clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used - (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) - { //Not referenced - fib_entry_child_remove(as->next_hop_fib_entry_index, - as->next_hop_child_index); - fib_table_entry_delete_index(as->next_hop_fib_entry_index, - FIB_SOURCE_RR); - as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID; - - pool_put(vip->as_indexes, as_index); - pool_put(lbm->ass, as); - } + clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && + (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) + { //Not referenced + + if (lb_vip_is_nat4_port(vip)) { + m_key4.addr = as->address.ip4; + m_key4.port = vip->encap_args.target_port; + m_key4.protocol = 0; + m_key4.fib_index = 0; + + kv4.key = m_key4.as_u64; + if(!clib_bihash_search_8_8(&lbm->mapping_by_as4, &kv4, &value4)) + m = pool_elt_at_index (lbm->snat_mappings, value4.value); + ASSERT (m); + + kv4.value = m - lbm->snat_mappings; + clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 0); + pool_put (lbm->snat_mappings, m); + } else if (lb_vip_is_nat6_port(vip)) { + m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0]; + m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1]; + m_key6.port = vip->encap_args.target_port; + m_key6.protocol = 0; + m_key6.fib_index = 0; + + kv6.key[0] = m_key6.as_u64[0]; + kv6.key[1] = m_key6.as_u64[1]; + kv6.key[2] = m_key6.as_u64[2]; + + if (!clib_bihash_search_24_8 (&lbm->mapping_by_as6, &kv6, &value6)) + m = pool_elt_at_index (lbm->snat_mappings, value6.value); + ASSERT (m); + + kv6.value = m - lbm->snat_mappings; + clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 0); + pool_put (lbm->snat_mappings, m); + } + fib_entry_child_remove(as->next_hop_fib_entry_index, + as->next_hop_child_index); + fib_table_entry_delete_index(as->next_hop_fib_entry_index, + FIB_SOURCE_RR); + as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID; + + pool_put(vip->as_indexes, as_index); + pool_put(lbm->ass, as); + } }); } @@ -279,7 +383,6 @@ static void lb_vip_update_new_flow_table(lb_vip_t *vip) lb_new_flow_entry_t *new_flow_table = 0; lb_as_t *as; lb_pseudorand_t *pr, *sort_arr = 0; - u32 count; ASSERT (lbm->writer_lock[0]); //We must have the lock @@ -304,7 +407,6 @@ out: } //First, let's sort the ASs - sort_arr = 0; vec_alloc(sort_arr, pool_elts(vip->as_indexes)); i = 0; @@ -359,16 +461,8 @@ out: } } - vec_free(sort_arr); - finished: - -//Count number of changed entries - count = 0; - for (i=0; inew_flow_table == 0 || - new_flow_table[i].as_index != vip->new_flow_table[i].as_index) - count++; + vec_free(sort_arr); old_table = vip->new_flow_table; vip->new_flow_table = new_flow_table; @@ -392,8 +486,13 @@ int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, return 0; } + + static -int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index) +int lb_vip_port_find_index(ip46_address_t *prefix, u8 plen, + u8 protocol, u16 port, + lb_lkp_type_t lkp_type, + u32 *vip_index) { lb_main_t *lbm = &lb_main; lb_vip_t *vip; @@ -403,19 +502,57 @@ int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index) if ((vip->flags & LB_AS_FLAGS_USED) && vip->plen == plen && vip->prefix.as_u64[0] == prefix->as_u64[0] && - vip->prefix.as_u64[1] == prefix->as_u64[1]) { - *vip_index = vip - lbm->vips; - return 0; - } + vip->prefix.as_u64[1] == prefix->as_u64[1]) + { + if((lkp_type == LB_LKP_SAME_IP_PORT && + vip->protocol == protocol && + vip->port == port) || + (lkp_type == LB_LKP_ALL_PORT_IP && + vip->port == 0) || + (lkp_type == LB_LKP_DIFF_IP_PORT && + (vip->protocol != protocol || + vip->port != port) ) ) + { + *vip_index = vip - lbm->vips; + return 0; + } + } }); return VNET_API_ERROR_NO_SUCH_ENTRY; } -int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index) +static +int lb_vip_port_find_index_with_lock(ip46_address_t *prefix, u8 plen, + u8 protocol, u16 port, u32 *vip_index) +{ + return lb_vip_port_find_index(prefix, plen, protocol, port, + LB_LKP_SAME_IP_PORT, vip_index); +} + +static +int lb_vip_port_find_all_port_vip(ip46_address_t *prefix, u8 plen, + u32 *vip_index) +{ + return lb_vip_port_find_index(prefix, plen, ~0, 0, + LB_LKP_ALL_PORT_IP, vip_index); +} + +/* Find out per-port-vip entry with different protocol and port */ +static +int lb_vip_port_find_diff_port(ip46_address_t *prefix, u8 plen, + u8 protocol, u16 port, u32 *vip_index) +{ + return lb_vip_port_find_index(prefix, plen, protocol, port, + LB_LKP_DIFF_IP_PORT, vip_index); +} + +int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u8 protocol, + u16 port, u32 *vip_index) { int ret; lb_get_writer_lock(); - ret = lb_vip_find_index_with_lock(prefix, plen, vip_index); + ret = lb_vip_port_find_index_with_lock(prefix, plen, + protocol, port, vip_index); lb_put_writer_lock(); return ret; } @@ -430,7 +567,8 @@ static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_ as = &lbm->ass[*asi]; if (as->vip_index == (vip - lbm->vips) && as->address.as_u64[0] == address->as_u64[0] && - as->address.as_u64[1] == address->as_u64[1]) { + as->address.as_u64[1] == address->as_u64[1]) + { *as_index = as - lbm->ass; return 0; } @@ -453,6 +591,7 @@ int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n) u32 *to_be_updated = 0; u32 i; u32 *ip; + lb_snat_mapping_t *m; //Sanity check while (n--) { @@ -512,26 +651,94 @@ next: */ fib_prefix_t nh = {}; if (lb_encap_is_ip4(vip)) { - nh.fp_addr.ip4 = as->address.ip4; - nh.fp_len = 32; - nh.fp_proto = FIB_PROTOCOL_IP4; + nh.fp_addr.ip4 = as->address.ip4; + nh.fp_len = 32; + nh.fp_proto = FIB_PROTOCOL_IP4; } else { - nh.fp_addr.ip6 = as->address.ip6; - nh.fp_len = 128; - nh.fp_proto = FIB_PROTOCOL_IP6; + nh.fp_addr.ip6 = as->address.ip6; + nh.fp_len = 128; + nh.fp_proto = FIB_PROTOCOL_IP6; } as->next_hop_fib_entry_index = - fib_table_entry_special_add(0, - &nh, - FIB_SOURCE_RR, - FIB_ENTRY_FLAG_NONE); + fib_table_entry_special_add(0, + &nh, + FIB_SOURCE_RR, + FIB_ENTRY_FLAG_NONE); as->next_hop_child_index = - fib_entry_child_add(as->next_hop_fib_entry_index, - lbm->fib_node_type, - as - lbm->ass); + fib_entry_child_add(as->next_hop_fib_entry_index, + lbm->fib_node_type, + as - lbm->ass); lb_as_stack(as); + + if ( lb_vip_is_nat4_port(vip) || lb_vip_is_nat6_port(vip) ) + { + /* Add SNAT static mapping */ + pool_get (lbm->snat_mappings, m); + clib_memset (m, 0, sizeof (*m)); + if (lb_vip_is_nat4_port(vip)) { + lb_snat4_key_t m_key4; + clib_bihash_kv_8_8_t kv4; + m_key4.addr = as->address.ip4; + m_key4.port = vip->encap_args.target_port; + m_key4.protocol = 0; + m_key4.fib_index = 0; + + if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) + { + m->src_ip.ip4 = vip->prefix.ip4; + } + else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT) + { + m->src_ip.ip4 = lbm->ip4_src_address; + } + m->src_ip_is_ipv6 = 0; + m->as_ip.ip4 = as->address.ip4; + m->as_ip_is_ipv6 = 0; + m->src_port = vip->port; + m->target_port = vip->encap_args.target_port; + m->vrf_id = 0; + m->fib_index = 0; + + kv4.key = m_key4.as_u64; + kv4.value = m - lbm->snat_mappings; + clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 1); + } else { + lb_snat6_key_t m_key6; + clib_bihash_kv_24_8_t kv6; + m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0]; + m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1]; + m_key6.port = vip->encap_args.target_port; + m_key6.protocol = 0; + m_key6.fib_index = 0; + + if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) + { + m->src_ip.ip6.as_u64[0] = vip->prefix.ip6.as_u64[0]; + m->src_ip.ip6.as_u64[1] = vip->prefix.ip6.as_u64[1]; + } + else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT) + { + m->src_ip.ip6.as_u64[0] = lbm->ip6_src_address.as_u64[0]; + m->src_ip.ip6.as_u64[1] = lbm->ip6_src_address.as_u64[1]; + } + m->src_ip_is_ipv6 = 1; + m->as_ip.ip6.as_u64[0] = as->address.ip6.as_u64[0]; + m->as_ip.ip6.as_u64[1] = as->address.ip6.as_u64[1]; + m->as_ip_is_ipv6 = 1; + m->src_port = vip->port; + m->target_port = vip->encap_args.target_port; + m->vrf_id = 0; + m->fib_index = 0; + + kv6.key[0] = m_key6.as_u64[0]; + kv6.key[1] = m_key6.as_u64[1]; + kv6.key[2] = m_key6.as_u64[2]; + kv6.value = m - lbm->snat_mappings; + clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 1); + } + } } vec_free(to_be_added); @@ -545,11 +752,48 @@ next: return 0; } -int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) +int +lb_flush_vip_as (u32 vip_index, u32 as_index) +{ + u32 thread_index; + vlib_thread_main_t *tm = vlib_get_thread_main(); + lb_main_t *lbm = &lb_main; + + for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) { + lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht; + if (h != NULL) { + u32 i; + lb_hash_bucket_t *b; + + lb_hash_foreach_entry(h, b, i) { + if ((vip_index == ~0) + || ((b->vip[i] == vip_index) && (as_index == ~0)) + || ((b->vip[i] == vip_index) && (b->value[i] == as_index))) + { + vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1); + vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1); + b->vip[i] = ~0; + b->value[i] = ~0; + } + } + if (vip_index == ~0) + { + lb_hash_free(h); + lbm->per_cpu[thread_index].sticky_ht = 0; + } + } + } + + return 0; +} + +int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n, + u8 flush) { lb_main_t *lbm = &lb_main; u32 now = (u32) vlib_time_now(vlib_get_main()); u32 *ip = 0; + u32 as_index = 0; lb_vip_t *vip; if (!(vip = lb_vip_get_by_index(vip_index))) { @@ -558,8 +802,7 @@ int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) u32 *indexes = NULL; while (n--) { - u32 i; - if (lb_as_find_index_vip(vip, &addresses[n], &i)) { + if (lb_as_find_index_vip(vip, &addresses[n], &as_index)) { vec_free(indexes); return VNET_API_ERROR_NO_SUCH_ENTRY; } @@ -573,7 +816,7 @@ int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) } } - vec_add1(indexes, i); + vec_add1(indexes, as_index); next: continue; } @@ -585,6 +828,12 @@ next: vec_foreach(ip, indexes) { lbm->ass[*ip].flags &= ~LB_AS_FLAGS_USED; lbm->ass[*ip].last_used = now; + + if(flush) + { + /* flush flow table for deleted ASs*/ + lb_flush_vip_as(vip_index, *ip); + } } //Recompute flows @@ -595,25 +844,76 @@ next: return 0; } -int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n) +int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n, u8 flush) { lb_get_writer_lock(); - int ret = lb_vip_del_ass_withlock(vip_index, addresses, n); + int ret = lb_vip_del_ass_withlock(vip_index, addresses, n, flush); lb_put_writer_lock(); + return ret; } +static int +lb_vip_prefix_index_alloc (lb_main_t *lbm) +{ + /* + * Check for dynamically allocaetd instance number. + */ + u32 bit; + + bit = clib_bitmap_first_clear (lbm->vip_prefix_indexes); + + lbm->vip_prefix_indexes = clib_bitmap_set(lbm->vip_prefix_indexes, bit, 1); + + return bit; +} + +static int +lb_vip_prefix_index_free (lb_main_t *lbm, u32 instance) +{ + + if (clib_bitmap_get (lbm->vip_prefix_indexes, instance) == 0) + { + return -1; + } + + lbm->vip_prefix_indexes = clib_bitmap_set (lbm->vip_prefix_indexes, + instance, 0); + + return 0; +} + /** * Add the VIP adjacency to the ip4 or ip6 fib */ -static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) +static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip, + u32 *vip_prefix_index) { dpo_proto_t proto = 0; dpo_type_t dpo_type = 0; + u32 vip_idx = 0; + + if (vip->port != 0) + { + /* for per-port vip, if VIP adjacency has been added, + * no need to add adjacency. */ + if (!lb_vip_port_find_diff_port(&(vip->prefix), vip->plen, + vip->protocol, vip->port, &vip_idx)) + { + return; + } + + /* Allocate an index for per-port vip */ + *vip_prefix_index = lb_vip_prefix_index_alloc(lbm); + } + else + { + *vip_prefix_index = vip - lbm->vips; + } dpo_id_t dpo = DPO_INVALID; fib_prefix_t pfx = {}; - if (lb_vip_is_ip4(vip)) { + if (lb_vip_is_ip4(vip->type)) { pfx.fp_addr.ip4 = vip->prefix.ip4; pfx.fp_len = vip->plen - 96; pfx.fp_proto = FIB_PROTOCOL_IP4; @@ -625,29 +925,102 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) proto = DPO_PROTO_IP6; } - if(lb_vip_is_gre4(vip)) + if (lb_vip_is_gre4(vip)) dpo_type = lbm->dpo_gre4_type; else if (lb_vip_is_gre6(vip)) dpo_type = lbm->dpo_gre6_type; + else if (lb_vip_is_gre4_port(vip)) + dpo_type = lbm->dpo_gre4_port_type; + else if (lb_vip_is_gre6_port(vip)) + dpo_type = lbm->dpo_gre6_port_type; else if (lb_vip_is_l3dsr(vip)) dpo_type = lbm->dpo_l3dsr_type; - - dpo_set(&dpo, dpo_type, proto, vip - lbm->vips); + else if (lb_vip_is_l3dsr_port(vip)) + dpo_type = lbm->dpo_l3dsr_port_type; + else if(lb_vip_is_nat4_port(vip)) + dpo_type = lbm->dpo_nat4_port_type; + else if (lb_vip_is_nat6_port(vip)) + dpo_type = lbm->dpo_nat6_port_type; + + dpo_set(&dpo, dpo_type, proto, *vip_prefix_index); fib_table_entry_special_dpo_add(0, - &pfx, - FIB_SOURCE_PLUGIN_HI, - FIB_ENTRY_FLAG_EXCLUSIVE, - &dpo); + &pfx, + FIB_SOURCE_PLUGIN_HI, + FIB_ENTRY_FLAG_EXCLUSIVE, + &dpo); dpo_reset(&dpo); } +/** + * Add the VIP filter entry + */ +static int lb_vip_add_port_filter(lb_main_t *lbm, lb_vip_t *vip, + u32 vip_prefix_index, u32 vip_idx) +{ + vip_port_key_t key; + clib_bihash_kv_8_8_t kv; + + key.vip_prefix_index = vip_prefix_index; + key.protocol = vip->protocol; + key.port = clib_host_to_net_u16(vip->port); + key.rsv = 0; + + kv.key = key.as_u64; + kv.value = vip_idx; + clib_bihash_add_del_8_8(&lbm->vip_index_per_port, &kv, 1); + + return 0; +} + +/** + * Del the VIP filter entry + */ +static int lb_vip_del_port_filter(lb_main_t *lbm, lb_vip_t *vip) +{ + vip_port_key_t key; + clib_bihash_kv_8_8_t kv, value; + lb_vip_t *m = 0; + + key.vip_prefix_index = vip->vip_prefix_index; + key.protocol = vip->protocol; + key.port = clib_host_to_net_u16(vip->port); + + kv.key = key.as_u64; + if(clib_bihash_search_8_8(&lbm->vip_index_per_port, &kv, &value) == 0) + m = pool_elt_at_index (lbm->vips, value.value); + ASSERT (m); + + kv.value = m - lbm->vips; + clib_bihash_add_del_8_8(&lbm->vip_index_per_port, &kv, 0); + + return 0; +} + /** * Deletes the adjacency associated with the VIP */ static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip) { fib_prefix_t pfx = {}; - if (lb_vip_is_ip4(vip)) { + u32 vip_idx = 0; + + if (vip->port != 0) + { + /* If this vip adjacency is used by other per-port vip, + * no need to del this adjacency. */ + if (!lb_vip_port_find_diff_port(&(vip->prefix), vip->plen, + vip->protocol, vip->port, &vip_idx)) + { + lb_put_writer_lock(); + return; + } + + /* Return vip_prefix_index for per-port vip */ + lb_vip_prefix_index_free(lbm, vip->vip_prefix_index); + + } + + if (lb_vip_is_ip4(vip->type)) { pfx.fp_addr.ip4 = vip->prefix.ip4; pfx.fp_len = vip->plen - 96; pfx.fp_proto = FIB_PROTOCOL_IP4; @@ -659,38 +1032,75 @@ static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip) fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI); } -int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, - u32 new_length, u32 *vip_index) +int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) { lb_main_t *lbm = &lb_main; + vlib_main_t *vm = vlib_get_main(); lb_vip_t *vip; + lb_vip_type_t type = args.type; + u32 vip_prefix_index = 0; lb_get_writer_lock(); - ip46_prefix_normalize(prefix, plen); + ip46_prefix_normalize(&(args.prefix), args.plen); - if (!lb_vip_find_index_with_lock(prefix, plen, vip_index)) { - lb_put_writer_lock(); - return VNET_API_ERROR_VALUE_EXIST; - } + if (!lb_vip_port_find_index_with_lock(&(args.prefix), args.plen, + args.protocol, args.port, + vip_index)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } - if (!is_pow2(new_length)) { + /* Make sure we can't add a per-port VIP entry + * when there already is an all-port VIP for the same prefix. */ + if ((args.port != 0) && + !lb_vip_port_find_all_port_vip(&(args.prefix), args.plen, vip_index)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + /* Make sure we can't add a all-port VIP entry + * when there already is an per-port VIP for the same prefix. */ + if ((args.port == 0) && + !lb_vip_port_find_diff_port(&(args.prefix), args.plen, + args.protocol, args.port, vip_index)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + /* Make sure all VIP for a given prefix (using different ports) have the same type. */ + if ((args.port != 0) && + !lb_vip_port_find_diff_port(&(args.prefix), args.plen, + args.protocol, args.port, vip_index) + && (args.type != lbm->vips[*vip_index].type)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_INVALID_ARGUMENT; + } + + if (!is_pow2(args.new_length)) { lb_put_writer_lock(); return VNET_API_ERROR_INVALID_MEMORY_SIZE; } - if (ip46_prefix_is_ip4(prefix, plen) && - (type != LB_VIP_TYPE_IP4_GRE4) && - (type != LB_VIP_TYPE_IP4_GRE6) && - (type != LB_VIP_TYPE_IP4_L3DSR)) + if (ip46_prefix_is_ip4(&(args.prefix), args.plen) && + !lb_vip_is_ip4(type)) { + lb_put_writer_lock(); return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + } - if ((!ip46_prefix_is_ip4(prefix, plen)) && - (type != LB_VIP_TYPE_IP6_GRE4) && - (type != LB_VIP_TYPE_IP6_GRE6)) + if ((!ip46_prefix_is_ip4(&(args.prefix), args.plen)) && + !lb_vip_is_ip6(type)) { + lb_put_writer_lock(); return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + } - if ((type == LB_VIP_TYPE_IP4_L3DSR) && (dscp >= 64 ) ) + if ((type == LB_VIP_TYPE_IP4_L3DSR) && + (args.encap_args.dscp >= 64) ) { + lb_put_writer_lock(); return VNET_API_ERROR_VALUE_EXIST; } @@ -698,11 +1108,31 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, pool_get(lbm->vips, vip); //Init - vip->prefix = *prefix; - vip->plen = plen; + memcpy (&(vip->prefix), &(args.prefix), sizeof(args.prefix)); + vip->plen = args.plen; + if (args.port != 0) + { + vip->protocol = args.protocol; + vip->port = args.port; + } + else + { + vip->protocol = (u8)~0; + vip->port = 0; + } vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main()); - vip->type = type; - vip->dscp = dscp; + vip->type = args.type; + + if (args.type == LB_VIP_TYPE_IP4_L3DSR) { + vip->encap_args.dscp = args.encap_args.dscp; + } + else if ((args.type == LB_VIP_TYPE_IP4_NAT4) + ||(args.type == LB_VIP_TYPE_IP6_NAT6)) { + vip->encap_args.srv_type = args.encap_args.srv_type; + vip->encap_args.target_port = + clib_host_to_net_u16(args.encap_args.target_port); + } + vip->flags = LB_VIP_FLAGS_USED; vip->as_indexes = 0; @@ -714,17 +1144,43 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, } //Configure new flow table - vip->new_flow_table_mask = new_length - 1; + vip->new_flow_table_mask = args.new_length - 1; vip->new_flow_table = 0; - //Create a new flow hash table full of the default entry + //Update flow hash table lb_vip_update_new_flow_table(vip); //Create adjacency to direct traffic - lb_vip_add_adjacency(lbm, vip); + lb_vip_add_adjacency(lbm, vip, &vip_prefix_index); + + if ( (lb_vip_is_nat4_port(vip) || lb_vip_is_nat6_port(vip)) + && (args.encap_args.srv_type == LB_SRV_TYPE_NODEPORT) ) + { + u32 key; + uword * entry; + + //Create maping from nodeport to vip_index + key = clib_host_to_net_u16(args.port); + entry = hash_get_mem (lbm->vip_index_by_nodeport, &key); + if (entry) { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + hash_set_mem (lbm->vip_index_by_nodeport, &key, vip - lbm->vips); + + /* receive packets destined to NodeIP:NodePort */ + udp_register_dst_port (vm, args.port, lb4_nodeport_node.index, 1); + udp_register_dst_port (vm, args.port, lb6_nodeport_node.index, 0); + } - //Return result *vip_index = vip - lbm->vips; + //Create per-port vip filtering table + if (args.port != 0) + { + lb_vip_add_port_filter(lbm, vip, vip_prefix_index, *vip_index); + vip->vip_prefix_index = vip_prefix_index; + } lb_put_writer_lock(); return 0; @@ -734,6 +1190,11 @@ int lb_vip_del(u32 vip_index) { lb_main_t *lbm = &lb_main; lb_vip_t *vip; + + /* Does not remove default vip, i.e. vip_index = 0 */ + if (vip_index == 0) + return 0; + lb_get_writer_lock(); if (!(vip = lb_vip_get_by_index(vip_index))) { lb_put_writer_lock(); @@ -748,18 +1209,25 @@ int lb_vip_del(u32 vip_index) ip46_address_t *ass = 0; lb_as_t *as; u32 *as_index; + pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; vec_add1(ass, as->address); }); if (vec_len(ass)) - lb_vip_del_ass_withlock(vip_index, ass, vec_len(ass)); + lb_vip_del_ass_withlock(vip_index, ass, vec_len(ass), 0); vec_free(ass); } //Delete adjacency lb_vip_del_adjacency(lbm, vip); + //Delete per-port vip filtering entry + if (vip->port != 0) + { + lb_vip_del_port_filter(lbm, vip); + } + //Set the VIP as unused vip->flags &= ~LB_VIP_FLAGS_USED; @@ -813,33 +1281,79 @@ lb_as_stack (lb_as_t *as) lb_vip_t *vip = &lbm->vips[as->vip_index]; dpo_type_t dpo_type = 0; - if(lb_vip_is_gre4(vip)) + if (lb_vip_is_gre4(vip)) dpo_type = lbm->dpo_gre4_type; else if (lb_vip_is_gre6(vip)) dpo_type = lbm->dpo_gre6_type; + else if (lb_vip_is_gre4_port(vip)) + dpo_type = lbm->dpo_gre4_port_type; + else if (lb_vip_is_gre6_port(vip)) + dpo_type = lbm->dpo_gre6_port_type; else if (lb_vip_is_l3dsr(vip)) dpo_type = lbm->dpo_l3dsr_type; + else if (lb_vip_is_l3dsr_port(vip)) + dpo_type = lbm->dpo_l3dsr_port_type; + else if(lb_vip_is_nat4_port(vip)) + dpo_type = lbm->dpo_nat4_port_type; + else if (lb_vip_is_nat6_port(vip)) + dpo_type = lbm->dpo_nat6_port_type; dpo_stack(dpo_type, - lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, - &as->dpo, - fib_entry_contribute_ip_forwarding( - as->next_hop_fib_entry_index)); + lb_vip_is_ip4(vip->type)?DPO_PROTO_IP4:DPO_PROTO_IP6, + &as->dpo, + fib_entry_contribute_ip_forwarding( + as->next_hop_fib_entry_index)); } static fib_node_back_walk_rc_t lb_fib_node_back_walk_notify (fib_node_t *node, - fib_node_back_walk_ctx_t *ctx) + fib_node_back_walk_ctx_t *ctx) { lb_as_stack(lb_as_from_fib_node(node)); return (FIB_NODE_BACK_WALK_CONTINUE); } +int lb_nat4_interface_add_del (u32 sw_if_index, int is_del) +{ + if (is_del) + { + vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out", + sw_if_index, 0, 0, 0); + } + else + { + vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out", + sw_if_index, 1, 0, 0); + } + + return 0; +} + +int lb_nat6_interface_add_del (u32 sw_if_index, int is_del) +{ + if (is_del) + { + vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out", + sw_if_index, 0, 0, 0); + } + else + { + vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out", + sw_if_index, 1, 0, 0); + } + + return 0; +} + clib_error_t * lb_init (vlib_main_t * vm) { vlib_thread_main_t *tm = vlib_get_thread_main (); lb_main_t *lbm = &lb_main; + lbm->vnet_main = vnet_get_main (); + lbm->vlib_main = vm; + + lb_vip_t *default_vip; lb_as_t *default_as; fib_node_vft_t lb_fib_node_vft = { .fnv_get = lb_fib_node_get_node, @@ -852,7 +1366,15 @@ lb_init (vlib_main_t * vm) .dv_format = format_lb_dpo, }; + //Allocate and init default VIP. lbm->vips = 0; + pool_get(lbm->vips, default_vip); + default_vip->prefix.ip6.as_u64[0] = 0xffffffffffffffffL; + default_vip->prefix.ip6.as_u64[1] = 0xffffffffffffffffL; + default_vip->protocol = ~0; + default_vip->port = 0; + default_vip->flags = LB_VIP_FLAGS_USED; + lbm->per_cpu = 0; vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1); lbm->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES); @@ -864,7 +1386,18 @@ lb_init (vlib_main_t * vm) lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL; lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes); lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes); - lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, lb_dpo_l3dsr_nodes); + lbm->dpo_gre4_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_gre4_port_nodes); + lbm->dpo_gre6_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_gre6_port_nodes); + lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, + lb_dpo_l3dsr_nodes); + lbm->dpo_l3dsr_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_l3dsr_port_nodes); + lbm->dpo_nat4_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_nat4_port_nodes); + lbm->dpo_nat6_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_nat6_port_nodes); lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); //Init AS reference counters @@ -879,6 +1412,21 @@ lb_init (vlib_main_t * vm) default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL; default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL; + lbm->vip_index_by_nodeport + = hash_create_mem (0, sizeof(u16), sizeof (uword)); + + clib_bihash_init_8_8 (&lbm->vip_index_per_port, + "vip_index_per_port", LB_VIP_PER_PORT_BUCKETS, + LB_VIP_PER_PORT_MEMORY_SIZE); + + clib_bihash_init_8_8 (&lbm->mapping_by_as4, + "mapping_by_as4", LB_MAPPING_BUCKETS, + LB_MAPPING_MEMORY_SIZE); + + clib_bihash_init_24_8 (&lbm->mapping_by_as6, + "mapping_by_as6", LB_MAPPING_BUCKETS, + LB_MAPPING_MEMORY_SIZE); + #define _(a,b,c) lbm->vip_counters[c].name = b; lb_foreach_vip_counter #undef _