From 5e52417a2aa3b2063a811c6a9f293a79d73bcb43 Mon Sep 17 00:00:00 2001
From: Zhiyong Yang
Date: Wed, 8 Jul 2020 20:28:36 +0000
Subject: [PATCH] ip: enhance vtep4_check of tunnel by vector way

This patch aims to improve decap performance by reducing the number
of expensive hash_get calls, using AVX512 on XEON, e.g. for vxlan,
vxlan_gpe, geneve and gtpu.

In the existing code, the expensive hash computation is avoided only
when the vtep4 key of the current packet matches the single cached
vtep4_key_t; in that case the check returns directly.

This patch greatly improves the multi-flow tunnel decap case by
leveraging a 512-bit vector register on XEON that holds 8 vtep4_keys.
It raises the probability of skipping the hash computation: it is
enough for the key of the current packet to hit any one of the 8
entries in the 512-bit cache. The oldest element in vtep4_cache_t is
replaced in round-robin order. vlib_get_buffers is leveraged as well.

Type: improvement

Signed-off-by: Zhiyong Yang
Signed-off-by: Ray Kinsella
Signed-off-by: Junfeng Wang
Change-Id: I313103202bd76f2dd638cd942554721b37ddad60
---
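Note (not part of the applied diff): the core of the change is an
8-entry key cache probed with a single AVX512 compare, with
round-robin eviction on miss. The standalone sketch below illustrates
just that idea outside of VPP; demo_cache_t and demo_cache_lookup are
hypothetical names, the fallback path is a plain scalar scan, and it
assumes gcc/clang with -mavx512f. It is a minimal sketch, not VPP code.

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #ifdef __AVX512F__
  #include <immintrin.h>
  #endif

  typedef struct
  {
    uint64_t keys[8];		/* cached keys (IPv4 addr + fib index) */
    int idx;			/* next slot to overwrite, round-robin */
  } demo_cache_t;

  /* Return 1 on cache hit; on miss, insert the key round-robin.
     In VPP the miss path would additionally consult the hash table. */
  static int
  demo_cache_lookup (demo_cache_t * c, uint64_t key)
  {
  #ifdef __AVX512F__
    /* One compare checks the key against all 8 entries at once;
       each set bit in the result mask marks a matching 64-bit lane. */
    __m512i k = _mm512_set1_epi64 ((long long) key);
    __m512i cache = _mm512_loadu_si512 ((void *) c->keys);
    if (_mm512_cmpeq_epu64_mask (cache, k) != 0)
      return 1;
  #else
    for (int i = 0; i < 8; i++)
      if (c->keys[i] == key)
	return 1;
  #endif
    c->keys[c->idx] = key;	/* evict the oldest entry */
    c->idx = (c->idx + 1) & 0x7;
    return 0;
  }

  int
  main (void)
  {
    demo_cache_t c;
    memset (&c, 0, sizeof (c));
    printf ("%d\n", demo_cache_lookup (&c, 0x1234)); /* miss -> 0 */
    printf ("%d\n", demo_cache_lookup (&c, 0x1234)); /* hit  -> 1 */
    return 0;
  }
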
 src/plugins/gtpu/gtpu_decap.c | 42 +++++++++++++++++++++++++++++------------
 src/vnet/geneve/decap.c       | 43 ++++++++++++++++++++++++++++++-----------
 src/vnet/ip/vtep.h            | 42 ++++++++++++++++++++++++++++++++++++++++++
 src/vnet/vxlan-gpe/decap.c    | 44 +++++++++++++++++++++++++++++++-----------
 src/vnet/vxlan/decap.c        | 44 +++++++++++++++++++++++++++++++-----------
 src/vppinfra/vector_avx512.h  |  6 ++++++
 6 files changed, 173 insertions(+), 48 deletions(-)

diff --git a/src/plugins/gtpu/gtpu_decap.c b/src/plugins/gtpu/gtpu_decap.c
index 05c21381d55..7a88aae63a6 100644
--- a/src/plugins/gtpu/gtpu_decap.c
+++ b/src/plugins/gtpu/gtpu_decap.c
@@ -804,10 +804,16 @@ ip_gtpu_bypass_inline (vlib_main_t * vm,
 				   matching a local VTEP address */
   vtep6_key_t last_vtep6;	/* last IPv6 address / fib index
 				   matching a local VTEP address */
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+#ifdef CLIB_HAVE_VEC512
+  vtep4_cache_t vtep4_u512;
+  clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
+  vlib_get_buffers (vm, from, bufs, n_left_from);
 
   if (node->flags & VLIB_NODE_FLAG_TRACE)
     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -835,16 +841,11 @@ ip_gtpu_bypass_inline (vlib_main_t * vm,
 
 	  /* Prefetch next iteration. */
 	  {
-	    vlib_buffer_t * p2, * p3;
-
-	    p2 = vlib_get_buffer (vm, from[2]);
-	    p3 = vlib_get_buffer (vm, from[3]);
-
-	    vlib_prefetch_buffer_header (p2, LOAD);
-	    vlib_prefetch_buffer_header (p3, LOAD);
+	    vlib_prefetch_buffer_header (b[2], LOAD);
+	    vlib_prefetch_buffer_header (b[3], LOAD);
 
-	    CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
-	    CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[2]->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[3]->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
 	  }
 
 	  bi0 = to_next[0] = from[0];
@@ -854,8 +855,9 @@ ip_gtpu_bypass_inline (vlib_main_t * vm,
 	  to_next += 2;
 	  n_left_to_next -= 2;
 
-	  b0 = vlib_get_buffer (vm, bi0);
-	  b1 = vlib_get_buffer (vm, bi1);
+	  b0 = b[0];
+	  b1 = b[1];
+	  b += 2;
 	  if (is_ip4)
 	    {
 	      ip40 = vlib_buffer_get_current (b0);
@@ -899,7 +901,12 @@ ip_gtpu_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs*/
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&gtm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&gtm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit0;	/* no local VTEP for GTPU packet */
 	    }
 	  else
@@ -973,7 +980,12 @@ ip_gtpu_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs*/
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&gtm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&gtm->vtep_table, b1, ip41, &last_vtep4))
+#endif
 		goto exit1;	/* no local VTEP for GTPU packet */
 	    }
 	  else
@@ -1053,7 +1065,8 @@ ip_gtpu_bypass_inline (vlib_main_t * vm,
 	  to_next += 1;
 	  n_left_to_next -= 1;
 
-	  b0 = vlib_get_buffer (vm, bi0);
+	  b0 = b[0];
+	  b++;
 	  if (is_ip4)
 	    ip40 = vlib_buffer_get_current (b0);
 	  else
@@ -1083,7 +1096,12 @@ ip_gtpu_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs*/
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&gtm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&gtm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit;	/* no local VTEP for GTPU packet */
 	    }
 	  else
diff --git a/src/vnet/geneve/decap.c b/src/vnet/geneve/decap.c
index 10a17cef35d..b570e3512c4 100644
--- a/src/vnet/geneve/decap.c
+++ b/src/vnet/geneve/decap.c
@@ -869,11 +869,18 @@ ip_geneve_bypass_inline (vlib_main_t * vm,
 				   matching a local VTEP address */
   vtep6_key_t last_vtep6;	/* last IPv6 address / fib index
 				   matching a local VTEP address */
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+#ifdef CLIB_HAVE_VEC512
+  vtep4_cache_t vtep4_u512;
+  clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
 
+  vlib_get_buffers (vm, from, bufs, n_left_from);
+
   if (node->flags & VLIB_NODE_FLAG_TRACE)
     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -900,16 +907,11 @@ ip_geneve_bypass_inline (vlib_main_t * vm,
 
 	  /* Prefetch next iteration. */
 	  {
-	    vlib_buffer_t *p2, *p3;
-
-	    p2 = vlib_get_buffer (vm, from[2]);
-	    p3 = vlib_get_buffer (vm, from[3]);
-
-	    vlib_prefetch_buffer_header (p2, LOAD);
-	    vlib_prefetch_buffer_header (p3, LOAD);
+	    vlib_prefetch_buffer_header (b[2], LOAD);
+	    vlib_prefetch_buffer_header (b[3], LOAD);
 
-	    CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-	    CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
 	  }
 
 	  bi0 = to_next[0] = from[0];
@@ -919,8 +921,9 @@ ip_geneve_bypass_inline (vlib_main_t * vm,
 	  to_next += 2;
 	  n_left_to_next -= 2;
 
-	  b0 = vlib_get_buffer (vm, bi0);
-	  b1 = vlib_get_buffer (vm, bi1);
+	  b0 = b[0];
+	  b1 = b[1];
+	  b += 2;
 	  if (is_ip4)
 	    {
 	      ip40 = vlib_buffer_get_current (b0);
@@ -964,7 +967,12 @@ ip_geneve_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit0;	/* no local VTEP for GENEVE packet */
 	    }
 	  else
@@ -1042,7 +1050,12 @@ ip_geneve_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b1, ip41, &last_vtep4))
+#endif
 		goto exit1;	/* no local VTEP for GENEVE packet */
 	    }
 	  else
@@ -1126,7 +1139,8 @@ ip_geneve_bypass_inline (vlib_main_t * vm,
 	  to_next += 1;
 	  n_left_to_next -= 1;
 
-	  b0 = vlib_get_buffer (vm, bi0);
+	  b0 = b[0];
+	  b++;
 	  if (is_ip4)
 	    ip40 = vlib_buffer_get_current (b0);
 	  else
@@ -1156,7 +1170,12 @@ ip_geneve_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit;	/* no local VTEP for GENEVE packet */
 	    }
 	  else
diff --git a/src/vnet/ip/vtep.h b/src/vnet/ip/vtep.h
index 703ace18dba..345b6db1f9b 100644
--- a/src/vnet/ip/vtep.h
+++ b/src/vnet/ip/vtep.h
@@ -112,6 +112,48 @@ vtep4_check (vtep_table_t * t, vlib_buffer_t * b0, ip4_header_t * ip40,
   return VTEP_CHECK_PASS;
 }
 
+typedef struct
+{
+  vtep4_key_t vtep4_cache[8];
+  int idx;
+} vtep4_cache_t;
+
+always_inline u8
+vtep4_check_vector (vtep_table_t * t, vlib_buffer_t * b0, ip4_header_t * ip40,
+		    vtep4_key_t * last_k4, vtep4_cache_t * vtep4_u512)
+{
+  vtep4_key_t k4;
+  k4.addr.as_u32 = ip40->dst_address.as_u32;
+  k4.fib_index = vlib_buffer_get_ip4_fib_index (b0);
+
+  if (PREDICT_TRUE (k4.as_u64 == last_k4->as_u64))
+    return VTEP_CHECK_PASS_UNCHANGED;
+
+#ifdef CLIB_HAVE_VEC512
+  u64x8 k4_u64x8 = u64x8_splat (k4.as_u64);
+  u64x8 cache = u64x8_load_unaligned (vtep4_u512->vtep4_cache);
+  u8 result = u64x8_mask_is_equal (cache, k4_u64x8);
+  if (PREDICT_TRUE (result != 0))
+    {
+      last_k4->as_u64 =
+	vtep4_u512->vtep4_cache[count_trailing_zeros (result)].as_u64;
+      return VTEP_CHECK_PASS_UNCHANGED;
+    }
+#endif
+
+  if (PREDICT_FALSE (!hash_get (t->vtep4, k4.as_u64)))
+    return VTEP_CHECK_FAIL;
+
+  last_k4->as_u64 = k4.as_u64;
+
+#ifdef CLIB_HAVE_VEC512
+  vtep4_u512->vtep4_cache[vtep4_u512->idx].as_u64 = k4.as_u64;
+  vtep4_u512->idx = (vtep4_u512->idx + 1) & 0x7;
+#endif
+
+  return VTEP_CHECK_PASS;
+}
+
 always_inline u8
 vtep6_check (vtep_table_t * t, vlib_buffer_t * b0, ip6_header_t * ip60,
 	     vtep6_key_t * last_k6)
diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c
index f2961d5ff5b..77b5328c8f6 100644
--- a/src/vnet/vxlan-gpe/decap.c
+++ b/src/vnet/vxlan-gpe/decap.c
@@ -792,11 +792,18 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm,
 				   matching a local VTEP address */
   vtep6_key_t last_vtep6;	/* last IPv6 address / fib index
 				   matching a local VTEP address */
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+#ifdef CLIB_HAVE_VEC512
+  vtep4_cache_t vtep4_u512;
+  clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
 
+  vlib_get_buffers (vm, from, bufs, n_left_from);
+
   if (node->flags & VLIB_NODE_FLAG_TRACE)
     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -823,16 +830,11 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm,
 
 	  /* Prefetch next iteration. */
 	  {
-	    vlib_buffer_t *p2, *p3;
-
-	    p2 = vlib_get_buffer (vm, from[2]);
-	    p3 = vlib_get_buffer (vm, from[3]);
-
-	    vlib_prefetch_buffer_header (p2, LOAD);
-	    vlib_prefetch_buffer_header (p3, LOAD);
+	    vlib_prefetch_buffer_header (b[2], LOAD);
+	    vlib_prefetch_buffer_header (b[3], LOAD);
 
-	    CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-	    CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
 	  }
 
 	  bi0 = to_next[0] = from[0];
@@ -842,8 +844,9 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm,
 	  to_next += 2;
 	  n_left_to_next -= 2;
 
-	  b0 = vlib_get_buffer (vm, bi0);
-	  b1 = vlib_get_buffer (vm, bi1);
+	  b0 = b[0];
+	  b1 = b[1];
+	  b += 2;
 	  if (is_ip4)
 	    {
 	      ip40 = vlib_buffer_get_current (b0);
@@ -885,7 +888,12 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&ngm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&ngm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit0;	/* no local VTEP for VXLAN packet */
 	    }
 	  else
@@ -963,7 +971,12 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&ngm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&ngm->vtep_table, b1, ip41, &last_vtep4))
+#endif
 		goto exit1;	/* no local VTEP for VXLAN packet */
 	    }
 	  else
@@ -1047,7 +1060,8 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm,
 	  to_next += 1;
 	  n_left_to_next -= 1;
 
-	  b0 = vlib_get_buffer (vm, bi0);
+	  b0 = b[0];
+	  b++;
 	  if (is_ip4)
 	    ip40 = vlib_buffer_get_current (b0);
 	  else
@@ -1073,9 +1087,15 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm,
 	    goto exit;		/* not VXLAN packet */
 
 	  /* Validate DIP against VTEPs */
+
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&ngm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&ngm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit;	/* no local VTEP for VXLAN packet */
 	    }
 	  else
diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c
index d2ba62d12ed..e41c6a95554 100644
--- a/src/vnet/vxlan/decap.c
+++ b/src/vnet/vxlan/decap.c
@@ -458,11 +458,19 @@ ip_vxlan_bypass_inline (vlib_main_t * vm,
 				   matching a local VTEP address */
   vtep6_key_t last_vtep6;	/* last IPv6 address / fib index
 				   matching a local VTEP address */
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+
+#ifdef CLIB_HAVE_VEC512
+  vtep4_cache_t vtep4_u512;
+  clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
 
+  vlib_get_buffers (vm, from, bufs, n_left_from);
+
   if (node->flags & VLIB_NODE_FLAG_TRACE)
     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -489,16 +497,11 @@ ip_vxlan_bypass_inline (vlib_main_t * vm,
 
 	  /* Prefetch next iteration. */
 	  {
-	    vlib_buffer_t *p2, *p3;
+	    vlib_prefetch_buffer_header (b[2], LOAD);
+	    vlib_prefetch_buffer_header (b[3], LOAD);
 
-	    p2 = vlib_get_buffer (vm, from[2]);
-	    p3 = vlib_get_buffer (vm, from[3]);
-
-	    vlib_prefetch_buffer_header (p2, LOAD);
-	    vlib_prefetch_buffer_header (p3, LOAD);
-
-	    CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-	    CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
 	  }
 
 	  bi0 = to_next[0] = from[0];
@@ -508,8 +511,9 @@ ip_vxlan_bypass_inline (vlib_main_t * vm,
 	  to_next += 2;
 	  n_left_to_next -= 2;
 
-	  b0 = vlib_get_buffer (vm, bi0);
-	  b1 = vlib_get_buffer (vm, bi1);
+	  b0 = b[0];
+	  b1 = b[1];
+	  b += 2;
 	  if (is_ip4)
 	    {
 	      ip40 = vlib_buffer_get_current (b0);
@@ -553,7 +557,12 @@ ip_vxlan_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit0;	/* no local VTEP for VXLAN packet */
 	    }
 	  else
@@ -627,7 +636,12 @@ ip_vxlan_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b1, ip41, &last_vtep4))
+#endif
 		goto exit1;	/* no local VTEP for VXLAN packet */
 	    }
 	  else
@@ -707,7 +721,8 @@ ip_vxlan_bypass_inline (vlib_main_t * vm,
 	  to_next += 1;
 	  n_left_to_next -= 1;
 
-	  b0 = vlib_get_buffer (vm, bi0);
+	  b0 = b[0];
+	  b++;
 	  if (is_ip4)
 	    ip40 = vlib_buffer_get_current (b0);
 	  else
@@ -737,7 +752,12 @@ ip_vxlan_bypass_inline (vlib_main_t * vm,
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit;	/* no local VTEP for VXLAN packet */
 	    }
 	  else
diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h
index 6eb7c5eaa4d..a51644be1db 100644
--- a/src/vppinfra/vector_avx512.h
+++ b/src/vppinfra/vector_avx512.h
@@ -246,6 +246,12 @@ u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
   return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
 }
 
+static_always_inline u8
+u64x8_mask_is_equal (u64x8 a, u64x8 b)
+{
+  return _mm512_cmpeq_epu64_mask ((__m512i) a, (__m512i) b);
+}
+
 static_always_inline void
 u32x16_transpose (u32x16 m[16])
 {
-- 
2.16.6