From 2e2372117d35191a0e6c096c5f989930de6e12b1 Mon Sep 17 00:00:00 2001 From: Lijian Zhang Date: Mon, 10 Sep 2018 17:13:56 +0800 Subject: [PATCH] Support dynamic dual/quad loop selection on aarch64 Currently, there are three variants available on aarch64, qdf24xx, thunderx2t99, and cortex-a72. -DCLIB_N_PREFETCHES is passed to source code to select dual/quad implementation. Besides, different compiler options are applied on these critical functions. gcc-7.3.0 reports ICE(internal compiler error) with -mtune=thunderx2t99, so -mtune=thunderx2t99 is enabled only when gcc version is greater than 7.3.0 Cavium ThunderX2, Impermenter 0x43, Part 0x0af -march=armv8-a+crc+crypto -mtune=thunderx2t99 Qualcomm Centriq 2400, Impermenter 0x51, Part 0xc00 -march=armv8.1-a+crc+crypto -mtune=qdf24xx Cortex-A72, Impermenter 0x41, Part 0xd08 -march=armv8-a+crc+crypto -mtune=cortex-a72 Change-Id: Id5649c6325c1e642d0fd42535e3908793b13e02a Signed-off-by: Lijian Zhang Reviewed-by: Sirshak Das Reviewed-by: Honnappa Nagarahalli --- src/cmake/cpu.cmake | 16 ++++++ src/vnet/ip/ip4_forward.h | 140 ++++++++++++++++++++++++++++++++++++++++++++++ src/vppinfra/cache.h | 5 ++ src/vppinfra/cpu.h | 90 +++++++++++++++++++++++++++++ 4 files changed, 251 insertions(+) diff --git a/src/cmake/cpu.cmake b/src/cmake/cpu.cmake index bdc9014d944..20dab7bfce1 100644 --- a/src/cmake/cpu.cmake +++ b/src/cmake/cpu.cmake @@ -65,6 +65,22 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") set(CMAKE_C_FLAGS "-march=armv8-a+crc ${CMAKE_C_FLAGS}") + check_c_compiler_flag("-march=armv8-a+crc+crypto -mtune=qdf24xx" compiler_flag_march_core_qdf24xx) + if(compiler_flag_march_core_qdf24xx) + list(APPEND MARCH_VARIANTS "qdf24xx\;-march=armv8-a+crc+crypto -DCLIB_N_PREFETCHES=8") + endif() + check_c_compiler_flag("-march=armv8.1-a+crc+crypto -mtune=thunderx2t99" compiler_flag_march_thunderx2t99) + if(compiler_flag_march_thunderx2t99) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.3) + list(APPEND MARCH_VARIANTS "thunderx2t99\;-march=armv8.1-a+crc+crypto -mtune=thunderx2t99 -DCLIB_N_PREFETCHES=8") + else() + list(APPEND MARCH_VARIANTS "thunderx2t99\;-march=armv8.1-a+crc+crypto -DCLIB_N_PREFETCHES=8") + endif() + endif() + check_c_compiler_flag("-march=armv8-a+crc+crypto -mtune=cortex-a72" compiler_flag_march_cortexa72) + if(compiler_flag_march_cortexa72) + list(APPEND MARCH_VARIANTS "cortexa72\;-march=armv8-a+crc+crypto -mtune=cortex-a72 -DCLIB_N_PREFETCHES=6") + endif() endif() macro(vpp_library_set_multiarch_sources lib) diff --git a/src/vnet/ip/ip4_forward.h b/src/vnet/ip/ip4_forward.h index b1294fdf2c3..2219b577b2a 100644 --- a/src/vnet/ip/ip4_forward.h +++ b/src/vnet/ip/ip4_forward.h @@ -40,6 +40,7 @@ #ifndef __included_ip4_forward_h__ #define __included_ip4_forward_h__ +#include #include #include @@ -70,6 +71,7 @@ ip4_lookup_inline (vlib_main_t * vm, { vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); +#if (CLIB_N_PREFETCHES >= 8) while (n_left_from >= 8 && n_left_to_next >= 4) { vlib_buffer_t *p0, *p1, *p2, *p3; @@ -283,7 +285,145 @@ ip4_lookup_inline (vlib_main_t * vm, pi0, pi1, pi2, pi3, next0, next1, next2, next3); } +#elif (CLIB_N_PREFETCHES >= 4) + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *p0, *p1; + ip4_header_t *ip0, *ip1; + ip_lookup_next_t next0, next1; + const load_balance_t *lb0, *lb1; + ip4_fib_mtrie_t *mtrie0, *mtrie1; + ip4_fib_mtrie_leaf_t leaf0, leaf1; + ip4_address_t *dst_addr0, *dst_addr1; + u32 pi0, pi1, lb_index0, lb_index1; + flow_hash_config_t flow_hash_config0, flow_hash_config1; + u32 hash_c0, hash_c1; + const dpo_id_t *dpo0, *dpo1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD); + CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + dst_addr0 = &ip0->dst_address; + dst_addr1 = &ip1->dst_address; + + ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p0); + ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p1); + + if (!lookup_for_responses_to_locally_received_packets) + { + mtrie0 = &ip4_fib_get (vnet_buffer (p0)->ip.fib_index)->mtrie; + mtrie1 = &ip4_fib_get (vnet_buffer (p1)->ip.fib_index)->mtrie; + + leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0); + leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, dst_addr1); + } + + if (!lookup_for_responses_to_locally_received_packets) + { + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2); + } + + if (!lookup_for_responses_to_locally_received_packets) + { + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3); + } + if (lookup_for_responses_to_locally_received_packets) + { + lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX]; + lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX]; + } + else + { + lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1); + } + + ASSERT (lb_index0 && lb_index1); + lb0 = load_balance_get (lb_index0); + lb1 = load_balance_get (lb_index1); + + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); + ASSERT (lb1->lb_n_buckets > 0); + ASSERT (is_pow2 (lb1->lb_n_buckets)); + + /* Use flow hash to compute multipath adjacency. */ + hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0; + hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0; + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (p0)->ip.flow_hash = + ip4_compute_flow_hash (ip0, flow_hash_config0); + dpo0 = + load_balance_get_fwd_bucket (lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); + } + if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) + { + flow_hash_config1 = lb1->lb_hash_config; + hash_c1 = vnet_buffer (p1)->ip.flow_hash = + ip4_compute_flow_hash (ip1, flow_hash_config1); + dpo1 = + load_balance_get_fwd_bucket (lb1, + (hash_c1 & + (lb1->lb_n_buckets_minus_1))); + } + else + { + dpo1 = load_balance_get_bucket_i (lb1, 0); + } + + next0 = dpo0->dpoi_next_node; + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + next1 = dpo1->dpoi_next_node; + vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lb_index0, 1, + vlib_buffer_length_in_chain (vm, p0)); + vlib_increment_combined_counter + (cm, thread_index, lb_index1, 1, + vlib_buffer_length_in_chain (vm, p1)); + + vlib_validate_buffer_enqueue_x2 (vm, node, next, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } +#endif while (n_left_from > 0 && n_left_to_next > 0) { vlib_buffer_t *p0; diff --git a/src/vppinfra/cache.h b/src/vppinfra/cache.h index 7a54d34dfee..e8e89ba8556 100644 --- a/src/vppinfra/cache.h +++ b/src/vppinfra/cache.h @@ -59,6 +59,11 @@ #define CLIB_CACHE_LINE_BYTES (1 << CLIB_LOG2_CACHE_LINE_BYTES) #define CLIB_CACHE_LINE_ALIGN_MARK(mark) u8 mark[0] __attribute__((aligned(CLIB_CACHE_LINE_BYTES))) +/* Default cache line fill buffers. */ +#ifndef CLIB_N_PREFETCHES +#define CLIB_N_PREFETCHES 16 +#endif + /* Read/write arguments to __builtin_prefetch. */ #define CLIB_PREFETCH_READ 0 #define CLIB_PREFETCH_LOAD 0 /* alias for read */ diff --git a/src/vppinfra/cpu.h b/src/vppinfra/cpu.h index c636cf8639c..0ca9edb97fc 100644 --- a/src/vppinfra/cpu.h +++ b/src/vppinfra/cpu.h @@ -183,6 +183,96 @@ clib_cpu_march_priority_avx2 () return -1; } +static inline u32 +clib_cpu_implementer () +{ + char buf[128]; + static u32 implementer = -1; + + if (-1 != implementer) + return implementer; + + FILE *fp = fopen ("/proc/cpuinfo", "r"); + if (!fp) + return implementer; + + while (!feof (fp)) + { + if (!fgets (buf, sizeof (buf), fp)) + break; + buf[127] = '\0'; + if (strstr (buf, "CPU implementer")) + implementer = (u32) strtol (memchr (buf, ':', 128) + 2, NULL, 0); + if (-1 != implementer) + break; + } + fclose (fp); + + return implementer; +} + +static inline u32 +clib_cpu_part () +{ + char buf[128]; + static u32 part = -1; + + if (-1 != part) + return part; + + FILE *fp = fopen ("/proc/cpuinfo", "r"); + if (!fp) + return part; + + while (!feof (fp)) + { + if (!fgets (buf, sizeof (buf), fp)) + break; + buf[127] = '\0'; + if (strstr (buf, "CPU part")) + part = (u32) strtol (memchr (buf, ':', 128) + 2, NULL, 0); + if (-1 != part) + break; + } + fclose (fp); + + return part; +} + +#define AARCH64_CPU_IMPLEMENTER_THUNERDERX2 0x43 +#define AARCH64_CPU_PART_THUNERDERX2 0x0af +#define AARCH64_CPU_IMPLEMENTER_QDF24XX 0x51 +#define AARCH64_CPU_PART_QDF24XX 0xc00 +#define AARCH64_CPU_IMPLEMENTER_CORTEXA72 0x41 +#define AARCH64_CPU_PART_CORTEXA72 0xd08 + +static inline int +clib_cpu_march_priority_thunderx2t99 () +{ + if ((AARCH64_CPU_IMPLEMENTER_THUNERDERX2 == clib_cpu_implementer ()) && + (AARCH64_CPU_PART_THUNERDERX2 == clib_cpu_part ())) + return 20; + return -1; +} + +static inline int +clib_cpu_march_priority_qdf24xx () +{ + if ((AARCH64_CPU_IMPLEMENTER_QDF24XX == clib_cpu_implementer ()) && + (AARCH64_CPU_PART_QDF24XX == clib_cpu_part ())) + return 20; + return -1; +} + +static inline int +clib_cpu_march_priority_cortexa72 () +{ + if ((AARCH64_CPU_IMPLEMENTER_CORTEXA72 == clib_cpu_implementer ()) && + (AARCH64_CPU_PART_CORTEXA72 == clib_cpu_part ())) + return 10; + return -1; +} + #ifdef CLIB_MARCH_VARIANT #define CLIB_MARCH_FN_PRIORITY() CLIB_MARCH_SFX(clib_cpu_march_priority)() #else -- 2.16.6