From 2e2372117d35191a0e6c096c5f989930de6e12b1 Mon Sep 17 00:00:00 2001
From: Lijian Zhang <Lijian.Zhang@arm.com>
Date: Mon, 10 Sep 2018 17:13:56 +0800
Subject: [PATCH] Support dynamic dual/quad loop selection on aarch64

Currently, there are three variants available on aarch64: qdf24xx, thunderx2t99,
and cortex-a72. -DCLIB_N_PREFETCHES is passed to the source code to select the
dual/quad loop implementation. In addition, variant-specific compiler options
are applied to these critical functions.

gcc-7.3.0 reports an ICE (internal compiler error) with -mtune=thunderx2t99,
so -mtune=thunderx2t99 is enabled only when the gcc version is greater than 7.3.0.

Cavium ThunderX2, Implementer 0x43, Part 0x0af
    -march=armv8.1-a+crc+crypto -mtune=thunderx2t99
Qualcomm Centriq 2400, Implementer 0x51, Part 0xc00
    -march=armv8-a+crc+crypto -mtune=qdf24xx
Cortex-A72, Implementer 0x41, Part 0xd08
    -march=armv8-a+crc+crypto -mtune=cortex-a72

Change-Id: Id5649c6325c1e642d0fd42535e3908793b13e02a
Signed-off-by: Lijian Zhang <Lijian.Zhang@arm.com>
Reviewed-by: Sirshak Das <sirshak.das@arm.com>
Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
---
 src/cmake/cpu.cmake       |  16 ++++++
 src/vnet/ip/ip4_forward.h | 140 ++++++++++++++++++++++++++++++++++++++++++++++
 src/vppinfra/cache.h      |   5 ++
 src/vppinfra/cpu.h        |  90 +++++++++++++++++++++++++++++
 4 files changed, 251 insertions(+)

diff --git a/src/cmake/cpu.cmake b/src/cmake/cpu.cmake
index bdc9014d944..20dab7bfce1 100644
--- a/src/cmake/cpu.cmake
+++ b/src/cmake/cpu.cmake
@@ -65,6 +65,22 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
   set(CMAKE_C_FLAGS "-march=armv8-a+crc ${CMAKE_C_FLAGS}")
+  check_c_compiler_flag("-march=armv8-a+crc+crypto -mtune=qdf24xx" compiler_flag_march_core_qdf24xx)
+  if(compiler_flag_march_core_qdf24xx)  # Qualcomm Centriq 2400; pass the -mtune that was just probed
+    list(APPEND MARCH_VARIANTS "qdf24xx\;-march=armv8-a+crc+crypto -mtune=qdf24xx -DCLIB_N_PREFETCHES=8")
+  endif()
+  check_c_compiler_flag("-march=armv8.1-a+crc+crypto -mtune=thunderx2t99" compiler_flag_march_thunderx2t99)
+  if(compiler_flag_march_thunderx2t99)  # Cavium ThunderX2
+    if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.3)
+      list(APPEND MARCH_VARIANTS "thunderx2t99\;-march=armv8.1-a+crc+crypto -mtune=thunderx2t99 -DCLIB_N_PREFETCHES=8")
+    else()  # gcc-7.3.0 reports an ICE with -mtune=thunderx2t99, so the tune flag is left out
+      list(APPEND MARCH_VARIANTS "thunderx2t99\;-march=armv8.1-a+crc+crypto -DCLIB_N_PREFETCHES=8")
+    endif()
+  endif()
+  check_c_compiler_flag("-march=armv8-a+crc+crypto -mtune=cortex-a72" compiler_flag_march_cortexa72)
+  if(compiler_flag_march_cortexa72)  # Cortex-A72; CLIB_N_PREFETCHES=6 (>= 4, < 8) selects the dual loop in ip4_forward.h
+    list(APPEND MARCH_VARIANTS "cortexa72\;-march=armv8-a+crc+crypto -mtune=cortex-a72 -DCLIB_N_PREFETCHES=6")
+  endif()
 endif()
 
 macro(vpp_library_set_multiarch_sources lib)
diff --git a/src/vnet/ip/ip4_forward.h b/src/vnet/ip/ip4_forward.h
index b1294fdf2c3..2219b577b2a 100644
--- a/src/vnet/ip/ip4_forward.h
+++ b/src/vnet/ip/ip4_forward.h
@@ -40,6 +40,7 @@
 #ifndef __included_ip4_forward_h__
 #define __included_ip4_forward_h__
 
+#include <vppinfra/cache.h>
 #include <vnet/fib/ip4_fib.h>
 #include <vnet/dpo/load_balance_map.h>
 
@@ -70,6 +71,7 @@ ip4_lookup_inline (vlib_main_t * vm,
     {
       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
 
+#if (CLIB_N_PREFETCHES >= 8)
       while (n_left_from >= 8 && n_left_to_next >= 4)
 	{
 	  vlib_buffer_t *p0, *p1, *p2, *p3;
@@ -283,7 +285,145 @@ ip4_lookup_inline (vlib_main_t * vm,
 					   pi0, pi1, pi2, pi3,
 					   next0, next1, next2, next3);
 	}
+#elif (CLIB_N_PREFETCHES >= 4)
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+	{
+	  vlib_buffer_t *p0, *p1;
+	  ip4_header_t *ip0, *ip1;
+	  ip_lookup_next_t next0, next1;
+	  const load_balance_t *lb0, *lb1;
+	  ip4_fib_mtrie_t *mtrie0, *mtrie1;
+	  ip4_fib_mtrie_leaf_t leaf0, leaf1;
+	  ip4_address_t *dst_addr0, *dst_addr1;
+	  u32 pi0, pi1, lb_index0, lb_index1;
+	  flow_hash_config_t flow_hash_config0, flow_hash_config1;
+	  u32 hash_c0, hash_c1;
+	  const dpo_id_t *dpo0, *dpo1;
+
+	  /* Prefetch next iteration. */
+	  {
+	    vlib_buffer_t *p2, *p3;
+
+	    p2 = vlib_get_buffer (vm, from[2]);
+	    p3 = vlib_get_buffer (vm, from[3]);
+
+	    vlib_prefetch_buffer_header (p2, LOAD);
+	    vlib_prefetch_buffer_header (p3, LOAD);
+
+	    CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+	    CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
+	  }
+
+	  pi0 = to_next[0] = from[0];
+	  pi1 = to_next[1] = from[1];
+
+	  from += 2;
+	  to_next += 2;
+	  n_left_to_next -= 2;
+	  n_left_from -= 2;
+
+	  p0 = vlib_get_buffer (vm, pi0);
+	  p1 = vlib_get_buffer (vm, pi1);
+
+	  ip0 = vlib_buffer_get_current (p0);
+	  ip1 = vlib_buffer_get_current (p1);
+
+	  dst_addr0 = &ip0->dst_address;
+	  dst_addr1 = &ip1->dst_address;
+
+	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p0);
+	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p1);
+
+	  if (!lookup_for_responses_to_locally_received_packets)
+	    {
+	      mtrie0 = &ip4_fib_get (vnet_buffer (p0)->ip.fib_index)->mtrie;
+	      mtrie1 = &ip4_fib_get (vnet_buffer (p1)->ip.fib_index)->mtrie;
+
+	      leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
+	      leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, dst_addr1);
+	    }
+
+	  if (!lookup_for_responses_to_locally_received_packets)
+	    {
+	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
+	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
+	    }
+
+	  if (!lookup_for_responses_to_locally_received_packets)
+	    {
+	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
+	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
+	    }
 
+	  if (lookup_for_responses_to_locally_received_packets)
+	    {
+	      lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
+	      lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
+	    }
+	  else
+	    {
+	      lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+	      lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+	    }
+
+	  ASSERT (lb_index0 && lb_index1);
+	  lb0 = load_balance_get (lb_index0);
+	  lb1 = load_balance_get (lb_index1);
+
+	  ASSERT (lb0->lb_n_buckets > 0);
+	  ASSERT (is_pow2 (lb0->lb_n_buckets));
+	  ASSERT (lb1->lb_n_buckets > 0);
+	  ASSERT (is_pow2 (lb1->lb_n_buckets));
+
+	  /* Use flow hash to compute multipath adjacency. */
+	  hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
+	  hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
+	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
+	    {
+	      flow_hash_config0 = lb0->lb_hash_config;
+	      hash_c0 = vnet_buffer (p0)->ip.flow_hash =
+		ip4_compute_flow_hash (ip0, flow_hash_config0);
+	      dpo0 =
+		load_balance_get_fwd_bucket (lb0,
+					     (hash_c0 &
+					      (lb0->lb_n_buckets_minus_1)));
+	    }
+	  else
+	    {
+	      dpo0 = load_balance_get_bucket_i (lb0, 0);
+	    }
+	  if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
+	    {
+	      flow_hash_config1 = lb1->lb_hash_config;
+	      hash_c1 = vnet_buffer (p1)->ip.flow_hash =
+		ip4_compute_flow_hash (ip1, flow_hash_config1);
+	      dpo1 =
+		load_balance_get_fwd_bucket (lb1,
+					     (hash_c1 &
+					      (lb1->lb_n_buckets_minus_1)));
+	    }
+	  else
+	    {
+	      dpo1 = load_balance_get_bucket_i (lb1, 0);
+	    }
+
+	  next0 = dpo0->dpoi_next_node;
+	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+	  next1 = dpo1->dpoi_next_node;
+	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
+
+	  vlib_increment_combined_counter
+	    (cm, thread_index, lb_index0, 1,
+	     vlib_buffer_length_in_chain (vm, p0));
+	  vlib_increment_combined_counter
+	    (cm, thread_index, lb_index1, 1,
+	     vlib_buffer_length_in_chain (vm, p1));
+
+	  vlib_validate_buffer_enqueue_x2 (vm, node, next,
+					   to_next, n_left_to_next,
+					   pi0, pi1, next0, next1);
+	}
+#endif
       while (n_left_from > 0 && n_left_to_next > 0)
 	{
 	  vlib_buffer_t *p0;
diff --git a/src/vppinfra/cache.h b/src/vppinfra/cache.h
index 7a54d34dfee..e8e89ba8556 100644
--- a/src/vppinfra/cache.h
+++ b/src/vppinfra/cache.h
@@ -59,6 +59,11 @@
 #define CLIB_CACHE_LINE_BYTES (1 << CLIB_LOG2_CACHE_LINE_BYTES)
 #define CLIB_CACHE_LINE_ALIGN_MARK(mark) u8 mark[0] __attribute__((aligned(CLIB_CACHE_LINE_BYTES)))
 
+/* Default cache line fill buffers, unless -DCLIB_N_PREFETCHES overrides. */
+#ifndef CLIB_N_PREFETCHES
+#define CLIB_N_PREFETCHES 16
+#endif
+
 /* Read/write arguments to __builtin_prefetch. */
 #define CLIB_PREFETCH_READ 0
 #define CLIB_PREFETCH_LOAD 0	/* alias for read */
diff --git a/src/vppinfra/cpu.h b/src/vppinfra/cpu.h
index c636cf8639c..0ca9edb97fc 100644
--- a/src/vppinfra/cpu.h
+++ b/src/vppinfra/cpu.h
@@ -183,6 +183,96 @@ clib_cpu_march_priority_avx2 ()
   return -1;
 }
 
+/* Return the "CPU implementer" value from /proc/cpuinfo, cached once read;
+ * (u32) -1 when the file cannot be opened or no such entry is found. */
+static inline u32
+clib_cpu_implementer ()
+{
+  char buf[128], *sep;
+  static u32 implementer = -1;
+
+  if (-1 != implementer)
+    return implementer;
+
+  FILE *fp = fopen ("/proc/cpuinfo", "r");
+  if (!fp)
+    return implementer;
+
+  /* fgets () always NUL-terminates; ignore a match that has no ':' instead
+   * of handing strtol () a pointer derived from NULL. */
+  while (fgets (buf, sizeof (buf), fp))
+    if (strstr (buf, "CPU implementer") && (sep = strchr (buf, ':')))
+      {
+	implementer = (u32) strtol (sep + 1, NULL, 0);
+	break;
+      }
+  fclose (fp);
+
+  return implementer;
+}
+
+/* Return the "CPU part" value from /proc/cpuinfo, cached once read;
+ * (u32) -1 when the file cannot be opened or no such entry is found. */
+static inline u32
+clib_cpu_part ()
+{
+  char buf[128], *sep;
+  static u32 part = -1;
+
+  if (-1 != part)
+    return part;
+
+  FILE *fp = fopen ("/proc/cpuinfo", "r");
+  if (!fp)
+    return part;
+
+  /* fgets () always NUL-terminates; ignore a match that has no ':' instead
+   * of handing strtol () a pointer derived from NULL. */
+  while (fgets (buf, sizeof (buf), fp))
+    if (strstr (buf, "CPU part") && (sep = strchr (buf, ':')))
+      {
+	part = (u32) strtol (sep + 1, NULL, 0);
+	break;
+      }
+  fclose (fp);
+
+  return part;
+}
+
+#define AARCH64_CPU_IMPLEMENTER_THUNERDERX2 0x43	/* Cavium ThunderX2 */
+#define AARCH64_CPU_PART_THUNERDERX2        0x0af
+#define AARCH64_CPU_IMPLEMENTER_QDF24XX     0x51	/* Qualcomm Centriq 2400 */
+#define AARCH64_CPU_PART_QDF24XX            0xc00
+#define AARCH64_CPU_IMPLEMENTER_CORTEXA72   0x41	/* Cortex-A72 */
+#define AARCH64_CPU_PART_CORTEXA72          0xd08
+
+/* Priority of the thunderx2t99 march variant: 20 when /proc/cpuinfo reports
+ * the ThunderX2 implementer and part IDs, otherwise -1. */
+static inline int
+clib_cpu_march_priority_thunderx2t99 ()
+{
+  return ((AARCH64_CPU_IMPLEMENTER_THUNERDERX2 == clib_cpu_implementer ()) &&
+	  (AARCH64_CPU_PART_THUNERDERX2 == clib_cpu_part ()))? 20 : -1;
+}
+
+/* Priority of the qdf24xx march variant: 20 when /proc/cpuinfo reports
+ * the QDF24xx implementer and part IDs, otherwise -1. */
+static inline int
+clib_cpu_march_priority_qdf24xx ()
+{
+  return ((AARCH64_CPU_IMPLEMENTER_QDF24XX == clib_cpu_implementer ()) &&
+	  (AARCH64_CPU_PART_QDF24XX == clib_cpu_part ()))? 20 : -1;
+}
+
+/* Priority of the cortexa72 march variant: 10 when /proc/cpuinfo reports
+ * the Cortex-A72 implementer and part IDs, otherwise -1. */
+static inline int
+clib_cpu_march_priority_cortexa72 ()
+{
+  return ((AARCH64_CPU_IMPLEMENTER_CORTEXA72 == clib_cpu_implementer ()) &&
+	  (AARCH64_CPU_PART_CORTEXA72 == clib_cpu_part ()))? 10 : -1;
+}
+
 #ifdef CLIB_MARCH_VARIANT
 #define CLIB_MARCH_FN_PRIORITY() CLIB_MARCH_SFX(clib_cpu_march_priority)()
 #else
-- 
2.16.6