From 9876520f9ba746ed4d9923f392911c4f1888a105 Mon Sep 17 00:00:00 2001
From: Pavel Kotucek
Date: Fri, 7 Oct 2016 08:37:28 +0200
Subject: [PATCH] vpp_lite: add cpu pinning support (VPP-467)
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Proper cpu pinning in the vpp_lite platform, as in the normal vpp image.
Extended the "show threads" command to show proper information.
Changed handling of coreID and socketID for threads in the "show threads"
CLI: pthread_getaffinity is used instead of the info stored in DPDK.

Change-Id: Ic8299ec5e284472bb10a37a95fadeed57b6edae8
Signed-off-by: Pavel Kotucek
---
 vlib/vlib/node_cli.c                |  4 +--
 vlib/vlib/threads.c                 | 23 +++-----------
 vlib/vlib/threads.h                 |  3 +-
 vlib/vlib/threads_cli.c             | 63 +++++++++++++++++++++++++++++++------
 vnet/vnet/devices/dpdk/cli.c        |  6 ++--
 vnet/vnet/devices/dpdk/init.c       |  4 +--
 vnet/vnet/devices/dpdk/vhost_user.c |  2 +-
 vpp/conf/startup.conf               | 19 ++++++++++-
 8 files changed, 86 insertions(+), 38 deletions(-)

diff --git a/vlib/vlib/node_cli.c b/vlib/vlib/node_cli.c
index af9b47dd05d..05d0f0b5a95 100644
--- a/vlib/vlib/node_cli.c
+++ b/vlib/vlib/node_cli.c
@@ -337,9 +337,9 @@ show_node_runtime (vlib_main_t * vm,
 	  if (j > 0)
 	    vlib_cli_output (vm, "---------------");
 
-	  if (w->dpdk_lcore_id > -1)
+	  if (w->lcore_id > -1)
 	    vlib_cli_output (vm, "Thread %d %s (lcore %u)", j, w->name,
-			     w->dpdk_lcore_id);
+			     w->lcore_id);
 	  else
 	    vlib_cli_output (vm, "Thread %d %s", j, w->name);
 	}
diff --git a/vlib/vlib/threads.c b/vlib/vlib/threads.c
index e371699f1d4..70505b072ff 100644
--- a/vlib/vlib/threads.c
+++ b/vlib/vlib/threads.c
@@ -211,8 +211,9 @@ vlib_thread_init (vlib_main_t * vm)
   w = vlib_worker_threads;
   w->thread_mheap = clib_mem_get_heap ();
   w->thread_stack = vlib_thread_stacks[0];
-  w->dpdk_lcore_id = -1;
+  w->lcore_id = tm->main_lcore;
   w->lwp = syscall (SYS_gettid);
+  w->thread_id = pthread_self ();
   tm->n_vlib_mains = 1;
 
   if (tm->sched_policy != ~0)
@@ -510,15 +511,7 @@ vlib_worker_thread_bootstrap_fn (void *arg)
   vlib_worker_thread_t *w = arg;
 
   w->lwp = syscall (SYS_gettid);
-  w->dpdk_lcore_id = -1;
-#if DPDK==1
-  if (w->registration && !w->registration->use_pthreads && rte_socket_id)	/* do we really have dpdk linked */
-    {
-      unsigned lcore = rte_lcore_id ();
-      lcore = lcore < RTE_MAX_LCORE ? lcore : -1;
-      w->dpdk_lcore_id = lcore;
-    }
-#endif
+  w->thread_id = pthread_self ();
 
   rv = (void *) clib_calljmp
     ((uword (*)(uword)) w->thread_function,
@@ -532,6 +525,7 @@ vlib_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id)
 {
   void *(*fp_arg) (void *) = fp;
 
+  w->lcore_id = lcore_id;
 #if DPDK==1
   if (!w->registration->use_pthreads)
     if (rte_eal_remote_launch)	/* do we have dpdk linked */
@@ -584,15 +578,6 @@ start_workers (vlib_main_t * vm)
 	      vlib_set_thread_name ((char *) w->name);
 	    }
 
-#if DPDK==1
-	  w->dpdk_lcore_id = -1;
-	  if (rte_socket_id)	/* do we really have dpdk linked */
-	    {
-	      unsigned lcore = rte_lcore_id ();
-	      w->dpdk_lcore_id = lcore < RTE_MAX_LCORE ? lcore : -1;;
-	    }
-#endif
-
 	  /*
 	   * Truth of the matter: we always use at least two
 	   * threads. So, make the main heap thread-safe
diff --git a/vlib/vlib/threads.h b/vlib/vlib/threads.h
index 589d1f3a1ec..e65794cfb6b 100644
--- a/vlib/vlib/threads.h
+++ b/vlib/vlib/threads.h
@@ -105,7 +105,8 @@ typedef struct
   u64 barrier_sync_count;
 
   long lwp;
-  int dpdk_lcore_id;
+  int lcore_id;
+  pthread_t thread_id;
 } vlib_worker_thread_t;
 
 vlib_worker_thread_t *vlib_worker_threads;
diff --git a/vlib/vlib/threads_cli.c b/vlib/vlib/threads_cli.c
index e788b04b795..631fe0c6244 100644
--- a/vlib/vlib/threads_cli.c
+++ b/vlib/vlib/threads_cli.c
@@ -12,12 +12,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#define _GNU_SOURCE
 #include 
 
 #include 
 #include 
 
-#include 
+#include 
 
 static u8 *
 format_sched_policy_and_priority (u8 * s, va_list * args)
@@ -62,15 +63,52 @@ show_threads_fn (vlib_main_t * vm,
       line = format (line, "%-25U", format_sched_policy_and_priority,
 		     w->lwp);
 
-#if DPDK==1
-      int lcore = w->dpdk_lcore_id;
-      if (lcore > -1)
+      int lcore = -1;
+      cpu_set_t cpuset;
+      CPU_ZERO (&cpuset);
+      int ret = -1;
+
+      ret =
+	pthread_getaffinity_np (w->thread_id, sizeof (cpu_set_t), &cpuset);
+      if (!ret)
+	{
+	  int c;
+	  for (c = 0; c < CPU_SETSIZE; c++)
+	    if (CPU_ISSET (c, &cpuset))
+	      {
+		if (lcore > -1)
+		  {
+		    lcore = -2;
+		    break;
+		  }
+		lcore = c;
+	      }
+	}
+      else
 	{
-	  line = format (line, "%-7u%-7u%-7u",
-			 lcore,
-			 lcore_config[lcore].core_id,
-			 lcore_config[lcore].socket_id);
+	  lcore = w->lcore_id;
+	}
+      if (lcore > -1)
+	{
+	  const char *sys_cpu_path = "/sys/devices/system/cpu/cpu";
+	  int socket_id = -1;
+	  int core_id = -1;
+	  u8 *p = 0;
+
+	  p = format (p, "%s%u/topology/core_id%c", sys_cpu_path, lcore, 0);
+	  vlib_sysfs_read ((char *) p, "%d", &core_id);
+
+	  vec_reset_length (p);
+	  p =
+	    format (p,
+		    "%s%u/topology/physical_package_id%c",
+		    sys_cpu_path, lcore, 0);
+	  vlib_sysfs_read ((char *) p, "%d", &socket_id);
+	  vec_free (p);
+
+	  line = format (line, "%-7u%-7u%-7u", lcore, core_id, socket_id);
+
+#if DPDK==1
 	  switch (lcore_config[lcore].state)
 	    {
 	    case WAIT:
@@ -85,8 +123,15 @@ show_threads_fn (vlib_main_t * vm,
 	    default:
 	      line = format (line, "unknown");
 	    }
-	}
 #endif
+	}
+      else
+	{
+	  line =
+	    format (line, "%-7s%-7s%-7s", (lcore == -2) ? "M" : "n/a", "n/a",
+		    "n/a");
+	}
+
       vlib_cli_output (vm, "%v", line);
       vec_free (line);
     }
diff --git a/vnet/vnet/devices/dpdk/cli.c b/vnet/vnet/devices/dpdk/cli.c
index 7941f9e0e16..2683030658a 100644
--- a/vnet/vnet/devices/dpdk/cli.c
+++ b/vnet/vnet/devices/dpdk/cli.c
@@ -757,7 +757,7 @@ show_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input,
     if (vec_len (dm->devices_by_cpu[cpu]))
       vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu,
 		       vlib_worker_threads[cpu].name,
-		       vlib_worker_threads[cpu].dpdk_lcore_id);
+		       vlib_worker_threads[cpu].lcore_id);
 
     /* *INDENT-OFF* */
     vec_foreach(dq, dm->devices_by_cpu[cpu])
@@ -857,7 +857,7 @@ set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input,
 	  dq->queue_id = queue;
 	  dq->device = xd->device_index;
 	  xd->cpu_socket_id_by_queue[queue] =
-	    rte_lcore_to_socket_id(vlib_worker_threads[cpu].dpdk_lcore_id);
+	    rte_lcore_to_socket_id(vlib_worker_threads[cpu].lcore_id);
 
 	  vec_sort_with_function(dm->devices_by_cpu[i],
 				 dpdk_device_queue_sort);
@@ -907,7 +907,7 @@ show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
     if (vec_len (dm->devices_by_hqos_cpu[cpu]))
       vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu,
 		       vlib_worker_threads[cpu].name,
-		       vlib_worker_threads[cpu].dpdk_lcore_id);
+		       vlib_worker_threads[cpu].lcore_id);
 
     vec_foreach (dq, dm->devices_by_hqos_cpu[cpu])
     {
diff --git a/vnet/vnet/devices/dpdk/init.c b/vnet/vnet/devices/dpdk/init.c
index a5c056c6a68..73edc4a97a5 100644
--- a/vnet/vnet/devices/dpdk/init.c
+++ b/vnet/vnet/devices/dpdk/init.c
@@ -652,7 +652,7 @@ dpdk_lib_init (dpdk_main_t * dm)
 	  /* *INDENT-OFF* */
 	  clib_bitmap_foreach (i, devconf->workers, ({
 	    int cpu = dm->input_cpu_first_index + i;
-	    unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id;
+	    unsigned lcore = vlib_worker_threads[cpu].lcore_id;
 	    vec_validate(xd->cpu_socket_id_by_queue, q);
 	    xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore);
 	    vec_add2(dm->devices_by_cpu[cpu], dq, 1);
@@ -665,7 +665,7 @@ dpdk_lib_init (dpdk_main_t * dm)
 	for (q = 0; q < xd->rx_q_used; q++)
 	  {
 	    int cpu = dm->input_cpu_first_index + next_cpu;
-	    unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id;
+	    unsigned lcore = vlib_worker_threads[cpu].lcore_id;
 
 	    /*
 	     * numa node for worker thread handling this queue
diff --git a/vnet/vnet/devices/dpdk/vhost_user.c b/vnet/vnet/devices/dpdk/vhost_user.c
index 9e53c96f599..46fae60dac7 100644
--- a/vnet/vnet/devices/dpdk/vhost_user.c
+++ b/vnet/vnet/devices/dpdk/vhost_user.c
@@ -393,7 +393,7 @@ dpdk_create_vhost_user_if_internal (u32 * hw_if_index, u32 if_id, u8 * hwaddr)
 	{
 	  int cpu = dm->input_cpu_first_index +
 	    (next_cpu % dm->input_cpu_count);
-	  unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id;
+	  unsigned lcore = vlib_worker_threads[cpu].lcore_id;
 
 	  vec_validate (xd->cpu_socket_id_by_queue, q);
 	  xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id (lcore);
diff --git a/vpp/conf/startup.conf b/vpp/conf/startup.conf
index 84a026474c9..bce002027bf 100644
--- a/vpp/conf/startup.conf
+++ b/vpp/conf/startup.conf
@@ -14,13 +14,30 @@ api-segment {
 }
 
 cpu {
+	## In VPP there is one main thread and optionally the user can create worker(s)
+	## The main thread and worker thread(s) can be pinned to CPU core(s) manually or automatically
+
+	## Manual pinning of thread(s) to CPU core(s)
+
 	## Set logical CPU core where main thread runs
 	# main-core 1
 
 	## Set logical CPU core(s) where worker threads are running
-	## by default there is no worker threads started
 	# corelist-workers 2-3,18-19
 
+	## Automatic pinning of thread(s) to CPU core(s)
+
+	## Set the number of CPU core(s) to be skipped (1 ... N-1)
+	## Skipped CPU core(s) are not used for pinning main thread and worker thread(s).
+	## The main thread is automatically pinned to the first available CPU core and worker(s)
+	## are pinned to next free CPU core(s) after core assigned to main thread
+	# skip-cores 4
+
+	## Specify a number of workers to be created
+	## Workers are pinned to N consecutive CPU cores while skipping "skip-cores" CPU core(s)
+	## and main thread's CPU core
+	# workers 2
+
 	## Set scheduling policy and priority of main and worker threads
 	## Scheduling policy options are: other (SCHED_OTHER), batch (SCHED_BATCH)
-- 
2.16.6
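For reference, the cpu { } options documented above combine roughly as sketched
below. The core numbers are illustrative only and depend on the host topology;
the manual options (main-core, corelist-workers) and the automatic options
(skip-cores, workers) are alternatives, so only one group would normally be
uncommented at a time.

cpu {
    ## manual pinning: main thread on core 1, workers on cores 2 and 3
    main-core 1
    corelist-workers 2-3

    ## automatic pinning (alternative to the two lines above): skip the
    ## first cores, pin the main thread to the next free core and create
    ## two workers on the cores after it
    # skip-cores 4
    # workers 2
}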
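The commit title, "add cpu pinning support", is about giving pthread-based
vpp_lite workers a real lcore assignment (w->lcore_id) instead of relying on
DPDK's rte_eal_remote_launch(). The snippet below is a generic, self-contained
illustration of that idea, not VPP's actual launch path: launch_pinned_worker()
and worker_fn() are made-up names, and the pinning is done with
pthread_setaffinity_np() on a freshly created pthread (build with -pthread).

/*
 * Generic illustration (not VPP's actual launch path): create a worker
 * pthread and pin it to a single logical CPU.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static void *
worker_fn (void *arg)
{
  /* a real worker would loop here; return immediately for the example */
  return 0;
}

static int
launch_pinned_worker (pthread_t * thread, unsigned lcore_id)
{
  cpu_set_t cpuset;

  if (pthread_create (thread, 0, worker_fn, 0))
    return -1;

  /* restrict the new thread's affinity mask to the one requested CPU */
  CPU_ZERO (&cpuset);
  CPU_SET (lcore_id, &cpuset);
  return pthread_setaffinity_np (*thread, sizeof (cpu_set_t), &cpuset);
}

int
main (void)
{
  pthread_t t;

  if (launch_pinned_worker (&t, 2))	/* logical CPU 2 is an example value */
    {
      fprintf (stderr, "failed to launch or pin worker\n");
      return 1;
    }
  pthread_join (t, 0);
  return 0;
}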
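The core of the threads_cli.c change is how "show threads" now derives the
per-thread core and socket IDs: it queries the thread's affinity mask with
pthread_getaffinity_np(), and if the thread is pinned to exactly one logical
CPU it reads that CPU's core_id and physical_package_id from sysfs (a mask
covering several CPUs is shown as "M"). Below is a minimal standalone sketch
of the same technique using plain glibc; read_sysfs_int() is a made-up helper
standing in for vlib_sysfs_read(), and the program inspects its own (main)
thread rather than a VPP worker.

/*
 * Standalone sketch of the "show threads" core/socket discovery.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

/* read a single integer from a sysfs file; -1 on any error */
static int
read_sysfs_int (const char *path)
{
  int value = -1;
  FILE *f = fopen (path, "r");
  if (!f)
    return -1;
  if (fscanf (f, "%d", &value) != 1)
    value = -1;
  fclose (f);
  return value;
}

int
main (void)
{
  cpu_set_t cpuset;
  int lcore = -1, c;

  CPU_ZERO (&cpuset);
  if (pthread_getaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset))
    return 1;

  /* same convention as the patch: -1 = none found, -2 = several CPUs set */
  for (c = 0; c < CPU_SETSIZE; c++)
    if (CPU_ISSET (c, &cpuset))
      {
	if (lcore > -1)
	  {
	    lcore = -2;
	    break;
	  }
	lcore = c;
      }

  if (lcore < 0)
    {
      /* "show threads" prints "M" for a multi-CPU affinity mask */
      printf ("lcore %s  core n/a  socket n/a\n", lcore == -2 ? "M" : "n/a");
      return 0;
    }

  char path[128];
  snprintf (path, sizeof (path),
	    "/sys/devices/system/cpu/cpu%d/topology/core_id", lcore);
  int core_id = read_sysfs_int (path);

  snprintf (path, sizeof (path),
	    "/sys/devices/system/cpu/cpu%d/topology/physical_package_id",
	    lcore);
  int socket_id = read_sysfs_int (path);

  printf ("lcore %d  core %d  socket %d\n", lcore, core_id, socket_id);
  return 0;
}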