From: hsandid Date: Fri, 7 Jun 2024 16:11:48 +0000 (+0200) Subject: vlib: add 'relative' keyword for cpu configuration X-Git-Tag: v25.10-rc0~96 X-Git-Url: https://gerrit.fd.io/r/gitweb?a=commitdiff_plain;h=9b20151507d040b4c97763dea3d7cea5bd238931;p=vpp.git vlib: add 'relative' keyword for cpu configuration Type: feature Add keyword 'relative' to translate cpu pinning configuration to match vpp launch affinity Change-Id: Ic748e5f4269525764b7adc92a8a5e4877e59c5a3 Signed-off-by: hsandid --- diff --git a/docs/configuration/reference.rst b/docs/configuration/reference.rst index 2759d47d54b..9378ce5b589 100644 --- a/docs/configuration/reference.rst +++ b/docs/configuration/reference.rst @@ -466,6 +466,20 @@ CPU cores while skipping "skip-cores" CPU core(s) and main thread's CPU core workers 2 +relative +^^^^^^^^^ + +Apply the thread pinning configuration with respect to the logical cores +available in the current control group CPU set. +By default, VPP applies the thread pinning configuration with respect to the +logical cores online on the host (e.g. '/sys/devices/system/cpu/online'). With +the 'relative' keyword, the configuration is instead applied with respect to +the logical cores in VPP's own affinity mask (obtained with sched_getaffinity). + +.. 
code-block:: console + + relative + scheduler-policy other | batch | idle | fifo | rr ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/developer/corearchitecture/multi_thread.rst b/docs/developer/corearchitecture/multi_thread.rst index 195a9b791fd..89f2a10e489 100644 --- a/docs/developer/corearchitecture/multi_thread.rst +++ b/docs/developer/corearchitecture/multi_thread.rst @@ -48,6 +48,10 @@ placement works in the following way: and it will run threads on them - if “corelist-workers A,B1-Bn,C1-Cn” is defined vpp will automatically assign those CPU cores to worker threads +- if "relative" is defined, vpp will only consider the cores it has affinity + to (obtained using sched_getaffinity) rather than all cores available on the + host machine. This is useful when running in a containerized environment which + is only allowed to use a subset of the host's CPUs. User can see active placement of cores by using the VPP debug CLI command show threads: @@ -102,6 +106,26 @@ on cores 2,3,4. workers 3 } +Relative Placement +~~~~~~~~~~~~~~~~~~ + +Relative placement can be used in addition to manual or auto placement. It takes +into consideration that VPP might only be allowed to run on a limited subset of +the logical cores on the host machine (e.g. when running in a container), and +automatically remaps the user-requested pinning configuration to the logical cores +available to VPP (checked using sched_getaffinity). +If a VPP instance runs with CPU set 20,25,26,27 and relative mode is enabled, a +manual placement of the main thread on core 0 and workers on cores 2,3 will result +in placement of the main thread on core 20 and workers on cores 26,27. + +.. 
code-block:: console + + cpu { + main-core 0 + corelist-workers 2-3 + relative + } + Buffer Memory Allocation ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index b1962cc460c..ecfebfb8eaa 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -402,6 +402,7 @@ geneve Geneve gerrit Gerrit +getaffinity gethostbyname gettingsources gettingstarted @@ -1003,6 +1004,7 @@ scanf scapy Scapy SCHED_RR +sched sclass scooby screenshot diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 0d1cf502f6e..bf5031b7ee4 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -183,6 +183,7 @@ vlib_thread_init (vlib_main_t * vm) u32 first_index = 1; u32 i; uword *avail_cpu; + uword n_cpus; u32 stats_num_worker_threads_dir_index; stats_num_worker_threads_dir_index = @@ -190,12 +191,24 @@ vlib_thread_init (vlib_main_t * vm) ASSERT (stats_num_worker_threads_dir_index != ~0); /* get bitmaps of active cpu cores and sockets */ - tm->cpu_core_bitmap = os_get_online_cpu_core_bitmap (); tm->cpu_socket_bitmap = os_get_online_cpu_node_bitmap (); + if (!tm->cpu_translate) + tm->cpu_core_bitmap = os_get_online_cpu_core_bitmap (); + else + { + /* get bitmap of cpu core affinity */ + if ((tm->cpu_core_bitmap = os_get_cpu_affinity_bitmap ()) == 0) + return clib_error_return (0, "could not fetch cpu affinity bmp"); + } avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap); /* skip cores */ + n_cpus = clib_bitmap_count_set_bits (avail_cpu); + if (tm->skip_cores >= n_cpus) + return clib_error_return ( + 0, "skip-core value greater or equal to available cpus"); + for (i = 0; i < tm->skip_cores; i++) { uword c = clib_bitmap_first_set (avail_cpu); @@ -213,8 +226,20 @@ vlib_thread_init (vlib_main_t * vm) if (tm->main_lcore != ~0) { if (clib_bitmap_get (avail_cpu, tm->main_lcore) == 0) - return clib_error_return (0, "cpu %u is not available to be used" - " for the main thread", tm->main_lcore); + { + if (tm->cpu_translate) + return 
clib_error_return ( + 0, + "cpu %u (relative cpu %u) is not available to be used" + " for the main thread in relative mode", + tm->main_lcore, + os_translate_cpu_from_affinity_bitmap (tm->main_lcore)); + else + return clib_error_return (0, + "cpu %u is not available to be used" + " for the main thread", + tm->main_lcore); + } avail_cpu = clib_bitmap_set (avail_cpu, tm->main_lcore, 0); } @@ -297,11 +322,23 @@ vlib_thread_init (vlib_main_t * vm) uword c; clib_bitmap_foreach (c, tr->coremask) { if (clib_bitmap_get(avail_cpu, c) == 0) - return clib_error_return (0, "cpu %u is not available to be used" - " for the '%s' thread",c, tr->name); - - avail_cpu = clib_bitmap_set(avail_cpu, c, 0); - } + { + if (tm->cpu_translate) + return clib_error_return ( + 0, + "cpu %u (relative cpu %u) is not available to be used" + " for the '%s' thread in relative mode", + c, os_translate_cpu_from_affinity_bitmap (c), tr->name); + else + return clib_error_return ( + 0, + "cpu %u is not available to be used" + " for the '%s' thread", + c, tr->name); + } + + avail_cpu = clib_bitmap_set (avail_cpu, c, 0); + } } else { @@ -313,7 +350,7 @@ vlib_thread_init (vlib_main_t * vm) uword c = clib_bitmap_first_set (avail_cpu); /* Use CPU 0 as a last resort */ - if (c == ~0 && avail_c0) + if (c == ~0 && avail_c0 && !tm->cpu_translate) { c = 0; avail_c0 = 0; @@ -323,7 +360,7 @@ vlib_thread_init (vlib_main_t * vm) return clib_error_return (0, "no available cpus to be used for" " the '%s' thread #%u", - tr->name, tr->count); + tr->name, j); avail_cpu = clib_bitmap_set (avail_cpu, 0, avail_c0); avail_cpu = clib_bitmap_set (avail_cpu, c, 0); @@ -1106,6 +1143,8 @@ cpu_config (vlib_main_t * vm, unformat_input_t * input) ; else if (unformat (input, "skip-cores %u", &tm->skip_cores)) ; + else if (unformat (input, "relative")) + tm->cpu_translate = 1; else if (unformat (input, "numa-heap-size %U", unformat_memory_size, &tm->numa_heap_size)) ; @@ -1201,6 +1240,36 @@ cpu_config (vlib_main_t * vm, unformat_input_t 
* input) tr = tr->next; } + /* for relative mode, update requested main-core and corelists */ + if (tm->cpu_translate) + { + + if (tm->main_lcore == ~0) + clib_error ("main-core must be specified in relative mode"); + int cpu_translate_main_core = + os_translate_cpu_to_affinity_bitmap (tm->main_lcore); + if (cpu_translate_main_core == -1) + clib_error ("cpu %u is not available to be used" + " for the main thread in relative mode", + tm->main_lcore); + tm->main_lcore = cpu_translate_main_core; + + tr = tm->next; + uword *translated_cpu_bmp; + while (tr && tr->coremask) + { + translated_cpu_bmp = + os_translate_cpu_bmp_to_affinity_bitmap (tr->coremask); + + if (!translated_cpu_bmp) + clib_error ("could not translate corelist associated to %s", + tr->name); + clib_bitmap_free (tr->coremask); + tr->coremask = translated_cpu_bmp; + tr = tr->next; + } + } + return 0; } diff --git a/src/vlib/threads.h b/src/vlib/threads.h index da2c41fec73..8c5d56058eb 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -260,6 +260,9 @@ typedef struct int use_pthreads; + /* Translate requested cpu configuration to vpp affinity mask */ + int cpu_translate; + /* Number of vlib_main / vnet_main clones */ u32 n_vlib_mains; diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index 2872a025d66..ccc34fc8d8d 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -14,6 +14,8 @@ */ #define _GNU_SOURCE +#include +#include #include #include @@ -46,16 +48,20 @@ show_threads_fn (vlib_main_t * vm, const vlib_thread_main_t *tm = vlib_get_thread_main (); vlib_worker_thread_t *w; int i; + u8 *line = NULL; - vlib_cli_output (vm, "%-7s%-20s%-12s%-8s%-25s%-7s%-7s%-7s%-10s", - "ID", "Name", "Type", "LWP", "Sched Policy (Priority)", - "lcore", "Core", "Socket", "State"); + line = format (line, "%-7s%-20s%-12s%-8s%-25s%-7s%-7s%-7s%-10s", "ID", + "Name", "Type", "LWP", "Sched Policy (Priority)", "lcore", + "Core", "Socket", "State"); + if (tm->cpu_translate) + line = format (line, 
"%-15s", "Relative Core"); + vlib_cli_output (vm, "%v", line); + vec_free (line); #if !defined(__powerpc64__) for (i = 0; i < vec_len (vlib_worker_threads); i++) { w = vlib_worker_threads + i; - u8 *line = NULL; line = format (line, "%-7d%-20s%-12s%-8d", i, @@ -69,7 +75,13 @@ show_threads_fn (vlib_main_t * vm, { int core_id = w->core_id; int numa_id = w->numa_id; - line = format (line, "%-7u%-7u%-7u%", cpu_id, core_id, numa_id); + line = format (line, "%-7u%-7u%-17u%", cpu_id, core_id, numa_id); + if (tm->cpu_translate) + { + int cpu_translate_core_id = + os_translate_cpu_from_affinity_bitmap (cpu_id); + line = format (line, "%-7u", cpu_translate_core_id); + } } else { diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index dd4f4cc3353..2808265ffb6 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -123,6 +123,7 @@ main (int argc, char *argv[]) unformat_input_t input, sub_input; u8 *s = 0, *v = 0; int main_core = ~0; + int cpu_translate = 0; cpu_set_t cpuset; void *main_heap; @@ -282,6 +283,8 @@ main (int argc, char *argv[]) unix_main.flags |= UNIX_FLAG_INTERACTIVE; else if (!strncmp (argv[i], "nosyslog", 8)) unix_main.flags |= UNIX_FLAG_NOSYSLOG; + else if (!strncmp (argv[i], "relative", 8)) + cpu_translate = 1; } defaulted: @@ -329,6 +332,17 @@ defaulted: unformat_free (&input); + int translate_main_core = os_translate_cpu_to_affinity_bitmap (main_core); + + if (cpu_translate && main_core != ~0) + { + if (translate_main_core == -1) + clib_error ("cpu %u is not available to be used" + " for the main thread in relative mode", + main_core); + main_core = translate_main_core; + } + /* if main thread affinity is unspecified, set to current running cpu */ if (main_core == ~0) main_core = sched_getcpu (); diff --git a/src/vppinfra/unix-misc.c b/src/vppinfra/unix-misc.c index 05ca2f901c6..c0cf507aa5c 100644 --- a/src/vppinfra/unix-misc.c +++ b/src/vppinfra/unix-misc.c @@ -66,8 +66,8 @@ __clib_export __thread uword __os_thread_index = 0; __clib_export 
__thread uword __os_numa_index = 0; - -__clib_export clib_bitmap_t *os_get_cpu_affinity_bitmap (int pid); +__clib_export cpu_set_t __os_affinity_cpu_set; +__clib_export clib_bitmap_t *os_get_cpu_affinity_bitmap (); clib_error_t * clib_file_n_bytes (char *file, uword * result) @@ -285,29 +285,31 @@ os_get_online_cpu_core_bitmap () } __clib_export clib_bitmap_t * -os_get_cpu_affinity_bitmap (int pid) +os_get_cpu_affinity_bitmap () { #if __linux - int index, ret; - cpu_set_t cpuset; + int cpu; uword *affinity_cpus; - clib_bitmap_alloc (affinity_cpus, sizeof (cpu_set_t)); + clib_bitmap_alloc (affinity_cpus, __CPU_SETSIZE); clib_bitmap_zero (affinity_cpus); - CPU_ZERO_S (sizeof (cpu_set_t), &cpuset); - - ret = sched_getaffinity (0, sizeof (cpu_set_t), &cpuset); - - if (ret < 0) + /* set__os_affinity_cpu_set once on first call to + * os_get_cpu_affinity_bitmap() */ + if (__CPU_COUNT_S (sizeof (cpu_set_t), &__os_affinity_cpu_set) == 0) { - clib_bitmap_free (affinity_cpus); - return 0; + int ret; + ret = sched_getaffinity (0, sizeof (cpu_set_t), &__os_affinity_cpu_set); + if (ret < 0) + { + clib_bitmap_free (affinity_cpus); + return NULL; + } } - for (index = 0; index < sizeof (cpu_set_t); index++) - if (CPU_ISSET_S (index, sizeof (cpu_set_t), &cpuset)) - clib_bitmap_set (affinity_cpus, index, 1); + for (cpu = 0; cpu < __CPU_SETSIZE; cpu++) + if (__CPU_ISSET_S (cpu, sizeof (cpu_set_t), &__os_affinity_cpu_set)) + clib_bitmap_set (affinity_cpus, cpu, 1); return affinity_cpus; #elif defined(__FreeBSD__) cpuset_t mask; @@ -332,6 +334,100 @@ os_get_cpu_affinity_bitmap (int pid) #endif } +__clib_export int +os_translate_cpu_to_affinity_bitmap (int cpu) +{ + uword *affinity_bmp = os_get_cpu_affinity_bitmap (); + int cpu_it = 0; + int cpu_translate_it = 0; + + if (!affinity_bmp) + return -1; + + if (cpu == ~0) + goto err; + + clib_bitmap_foreach (cpu_it, affinity_bmp) + { + + if (cpu == cpu_translate_it) + { + clib_bitmap_free (affinity_bmp); + return cpu_it; + } + + 
cpu_translate_it += 1; + } + +err: + clib_bitmap_free (affinity_bmp); + return -1; +} + +__clib_export int +os_translate_cpu_from_affinity_bitmap (int cpu_translated) +{ + uword *affinity_bmp = os_get_cpu_affinity_bitmap (); + int cpu_it = 0; + int cpu_translate_it = 0; + + if (!affinity_bmp) + return -1; + + if (cpu_translated == ~0) + goto err; + + clib_bitmap_foreach (cpu_it, affinity_bmp) + { + + if (cpu_translated == cpu_it) + { + clib_bitmap_free (affinity_bmp); + return cpu_translate_it; + } + + cpu_translate_it += 1; + } + +err: + clib_bitmap_free (affinity_bmp); + return -1; +} + +__clib_export clib_bitmap_t * +os_translate_cpu_bmp_to_affinity_bitmap (clib_bitmap_t *cpu_bmp) +{ + uword *affinity_bmp = os_get_cpu_affinity_bitmap (); + + if (!affinity_bmp) + return NULL; + + u32 cpu_count_relative = clib_bitmap_count_set_bits (affinity_bmp); + u32 cpu_max_corelist = clib_bitmap_last_set (cpu_bmp); + + if (cpu_count_relative <= cpu_max_corelist) + return NULL; + + uword *translated_cpulist; + clib_bitmap_alloc (translated_cpulist, __CPU_SETSIZE); + clib_bitmap_zero (translated_cpulist); + + uword cpu_it; + uword cpu_translate_it = 0; + + clib_bitmap_foreach (cpu_it, affinity_bmp) + { + + if (clib_bitmap_get (cpu_bmp, cpu_translate_it)) + clib_bitmap_set (translated_cpulist, cpu_it, 1); + + cpu_translate_it++; + } + + vec_free (affinity_bmp); + return translated_cpulist; +} + __clib_export clib_bitmap_t * os_get_online_cpu_node_bitmap () { diff --git a/src/vppinfra/unix.h b/src/vppinfra/unix.h index d0ddb93a46f..db3102e4fee 100644 --- a/src/vppinfra/unix.h +++ b/src/vppinfra/unix.h @@ -56,6 +56,19 @@ clib_error_t *unix_proc_file_contents (char *file, u8 ** result); /* Retrieve bitmap of online cpu cures */ clib_bitmap_t *os_get_online_cpu_core_bitmap (); +/* Retrieve bitmap of cpu affinity */ +clib_bitmap_t *os_get_cpu_affinity_bitmap (); + +/* Translate cpu index in cpu affinity bitmap */ +int os_translate_cpu_to_affinity_bitmap (int cpu); + +/* Retrieve cpu 
index after translation in cpu affinity bitmap */ +int os_translate_cpu_from_affinity_bitmap (int cpu_translated); + +/* Translate cpu bitmap based on cpu affinity bitmap */ +clib_bitmap_t * +os_translate_cpu_bmp_to_affinity_bitmap (clib_bitmap_t *cpu_bmp); + /* Retrieve bitmap of online cpu nodes (sockets) */ clib_bitmap_t *os_get_online_cpu_node_bitmap ();