src/vlib/threads.c

   1 /*
   2  * Copyright (c) 2015 Cisco and/or its affiliates.
   3  * Licensed under the Apache License, Version 2.0 (the "License");
   4  * you may not use this file except in compliance with the License.
   5  * You may obtain a copy of the License at:
   6  *
   7  *     http://www.apache.org/licenses/LICENSE-2.0
   8  *
   9  * Unless required by applicable law or agreed to in writing, software
  10  * distributed under the License is distributed on an "AS IS" BASIS,
  11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12  * See the License for the specific language governing permissions and
  13  * limitations under the License.
  14  */
  15 #define _GNU_SOURCE
  16
  17 #include <signal.h>
  18 #include <math.h>
  19 #include <vppinfra/format.h>
  20 #include <vppinfra/time_range.h>
  21 #include <vppinfra/interrupt.h>
  22 #include <vppinfra/bitmap.h>
  23 #include <vppinfra/unix.h>
  24 #include <vlib/vlib.h>
  25
  26 #include <vlib/threads.h>
  27
  28 #include <vlib/stats/stats.h>
  29
  30 u32
  31 vl (void *p)
  32 {
  33   return vec_len (p);
  34 }
  35
/* Vector of per-thread state.  Element 0 is set up for the calling (main)
   thread by vlib_thread_init (); further elements are added for each
   registered thread. */
vlib_worker_thread_t *vlib_worker_threads;
/* Thread configuration shared by the functions in this file: cpu bitmaps,
   main_lcore, the registration list and n_vlib_mains. */
vlib_thread_main_t vlib_thread_main;
  38
  39 /*
  40  * Barrier tracing can be enabled on a normal build to collect information
  41  * on barrier use, including timings and call stacks.  Deliberately not
  42  * keyed off CLIB_DEBUG, because that can add significant overhead which
   43  * impacts observed timings.
  44  */
  45
  46 static inline void
  47 barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed)
  48 {
  49   if (!vlib_worker_threads->barrier_elog_enabled)
  50     return;
  51
  52   ELOG_TYPE_DECLARE (e) = {
  53     .format = "bar-trace-%s-#%d",
  54     .format_args = "T4i4",
  55   };
  56
  57   struct
  58   {
  59     u32 caller, count, t_entry, t_open, t_closed;
  60   } *ed = 0;
  61
  62   ed = ELOG_DATA (&vlib_global_main.elog_main, e);
  63   ed->count = (int) vlib_worker_threads[0].barrier_sync_count;
  64   ed->caller = elog_string (&vlib_global_main.elog_main,
  65                             (char *) vlib_worker_threads[0].barrier_caller);
  66   ed->t_entry = (int) (1000000.0 * t_entry);
  67   ed->t_open = (int) (1000000.0 * t_open);
  68   ed->t_closed = (int) (1000000.0 * t_closed);
  69 }
  70
  71 static inline void
  72 barrier_trace_sync_rec (f64 t_entry)
  73 {
  74   if (!vlib_worker_threads->barrier_elog_enabled)
  75     return;
  76
  77   ELOG_TYPE_DECLARE (e) = {
  78     .format = "bar-syncrec-%s-#%d",
  79     .format_args = "T4i4",
  80   };
  81
  82   struct
  83   {
  84     u32 caller, depth;
  85   } *ed = 0;
  86
  87   ed = ELOG_DATA (&vlib_global_main.elog_main, e);
  88   ed->depth = (int) vlib_worker_threads[0].recursion_level - 1;
  89   ed->caller = elog_string (&vlib_global_main.elog_main,
  90                             (char *) vlib_worker_threads[0].barrier_caller);
  91 }
  92
  93 static inline void
  94 barrier_trace_release_rec (f64 t_entry)
  95 {
  96   if (!vlib_worker_threads->barrier_elog_enabled)
  97     return;
  98
  99   ELOG_TYPE_DECLARE (e) = {
 100     .format = "bar-relrrec-#%d",
 101     .format_args = "i4",
 102   };
 103
 104   struct
 105   {
 106     u32 depth;
 107   } *ed = 0;
 108
 109   ed = ELOG_DATA (&vlib_global_main.elog_main, e);
 110   ed->depth = (int) vlib_worker_threads[0].recursion_level;
 111 }
 112
 113 static inline void
 114 barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main)
 115 {
 116   if (!vlib_worker_threads->barrier_elog_enabled)
 117     return;
 118
 119   ELOG_TYPE_DECLARE (e) = {
 120     .format = "bar-rel-#%d-e%d-u%d-t%d",
 121     .format_args = "i4i4i4i4",
 122   };
 123
 124   struct
 125   {
 126     u32 count, t_entry, t_update_main, t_closed_total;
 127   } *ed = 0;
 128
 129   ed = ELOG_DATA (&vlib_global_main.elog_main, e);
 130   ed->t_entry = (int) (1000000.0 * t_entry);
 131   ed->t_update_main = (int) (1000000.0 * t_update_main);
 132   ed->t_closed_total = (int) (1000000.0 * t_closed_total);
 133   ed->count = (int) vlib_worker_threads[0].barrier_sync_count;
 134
 135   /* Reset context for next trace */
 136   vlib_worker_threads[0].barrier_context = NULL;
 137 }
 138
 139 uword
 140 os_get_nthreads (void)
 141 {
 142   return vec_len (vlib_thread_stacks);
 143 }
 144
 145 void
 146 vlib_set_thread_name (char *name)
 147 {
 148   int pthread_setname_np (pthread_t __target_thread, const char *__name);
 149   int rv;
 150   pthread_t thread = pthread_self ();
 151
 152   if (thread)
 153     {
 154       rv = pthread_setname_np (thread, name);
 155       if (rv)
 156         clib_warning ("pthread_setname_np returned %d", rv);
 157     }
 158 }
 159
 160 static int
 161 sort_registrations_by_no_clone (void *a0, void *a1)
 162 {
 163   vlib_thread_registration_t **tr0 = a0;
 164   vlib_thread_registration_t **tr1 = a1;
 165
 166   return ((i32) ((*tr0)->no_data_structure_clone)
 167           - ((i32) ((*tr1)->no_data_structure_clone)));
 168 }
 169
 170
 171 /* Called early in the init sequence */
 172
 173 clib_error_t *
 174 vlib_thread_init (vlib_main_t * vm)
 175 {
 176   vlib_thread_main_t *tm = &vlib_thread_main;
 177   vlib_worker_thread_t *w;
 178   vlib_thread_registration_t *tr;
 179   u32 n_vlib_mains = 1;
 180   u32 first_index = 1;
 181   u32 i;
 182   uword *avail_cpu;
 183   u32 stats_num_worker_threads_dir_index;
 184
 185   stats_num_worker_threads_dir_index =
 186     vlib_stats_add_gauge ("/sys/num_worker_threads");
 187   ASSERT (stats_num_worker_threads_dir_index != ~0);
 188
 189   /* get bitmaps of active cpu cores and sockets */
 190   tm->cpu_core_bitmap = os_get_online_cpu_core_bitmap ();
 191   tm->cpu_socket_bitmap = os_get_online_cpu_node_bitmap ();
 192
 193   avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap);
 194
 195   /* skip cores */
 196   for (i = 0; i < tm->skip_cores; i++)
 197     {
 198       uword c = clib_bitmap_first_set (avail_cpu);
 199       if (c == ~0)
 200         return clib_error_return (0, "no available cpus to skip");
 201
 202       avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
 203     }
 204
 205   /* grab cpu for main thread */
 206   if (tm->main_lcore != ~0)
 207     {
 208       if (clib_bitmap_get (avail_cpu, tm->main_lcore) == 0)
 209         return clib_error_return (0, "cpu %u is not available to be used"
 210                                   " for the main thread", tm->main_lcore);
 211       avail_cpu = clib_bitmap_set (avail_cpu, tm->main_lcore, 0);
 212     }
 213
 214   /* assume that there is socket 0 only if there is no data from sysfs */
 215   if (!tm->cpu_socket_bitmap)
 216     tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1);
 217
 218   /* pin main thread to main_lcore  */
 219   if (tm->main_lcore != ~0)
 220     {
 221       cpu_set_t cpuset;
 222       CPU_ZERO (&cpuset);
 223       CPU_SET (tm->main_lcore, &cpuset);
 224       if (pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t),
 225                                   &cpuset))
 226         {
 227           return clib_error_return (0, "could not pin main thread to cpu %u",
 228                                     tm->main_lcore);
 229         }
 230     }
 231
 232   /* Set up thread 0 */
 233   vec_validate_aligned (vlib_worker_threads, 0, CLIB_CACHE_LINE_BYTES);
 234   vec_set_len (vlib_worker_threads, 1);
 235   w = vlib_worker_threads;
 236   w->thread_mheap = clib_mem_get_heap ();
 237   w->thread_stack = vlib_thread_stacks[0];
 238   w->cpu_id = tm->main_lcore;
 239   w->lwp = syscall (SYS_gettid);
 240   w->thread_id = pthread_self ();
 241   tm->n_vlib_mains = 1;
 242
 243   vlib_get_thread_core_numa (w, w->cpu_id);
 244
 245   if (tm->sched_policy != ~0)
 246     {
 247       struct sched_param sched_param;
 248       if (!sched_getparam (w->lwp, &sched_param))
 249         {
 250           if (tm->sched_priority != ~0)
 251             sched_param.sched_priority = tm->sched_priority;
 252           sched_setscheduler (w->lwp, tm->sched_policy, &sched_param);
 253         }
 254     }
 255
 256   /* assign threads to cores and set n_vlib_mains */
 257   tr = tm->next;
 258
 259   while (tr)
 260     {
 261       vec_add1 (tm->registrations, tr);
 262       tr = tr->next;
 263     }
 264
 265   vec_sort_with_function (tm->registrations, sort_registrations_by_no_clone);
 266
 267   for (i = 0; i < vec_len (tm->registrations); i++)
 268     {
 269       int j;
 270       tr = tm->registrations[i];
 271       tr->first_index = first_index;
 272       first_index += tr->count;
 273       n_vlib_mains += (tr->no_data_structure_clone == 0) ? tr->count : 0;
 274
 275       /* construct coremask */
 276       if (tr->use_pthreads || !tr->count)
 277         continue;
 278
 279       if (tr->coremask)
 280         {
 281           uword c;
 282           clib_bitmap_foreach (c, tr->coremask)  {
 283             if (clib_bitmap_get(avail_cpu, c) == 0)
 284               return clib_error_return (0, "cpu %u is not available to be used"
 285                                         " for the '%s' thread",c, tr->name);
 286
 287             avail_cpu = clib_bitmap_set(avail_cpu, c, 0);
 288           }
 289         }
 290       else
 291         {
 292           for (j = 0; j < tr->count; j++)
 293             {
 294               /* Do not use CPU 0 by default - leave it to the host and IRQs */
 295               uword avail_c0 = clib_bitmap_get (avail_cpu, 0);
 296               avail_cpu = clib_bitmap_set (avail_cpu, 0, 0);
 297
 298               uword c = clib_bitmap_first_set (avail_cpu);
 299               /* Use CPU 0 as a last resort */
 300               if (c == ~0 && avail_c0)
 301                 {
 302                   c = 0;
 303                   avail_c0 = 0;
 304                 }
 305
 306               if (c == ~0)
 307                 return clib_error_return (0,
 308                                           "no available cpus to be used for"
 309                                           " the '%s' thread #%u",
 310                                           tr->name, tr->count);
 311
 312               avail_cpu = clib_bitmap_set (avail_cpu, 0, avail_c0);
 313               avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
 314               tr->coremask = clib_bitmap_set (tr->coremask, c, 1);
 315             }
 316         }
 317     }
 318
 319   clib_bitmap_free (avail_cpu);
 320
 321   tm->n_vlib_mains = n_vlib_mains;
 322   vlib_stats_set_gauge (stats_num_worker_threads_dir_index, n_vlib_mains - 1);
 323
 324   /*
 325    * Allocate the remaining worker threads, and thread stack vector slots
 326    * from now on, calls to os_get_nthreads() will return the correct
 327    * answer.
 328    */
 329   vec_validate_aligned (vlib_worker_threads, first_index - 1,
 330                         CLIB_CACHE_LINE_BYTES);
 331   vec_validate (vlib_thread_stacks, vec_len (vlib_worker_threads) - 1);
 332   return 0;
 333 }
 334
 335 vlib_frame_queue_t *
 336 vlib_frame_queue_alloc (int nelts)
 337 {
 338   vlib_frame_queue_t *fq;
 339
 340   fq = clib_mem_alloc_aligned (sizeof (*fq), CLIB_CACHE_LINE_BYTES);
 341   clib_memset (fq, 0, sizeof (*fq));
 342   fq->nelts = nelts;
 343   fq->vector_threshold = 2 * VLIB_FRAME_SIZE;
 344   vec_validate_aligned (fq->elts, nelts - 1, CLIB_CACHE_LINE_BYTES);
 345
 346   if (nelts & (nelts - 1))
 347     {
 348       fformat (stderr, "FATAL: nelts MUST be a power of 2\n");
 349       abort ();
 350     }
 351
 352   return (fq);
 353 }
 354
 355 void vl_msg_api_handler_no_free (void *) __attribute__ ((weak));
 356 void
 357 vl_msg_api_handler_no_free (void *v)
 358 {
 359 }
 360
 361 /* To be called by vlib worker threads upon startup */
 362 void
 363 vlib_worker_thread_init (vlib_worker_thread_t * w)
 364 {
 365   vlib_thread_main_t *tm = vlib_get_thread_main ();
 366
 367   /*
 368    * Note: disabling signals in worker threads as follows
 369    * prevents the api post-mortem dump scheme from working
 370    * {
 371    *    sigset_t s;
 372    *    sigfillset (&s);
 373    *    pthread_sigmask (SIG_SETMASK, &s, 0);
 374    *  }
 375    */
 376
 377   clib_mem_set_heap (w->thread_mheap);
 378
 379   if (vec_len (tm->thread_prefix) && w->registration->short_name)
 380     {
 381       w->name = format (0, "%v_%s_%d%c", tm->thread_prefix,
 382                         w->registration->short_name, w->instance_id, '\0');
 383       vlib_set_thread_name ((char *) w->name);
 384     }
 385
 386   if (!w->registration->use_pthreads)
 387     {
 388
 389       /* Initial barrier sync, for both worker and i/o threads */
 390       clib_atomic_fetch_add (vlib_worker_threads->workers_at_barrier, 1);
 391
 392       while (*vlib_worker_threads->wait_at_barrier)
 393         ;
 394
 395       clib_atomic_fetch_add (vlib_worker_threads->workers_at_barrier, -1);
 396     }
 397 }
 398
 399 void *
 400 vlib_worker_thread_bootstrap_fn (void *arg)
 401 {
 402   vlib_worker_thread_t *w = arg;
 403
 404   w->lwp = syscall (SYS_gettid);
 405   w->thread_id = pthread_self ();
 406
 407   __os_thread_index = w - vlib_worker_threads;
 408
 409   if (CLIB_DEBUG > 0)
 410     {
 411       void *frame_addr = __builtin_frame_address (0);
 412       if (frame_addr < (void *) w->thread_stack ||
 413           frame_addr > (void *) w->thread_stack + VLIB_THREAD_STACK_SIZE)
 414         {
 415           /* heap is not set yet */
 416           fprintf (stderr, "thread stack is not set properly\n");
 417           exit (1);
 418         }
 419     }
 420
 421   w->thread_function (arg);
 422
 423   return 0;
 424 }
 425
 426 void
 427 vlib_get_thread_core_numa (vlib_worker_thread_t * w, unsigned cpu_id)
 428 {
 429   clib_bitmap_t *nbmp = 0, *cbmp = 0;
 430   int node, core_id = -1, numa_id = -1;
 431
 432   core_id = os_get_cpu_phys_core_id (cpu_id);
 433   nbmp = os_get_online_cpu_node_bitmap ();
 434
 435   clib_bitmap_foreach (node, nbmp)  {
 436       cbmp = os_get_cpu_on_node_bitmap (node);
 437       if (clib_bitmap_get (cbmp, cpu_id))
 438         numa_id = node;
 439       vec_reset_length (cbmp);
 440   }
 441
 442   vec_free (nbmp);
 443   vec_free (cbmp);
 444
 445   w->core_id = core_id;
 446   w->numa_id = numa_id;
 447 }
 448
 449 static clib_error_t *
 450 vlib_launch_thread_int (void *fp, vlib_worker_thread_t * w, unsigned cpu_id)
 451 {
 452   clib_mem_main_t *mm = &clib_mem_main;
 453   vlib_thread_main_t *tm = &vlib_thread_main;
 454   pthread_t worker;
 455   pthread_attr_t attr;
 456   cpu_set_t cpuset;
 457   void *(*fp_arg) (void *) = fp;
 458   void *numa_heap;
 459
 460   w->cpu_id = cpu_id;
 461   vlib_get_thread_core_numa (w, cpu_id);
 462
 463   /* Set up NUMA-bound heap if indicated */
 464   if (mm->per_numa_mheaps[w->numa_id] == 0)
 465     {
 466       /* If the user requested a NUMA heap, create it... */
 467       if (tm->numa_heap_size)
 468         {
 469           clib_mem_set_numa_affinity (w->numa_id, 1 /* force */ );
 470           numa_heap = clib_mem_create_heap (0 /* DIY */ , tm->numa_heap_size,
 471                                             1 /* is_locked */ ,
 472                                             "numa %u heap", w->numa_id);
 473           clib_mem_set_default_numa_affinity ();
 474           mm->per_numa_mheaps[w->numa_id] = numa_heap;
 475         }
 476       else
 477         {
 478           /* Or, use the main heap */
 479           mm->per_numa_mheaps[w->numa_id] = w->thread_mheap;
 480         }
 481     }
 482
 483       CPU_ZERO (&cpuset);
 484       CPU_SET (cpu_id, &cpuset);
 485
 486       if (pthread_attr_init (&attr))
 487         return clib_error_return_unix (0, "pthread_attr_init");
 488
 489       if (pthread_attr_setstack (&attr, w->thread_stack,
 490                                  VLIB_THREAD_STACK_SIZE))
 491         return clib_error_return_unix (0, "pthread_attr_setstack");
 492
 493       if (pthread_create (&worker, &attr, fp_arg, (void *) w))
 494         return clib_error_return_unix (0, "pthread_create");
 495
 496       if (pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset))
 497         return clib_error_return_unix (0, "pthread_setaffinity_np");
 498
 499       if (pthread_attr_destroy (&attr))
 500         return clib_error_return_unix (0, "pthread_attr_destroy");
 501
 502       return 0;
 503 }
 504
/*
 * Rebuild vlib_worker_threads and launch every registered thread.
 *
 * Steps, in order:
 *  - reset vlib_worker_threads and re-add entry 0 for the main thread;
 *  - allocate the elog lock and size vgm->vlib_mains for tm->n_vlib_mains,
 *    with 'vm' as entry 0;
 *  - if n_vlib_mains > 1: allocate the barrier words (wait_at_barrier is
 *    set to 1, which is what vlib_worker_thread_init () spins on), then for
 *    every instance of every registration add a worker entry and, unless
 *    the registration sets no_data_structure_clone, clone the first
 *    vlib_main_t together with its node state for that worker;
 *  - otherwise just add one worker entry per registration instance;
 *  - launch the threads: registrations using pthreads get 'count' threads
 *    launched with cpu argument 0, the others get one thread per cpu set in
 *    tr->coremask;
 *  - barrier-sync, call the num_workers_change registrations, release.
 *
 * Launch failures are passed to clib_unix_error ().  The function itself
 * always returns 0.
 */
static clib_error_t *
start_workers (vlib_main_t * vm)
{
  vlib_global_main_t *vgm = vlib_get_global_main ();
  vlib_main_t *fvm = vlib_get_first_main ();
  int i, j;
  vlib_worker_thread_t *w;
  vlib_main_t *vm_clone;
  void *oldheap;
  vlib_thread_main_t *tm = &vlib_thread_main;
  vlib_thread_registration_t *tr;
  vlib_node_runtime_t *rt;
  u32 n_vlib_mains = tm->n_vlib_mains;
  u32 worker_thread_index;
  u32 stats_err_entry_index = fvm->error_main.stats_err_entry_index;
  clib_mem_heap_t *main_heap = clib_mem_get_per_cpu_heap ();
  vlib_stats_register_mem_heap (main_heap);

  /* Start from an empty vector; entry 0 is re-added just below */
  vec_reset_length (vlib_worker_threads);

  /* Set up the main thread */
  vec_add2_aligned (vlib_worker_threads, w, 1, CLIB_CACHE_LINE_BYTES);
  w->elog_track.name = "main thread";
  elog_track_register (vlib_get_elog_main (), &w->elog_track);

  if (vec_len (tm->thread_prefix))
    {
      w->name = format (0, "%v_main%c", tm->thread_prefix, '\0');
      vlib_set_thread_name ((char *) w->name);
    }

  vgm->elog_main.lock =
    clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
  vgm->elog_main.lock[0] = 0;

  clib_callback_data_init (&vm->vlib_node_runtime_perf_callbacks,
                           &vm->worker_thread_main_loop_callback_lock);

  /* Size the vector for all mains, then reset its length so entries are
     appended one by one; slot 0 is the caller's vm */
  vec_validate_aligned (vgm->vlib_mains, n_vlib_mains - 1,
                        CLIB_CACHE_LINE_BYTES);
  vec_set_len (vgm->vlib_mains, 0);
  vec_add1_aligned (vgm->vlib_mains, vm, CLIB_CACHE_LINE_BYTES);

  if (n_vlib_mains > 1)
    {
      vlib_worker_threads->wait_at_barrier =
        clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
      vlib_worker_threads->workers_at_barrier =
        clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);

      vlib_worker_threads->node_reforks_required =
        clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);

      /* We'll need the rpc vector lock... */
      clib_spinlock_init (&vm->pending_rpc_lock);

      /* Ask for an initial barrier sync */
      *vlib_worker_threads->workers_at_barrier = 0;
      *vlib_worker_threads->wait_at_barrier = 1;

      /* Without update or refork */
      *vlib_worker_threads->node_reforks_required = 0;
      vgm->need_vlib_worker_thread_node_runtime_update = 0;

      /* init timing */
      vm->barrier_epoch = 0;
      vm->barrier_no_close_before = 0;

      worker_thread_index = 1;
      clib_spinlock_init (&vm->worker_thread_main_loop_callback_lock);

      for (i = 0; i < vec_len (tm->registrations); i++)
        {
          vlib_node_main_t *nm, *nm_clone;
          int k;

          tr = tm->registrations[i];

          if (tr->count == 0)
            continue;

          for (k = 0; k < tr->count; k++)
            {
              vlib_node_t *n;
              u64 **c;

              vec_add2 (vlib_worker_threads, w, 1);
              /* Currently unused, may not really work */
              if (tr->mheap_size)
                w->thread_mheap = clib_mem_create_heap (0, tr->mheap_size,
                                                        /* unlocked */ 0,
                                                        "%s%d heap",
                                                        tr->name, k);
              else
                w->thread_mheap = main_heap;

              w->thread_stack =
                vlib_thread_stack_init (w - vlib_worker_threads);
              w->thread_function = tr->function;
              w->thread_function_arg = w;
              w->instance_id = k;
              w->registration = tr;

              w->elog_track.name =
                (char *) format (0, "%s %d", tr->name, k + 1);
              vec_add1 (w->elog_track.name, 0);
              elog_track_register (vlib_get_elog_main (), &w->elog_track);

              /* No vlib_main_t clone for this registration: the worker
                 entry above is all it gets */
              if (tr->no_data_structure_clone)
                continue;

              /* Fork vlib_global_main et al. Look for bugs here */
              oldheap = clib_mem_set_heap (w->thread_mheap);

              /* Start from a byte copy of the first main; the fields that
                 must be per-thread are replaced below */
              vm_clone = clib_mem_alloc_aligned (sizeof (*vm_clone),
                                                 CLIB_CACHE_LINE_BYTES);
              clib_memcpy (vm_clone, vlib_get_first_main (),
                           sizeof (*vm_clone));

              vm_clone->thread_index = worker_thread_index;
              vm_clone->pending_rpc_requests = 0;
              vec_validate (vm_clone->pending_rpc_requests, 0);
              vec_set_len (vm_clone->pending_rpc_requests, 0);
              clib_memset (&vm_clone->random_buffer, 0,
                           sizeof (vm_clone->random_buffer));
              clib_spinlock_init
                (&vm_clone->worker_thread_main_loop_callback_lock);
              clib_callback_data_init
                (&vm_clone->vlib_node_runtime_perf_callbacks,
                 &vm_clone->worker_thread_main_loop_callback_lock);

              nm = &vlib_get_first_main ()->node_main;
              nm_clone = &vm_clone->node_main;
              /* fork next frames array, preserving node runtime indices */
              nm_clone->next_frames = vec_dup_aligned (nm->next_frames,
                                                       CLIB_CACHE_LINE_BYTES);
              for (j = 0; j < vec_len (nm_clone->next_frames); j++)
                {
                  vlib_next_frame_t *nf = &nm_clone->next_frames[j];
                  u32 save_node_runtime_index;
                  u32 save_flags;

                  /* re-init each next frame, keeping only its runtime index
                     and the NO_FREE_AFTER_DISPATCH flag bit */
                  save_node_runtime_index = nf->node_runtime_index;
                  save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
                  vlib_next_frame_init (nf);
                  nf->node_runtime_index = save_node_runtime_index;
                  nf->flags = save_flags;
                }

              /* fork the frame dispatch queue */
              nm_clone->pending_frames = 0;
              vec_validate (nm_clone->pending_frames, 10);
              vec_set_len (nm_clone->pending_frames, 0);

              /* fork nodes */
              nm_clone->nodes = 0;

              /* Allocate all nodes in single block for speed */
              n = clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*n));

              for (j = 0; j < vec_len (nm->nodes); j++)
                {
                  clib_memcpy (n, nm->nodes[j], sizeof (*n));
                  /* none of the copied nodes have enqueue rights given out */
                  n->owner_node_index = VLIB_INVALID_NODE_INDEX;
                  clib_memset (&n->stats_total, 0, sizeof (n->stats_total));
                  clib_memset (&n->stats_last_clear, 0,
                               sizeof (n->stats_last_clear));
                  vec_add1 (nm_clone->nodes, n);
                  n++;
                }
              nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
                vec_dup_aligned (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL],
                                 CLIB_CACHE_LINE_BYTES);
              vec_foreach (rt,
                           nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
              {
                vlib_node_t *n = vlib_get_node (vm, rt->node_index);
                /* copy initial runtime_data from node */
                if (n->runtime_data && n->runtime_data_bytes > 0)
                  clib_memcpy (rt->runtime_data, n->runtime_data,
                               clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
                                         n->runtime_data_bytes));
              }

              nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
                vec_dup_aligned (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT],
                                 CLIB_CACHE_LINE_BYTES);
              clib_interrupt_init (
                &nm_clone->input_node_interrupts,
                vec_len (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]));
              /* nodes_by_type[PRE_INPUT] is still the pointer byte-copied
                 from the first main here; it is duplicated further below */
              clib_interrupt_init (
                &nm_clone->pre_input_node_interrupts,
                vec_len (nm_clone->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT]));
              vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
              {
                vlib_node_t *n = vlib_get_node (vm, rt->node_index);
                /* copy initial runtime_data from node */
                if (n->runtime_data && n->runtime_data_bytes > 0)
                  clib_memcpy (rt->runtime_data, n->runtime_data,
                               clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
                                         n->runtime_data_bytes));
              }

              nm_clone->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT] =
                vec_dup_aligned (nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT],
                                 CLIB_CACHE_LINE_BYTES);
              vec_foreach (rt,
                           nm_clone->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT])
              {
                vlib_node_t *n = vlib_get_node (vm, rt->node_index);
                /* copy initial runtime_data from node */
                if (n->runtime_data && n->runtime_data_bytes > 0)
                  clib_memcpy (rt->runtime_data, n->runtime_data,
                               clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
                                         n->runtime_data_bytes));
              }

              nm_clone->processes = vec_dup_aligned (nm->processes,
                                                     CLIB_CACHE_LINE_BYTES);

              /* Create per-thread frame freelist */
              nm_clone->frame_sizes = 0;
              nm_clone->node_by_error = nm->node_by_error;

              /* Packet trace buffers are guaranteed to be empty, nothing to do here */

              clib_mem_set_heap (oldheap);
              vec_add1_aligned (vgm->vlib_mains, vm_clone,
                                CLIB_CACHE_LINE_BYTES);

              /* Switch to the stats segment ... */
              vlib_stats_validate (stats_err_entry_index, worker_thread_index,
                                   vec_len (fvm->error_main.counters) - 1);
              c = vlib_stats_get_entry_data_pointer (stats_err_entry_index);
              vm_clone->error_main.counters = c[worker_thread_index];

              vm_clone->error_main.counters_last_clear = vec_dup_aligned (
                vlib_get_first_main ()->error_main.counters_last_clear,
                CLIB_CACHE_LINE_BYTES);

              worker_thread_index++;
            }
        }
    }
  else
    {
      /* only have non-data-structure copy threads to create... */
      for (i = 0; i < vec_len (tm->registrations); i++)
        {
          tr = tm->registrations[i];

          for (j = 0; j < tr->count; j++)
            {
              vec_add2 (vlib_worker_threads, w, 1);
              if (tr->mheap_size)
                {
                  w->thread_mheap = clib_mem_create_heap (0, tr->mheap_size,
                                                          /* unlocked */ 0,
                                                          "%s%d heap",
                                                          tr->name, j);
                }
              else
                w->thread_mheap = main_heap;
              w->thread_stack =
                vlib_thread_stack_init (w - vlib_worker_threads);
              w->thread_function = tr->function;
              w->thread_function_arg = w;
              w->instance_id = j;
              w->elog_track.name =
                (char *) format (0, "%s %d", tr->name, j + 1);
              w->registration = tr;
              vec_add1 (w->elog_track.name, 0);
              elog_track_register (vlib_get_elog_main (), &w->elog_track);
            }
        }
    }

  /* Launch pass: walk vlib_worker_threads from entry 1, in the same
     registration order in which the entries were added above */
  worker_thread_index = 1;

  for (i = 0; i < vec_len (tm->registrations); i++)
    {
      clib_error_t *err;
      int j;

      tr = tm->registrations[i];

      if (tr->use_pthreads || tm->use_pthreads)
        {
          for (j = 0; j < tr->count; j++)
            {

              w = vlib_worker_threads + worker_thread_index++;
              err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
                                            w, 0);
              if (err)
                clib_unix_error ("%U, thread %s init on cpu %d failed",
                                 format_clib_error, err, tr->name, 0);
            }
        }
      else
        {
          /* one thread per cpu set in the registration's coremask */
          uword c;
          clib_bitmap_foreach (c, tr->coremask)  {
            w = vlib_worker_threads + worker_thread_index++;
            err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
                                          w, c);
            if (err)
              clib_unix_error ("%U, thread %s init on cpu %d failed",
                               format_clib_error, err, tr->name, c);
            }
        }
    }
  /* Run the num_workers_change registrations between a barrier sync and
     its release */
  vlib_worker_thread_barrier_sync (vm);
  {
    clib_error_t *err;
    err = vlib_call_init_exit_functions (
      vm, &vgm->num_workers_change_function_registrations, 1 /* call_once */,
      1 /* is_global */);
    if (err)
      clib_error_report (err);
  }
  vlib_worker_thread_barrier_release (vm);
  return 0;
}
 830
/* Register start_workers with VLIB_MAIN_LOOP_ENTER_FUNCTION; presumably it
   is then invoked when the main loop is entered - confirm against the macro
   definition. */
VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers);
 832
 833
 834 static inline void
 835 worker_thread_node_runtime_update_internal (void)
 836 {
 837   int i, j;
 838   vlib_main_t *vm;
 839   vlib_node_main_t *nm, *nm_clone;
 840   vlib_main_t *vm_clone;
 841   vlib_node_runtime_t *rt;
 842
 843   ASSERT (vlib_get_thread_index () == 0);
 844
 845   vm = vlib_get_first_main ();
 846   nm = &vm->node_main;
 847
 848   ASSERT (*vlib_worker_threads->wait_at_barrier == 1);
 849
 850   /*
 851    * Scrape all runtime stats, so we don't lose node runtime(s) with
 852    * pending counts, or throw away worker / io thread counts.
 853    */
 854   for (j = 0; j < vec_len (nm->nodes); j++)
 855     {
 856       vlib_node_t *n;
 857       n = nm->nodes[j];
 858       vlib_node_sync_stats (vm, n);
 859     }
 860
 861   for (i = 1; i < vlib_get_n_threads (); i++)
 862     {
 863       vlib_node_t *n;
 864
 865       vm_clone = vlib_get_main_by_index (i);
 866       nm_clone = &vm_clone->node_main;
 867
 868       for (j = 0; j < vec_len (nm_clone->nodes); j++)
 869         {
 870           n = nm_clone->nodes[j];
 871
 872           rt = vlib_node_get_runtime (vm_clone, n->index);
 873           vlib_node_runtime_sync_stats (vm_clone, rt, 0, 0, 0);
 874         }
 875     }
 876
 877   /* Per-worker clone rebuilds are now done on each thread */
 878 }
 879
 880
/*
 * Rebuild the calling thread's copy of the node graph from the first
 * (main) vlib_main_t.
 *
 * Source is vlib_get_first_main (), destination is vlib_get_main ().
 * In order, this re-clones: the error main (keeping this thread's
 * counters_last_clear vector), the next frames, the node array, the
 * internal / input / pre-input node runtimes, the process vector and
 * the node_by_error pointer. Statistics, state, flags and runtime data
 * of nodes / runtimes that already existed in the old clone are carried
 * over into the new one.
 */
void
vlib_worker_thread_node_refork (void)
{
  vlib_main_t *vm, *vm_clone;
  vlib_node_main_t *nm, *nm_clone;
  vlib_node_t **old_nodes_clone;
  vlib_node_runtime_t *rt, *old_rt;
  u64 **c;

  vlib_node_t *new_n_clone;

  int j;

  vm = vlib_get_first_main ();
  nm = &vm->node_main;
  vm_clone = vlib_get_main ();
  nm_clone = &vm_clone->node_main;

  /* Re-clone error heap */
  /* Saved first: the struct copy below overwrites the clone's pointer */
  u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear;

  clib_memcpy_fast (&vm_clone->error_main, &vm->error_main,
                    sizeof (vm->error_main));
  /* Last valid index of main's counters vector */
  j = vec_len (vm->error_main.counters) - 1;

  /* Point the clone at its own per-thread slot of the stats entry */
  c = vlib_stats_get_entry_data_pointer (vm->error_main.stats_err_entry_index);
  vm_clone->error_main.counters = c[vm_clone->thread_index];

  /* Restore this thread's last-clear vector, validated up to index j */
  vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES);
  vm_clone->error_main.counters_last_clear = old_counters_all_clear;

  /* Release frames still attached to the old next_frames before freeing */
  for (j = 0; j < vec_len (nm_clone->next_frames); j++)
    {
      vlib_next_frame_t *nf = &nm_clone->next_frames[j];
      if ((nf->flags & VLIB_FRAME_IS_ALLOCATED) && nf->frame != NULL)
        {
          vlib_frame_t *f = nf->frame;
          nf->frame = NULL;
          vlib_frame_free (vm_clone, f);
        }
    }

  vec_free (nm_clone->next_frames);
  nm_clone->next_frames = vec_dup_aligned (nm->next_frames,
                                           CLIB_CACHE_LINE_BYTES);

  /*
   * Re-initialise every copied next frame, keeping only its
   * node_runtime_index and the VLIB_FRAME_NO_FREE_AFTER_DISPATCH flag.
   */
  for (j = 0; j < vec_len (nm_clone->next_frames); j++)
    {
      vlib_next_frame_t *nf = &nm_clone->next_frames[j];
      u32 save_node_runtime_index;
      u32 save_flags;

      save_node_runtime_index = nf->node_runtime_index;
      save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
      vlib_next_frame_init (nf);
      nf->node_runtime_index = save_node_runtime_index;
      nf->flags = save_flags;
    }

  /* Detach the old node vector; it is still read below for stats/state */
  old_nodes_clone = nm_clone->nodes;
  nm_clone->nodes = 0;

  /* re-fork nodes */

  /* Allocate all nodes in single block for speed */
  new_n_clone =
    clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*new_n_clone));
  for (j = 0; j < vec_len (nm->nodes); j++)
    {
      vlib_node_t *new_n = nm->nodes[j];

      clib_memcpy_fast (new_n_clone, new_n, sizeof (*new_n));
      /* none of the copied nodes have enqueue rights given out */
      new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX;

      if (j >= vec_len (old_nodes_clone))
        {
          /* new node, set to zero */
          clib_memset (&new_n_clone->stats_total, 0,
                       sizeof (new_n_clone->stats_total));
          clib_memset (&new_n_clone->stats_last_clear, 0,
                       sizeof (new_n_clone->stats_last_clear));
        }
      else
        {
          vlib_node_t *old_n_clone = old_nodes_clone[j];
          /* Copy stats if the old data is valid */
          clib_memcpy_fast (&new_n_clone->stats_total,
                            &old_n_clone->stats_total,
                            sizeof (new_n_clone->stats_total));
          clib_memcpy_fast (&new_n_clone->stats_last_clear,
                            &old_n_clone->stats_last_clear,
                            sizeof (new_n_clone->stats_last_clear));

          /* keep previous node state */
          new_n_clone->state = old_n_clone->state;
          new_n_clone->flags = old_n_clone->flags;
        }
      vec_add1 (nm_clone->nodes, new_n_clone);
      new_n_clone++;
    }
  /* Free the old node clones */
  /*
   * Element 0 is freed as the base of the old single-block allocation.
   * NOTE(review): old_nodes_clone[0] is read unconditionally; this assumes
   * the previous clone held at least one node - confirm.
   */
  clib_mem_free (old_nodes_clone[0]);

  vec_free (old_nodes_clone);


  /* re-clone internal nodes */
  old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL];
  nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
    vec_dup_aligned (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL],
                     CLIB_CACHE_LINE_BYTES);

  vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
  {
    vlib_node_t *n = vlib_get_node (vm, rt->node_index);
    /* copy runtime_data, will be overwritten later for existing rt */
    if (n->runtime_data && n->runtime_data_bytes > 0)
      clib_memcpy_fast (rt->runtime_data, n->runtime_data,
                        clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
                                  n->runtime_data_bytes));
  }

  /* Carry state, flags and runtime data over from the old runtimes */
  for (j = 0; j < vec_len (old_rt); j++)
    {
      rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
      rt->state = old_rt[j].state;
      rt->flags = old_rt[j].flags;
      clib_memcpy_fast (rt->runtime_data, old_rt[j].runtime_data,
                        VLIB_NODE_RUNTIME_DATA_SIZE);
    }

  vec_free (old_rt);

  /* re-clone input nodes */
  old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT];
  nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
    vec_dup_aligned (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT],
                     CLIB_CACHE_LINE_BYTES);
  clib_interrupt_resize (
    &nm_clone->input_node_interrupts,
    vec_len (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]));
  /*
   * NOTE(review): this is sized from the clone's PRE_INPUT vector as it
   * stands here, i.e. before that vector is re-cloned further down -
   * confirm this ordering is intended.
   */
  clib_interrupt_resize (
    &nm_clone->pre_input_node_interrupts,
    vec_len (nm_clone->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT]));

  vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
  {
    vlib_node_t *n = vlib_get_node (vm, rt->node_index);
    /* copy runtime_data, will be overwritten later for existing rt */
    if (n->runtime_data && n->runtime_data_bytes > 0)
      clib_memcpy_fast (rt->runtime_data, n->runtime_data,
                        clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
                                  n->runtime_data_bytes));
  }

  /* Carry state, flags and runtime data over from the old runtimes */
  for (j = 0; j < vec_len (old_rt); j++)
    {
      rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
      rt->state = old_rt[j].state;
      rt->flags = old_rt[j].flags;
      clib_memcpy_fast (rt->runtime_data, old_rt[j].runtime_data,
                        VLIB_NODE_RUNTIME_DATA_SIZE);
    }

  vec_free (old_rt);

  /* re-clone pre-input nodes */
  old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT];
  nm_clone->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT] =
    vec_dup_aligned (nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT],
                     CLIB_CACHE_LINE_BYTES);

  vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT])
  {
    vlib_node_t *n = vlib_get_node (vm, rt->node_index);
    /* copy runtime_data, will be overwritten later for existing rt */
    if (n->runtime_data && n->runtime_data_bytes > 0)
      clib_memcpy_fast (rt->runtime_data, n->runtime_data,
                        clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
                                  n->runtime_data_bytes));
  }

  /* Carry state, flags and runtime data over from the old runtimes */
  for (j = 0; j < vec_len (old_rt); j++)
    {
      rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
      rt->state = old_rt[j].state;
      rt->flags = old_rt[j].flags;
      clib_memcpy_fast (rt->runtime_data, old_rt[j].runtime_data,
                        VLIB_NODE_RUNTIME_DATA_SIZE);
    }

  vec_free (old_rt);

  /* Processes are duplicated; node_by_error is shared by pointer */
  vec_free (nm_clone->processes);
  nm_clone->processes = vec_dup_aligned (nm->processes,
                                         CLIB_CACHE_LINE_BYTES);
  nm_clone->node_by_error = nm->node_by_error;
}
1080
1081 void
1082 vlib_worker_thread_node_runtime_update (void)
1083 {
1084   /*
1085    * Make a note that we need to do a node runtime update
1086    * prior to releasing the barrier.
1087    */
1088   vlib_global_main.need_vlib_worker_thread_node_runtime_update = 1;
1089 }
1090
/*
 * unformat helper that parses a scheduler policy name.
 *
 * Takes one va_arg, a u32 *, which receives SCHED_POLICY_<f> for the first
 * foreach_sched_policy entry whose string matches the input.
 *
 * Returns 1 when an entry matched, 0 otherwise (the output is then left
 * untouched).
 */
u32
unformat_sched_policy (unformat_input_t * input, va_list * args)
{
  u32 *r = va_arg (*args, u32 *);

  /* "if (0);" opens the chain so every generated entry is an "else if" */
  if (0);
#define _(v,f,s) else if (unformat (input, s)) *r = SCHED_POLICY_##f;
  foreach_sched_policy
#undef _
    else
    return 0;
  return 1;
}
1104
1105 static clib_error_t *
1106 cpu_config (vlib_main_t * vm, unformat_input_t * input)
1107 {
1108   vlib_thread_registration_t *tr;
1109   uword *p;
1110   vlib_thread_main_t *tm = &vlib_thread_main;
1111   u8 *name;
1112   uword *bitmap;
1113   u32 count;
1114
1115   tm->thread_registrations_by_name = hash_create_string (0, sizeof (uword));
1116
1117   tm->n_thread_stacks = 1;      /* account for main thread */
1118   tm->sched_policy = ~0;
1119   tm->sched_priority = ~0;
1120   tm->main_lcore = ~0;
1121
1122   tr = tm->next;
1123
1124   while (tr)
1125     {
1126       hash_set_mem (tm->thread_registrations_by_name, tr->name, (uword) tr);
1127       tr = tr->next;
1128     }
1129
1130   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1131     {
1132       if (unformat (input, "use-pthreads"))
1133         tm->use_pthreads = 1;
1134       else if (unformat (input, "thread-prefix %v", &tm->thread_prefix))
1135         ;
1136       else if (unformat (input, "main-core %u", &tm->main_lcore))
1137         ;
1138       else if (unformat (input, "skip-cores %u", &tm->skip_cores))
1139         ;
1140       else if (unformat (input, "numa-heap-size %U",
1141                          unformat_memory_size, &tm->numa_heap_size))
1142         ;
1143       else if (unformat (input, "coremask-%s %U", &name,
1144                          unformat_bitmap_mask, &bitmap) ||
1145                unformat (input, "corelist-%s %U", &name,
1146                          unformat_bitmap_list, &bitmap))
1147         {
1148           p = hash_get_mem (tm->thread_registrations_by_name, name);
1149           if (p == 0)
1150             return clib_error_return (0, "no such thread type '%s'", name);
1151
1152           tr = (vlib_thread_registration_t *) p[0];
1153
1154           if (tr->use_pthreads)
1155             return clib_error_return (0,
1156                                       "corelist cannot be set for '%s' threads",
1157                                       name);
1158           if (tr->count)
1159             return clib_error_return
1160               (0, "core placement of '%s' threads is already configured",
1161                name);
1162
1163           tr->coremask = bitmap;
1164           tr->count = clib_bitmap_count_set_bits (tr->coremask);
1165         }
1166       else
1167         if (unformat
1168             (input, "scheduler-policy %U", unformat_sched_policy,
1169              &tm->sched_policy))
1170         ;
1171       else if (unformat (input, "scheduler-priority %u", &tm->sched_priority))
1172         ;
1173       else if (unformat (input, "%s %u", &name, &count))
1174         {
1175           p = hash_get_mem (tm->thread_registrations_by_name, name);
1176           if (p == 0)
1177             return clib_error_return (0, "no such thread type 3 '%s'", name);
1178
1179           tr = (vlib_thread_registration_t *) p[0];
1180
1181           if (tr->fixed_count)
1182             return clib_error_return
1183               (0, "number of '%s' threads not configurable", name);
1184           if (tr->count)
1185             return clib_error_return
1186               (0, "number of '%s' threads is already configured", name);
1187
1188           tr->count = count;
1189         }
1190       else
1191         break;
1192     }
1193
1194   if (tm->sched_priority != ~0)
1195     {
1196       if (tm->sched_policy == SCHED_FIFO || tm->sched_policy == SCHED_RR)
1197         {
1198           u32 prio_max = sched_get_priority_max (tm->sched_policy);
1199           u32 prio_min = sched_get_priority_min (tm->sched_policy);
1200           if (tm->sched_priority > prio_max)
1201             tm->sched_priority = prio_max;
1202           if (tm->sched_priority < prio_min)
1203             tm->sched_priority = prio_min;
1204         }
1205       else
1206         {
1207           return clib_error_return
1208             (0,
1209              "scheduling priority (%d) is not allowed for `normal` scheduling policy",
1210              tm->sched_priority);
1211         }
1212     }
1213   tr = tm->next;
1214
1215   if (!tm->thread_prefix)
1216     tm->thread_prefix = format (0, "vpp");
1217
1218   while (tr)
1219     {
1220       tm->n_thread_stacks += tr->count;
1221       tm->n_pthreads += tr->count * tr->use_pthreads;
1222       tm->n_threads += tr->count * (tr->use_pthreads == 0);
1223       tr = tr->next;
1224     }
1225
1226   return 0;
1227 }
1228
1229 VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu");
1230
  /*
   * Enforce minimum open time to minimize packet loss due to Rx overflow,
   * based on a test-based heuristic that the barrier should be open for at
   * least 3 times as long as it is closed (with an upper bound of 1ms
   * because by that point it is probably too late to make a difference)
   */

/* Upper bound on the enforced open time; 0.001 is the 1ms bound above */
#ifndef BARRIER_MINIMUM_OPEN_LIMIT
#define BARRIER_MINIMUM_OPEN_LIMIT 0.001
#endif

/* Multiplier applied to the closed time to get the minimum open time */
#ifndef BARRIER_MINIMUM_OPEN_FACTOR
#define BARRIER_MINIMUM_OPEN_FACTOR 3
#endif
1245
1246 void
1247 vlib_worker_thread_initial_barrier_sync_and_release (vlib_main_t * vm)
1248 {
1249   f64 deadline;
1250   f64 now = vlib_time_now (vm);
1251   u32 count = vlib_get_n_threads () - 1;
1252
1253   /* No worker threads? */
1254   if (count == 0)
1255     return;
1256
1257   deadline = now + BARRIER_SYNC_TIMEOUT;
1258   *vlib_worker_threads->wait_at_barrier = 1;
1259   while (*vlib_worker_threads->workers_at_barrier != count)
1260     {
1261       if ((now = vlib_time_now (vm)) > deadline)
1262         {
1263           fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
1264           os_panic ();
1265         }
1266       CLIB_PAUSE ();
1267     }
1268   *vlib_worker_threads->wait_at_barrier = 0;
1269 }
1270
1271 /**
1272  * Return true if the wroker thread barrier is held
1273  */
1274 u8
1275 vlib_worker_thread_barrier_held (void)
1276 {
1277   if (vlib_get_n_threads () < 2)
1278     return (1);
1279
1280   return (*vlib_worker_threads->wait_at_barrier == 1);
1281 }
1282
/*
 * Close the worker barrier and wait until every worker is parked at it.
 *
 * No-op with fewer than two threads. Must run on thread 0 (asserted).
 * Calls nest: only the outermost call (recursion_level going 0 -> 1)
 * actually closes the barrier, inner calls just record a trace entry.
 *
 * If any worker's last vectors-per-main-loop value exceeds 10, the close
 * is delayed until vm->barrier_no_close_before. Panics if the workers do
 * not all arrive within BARRIER_SYNC_TIMEOUT.
 *
 * @param vm         main-thread vlib_main_t.
 * @param func_name  caller name, stored in barrier_caller.
 */
void
vlib_worker_thread_barrier_sync_int (vlib_main_t * vm, const char *func_name)
{
  f64 deadline;
  f64 now;
  f64 t_entry;
  f64 t_open;
  f64 t_closed;
  f64 max_vector_rate;
  u32 count;
  int i;

  if (vlib_get_n_threads () < 2)
    return;

  ASSERT (vlib_get_thread_index () == 0);

  vlib_worker_threads[0].barrier_caller = func_name;
  /* Number of threads that must arrive at the barrier (all but main) */
  count = vlib_get_n_threads () - 1;

  /* Record entry relative to last close */
  now = vlib_time_now (vm);
  t_entry = now - vm->barrier_epoch;

  /* Tolerate recursive calls */
  if (++vlib_worker_threads[0].recursion_level > 1)
    {
      barrier_trace_sync_rec (t_entry);
      return;
    }

  if (PREDICT_FALSE (vec_len (vm->barrier_perf_callbacks) != 0))
    clib_call_callbacks (vm->barrier_perf_callbacks, vm,
                         vm->clib_time.last_cpu_time, 0 /* enter */ );

  /*
   * Need data to decide if we're working hard enough to honor
   * the barrier hold-down timer.
   */
  max_vector_rate = 0.0;
  for (i = 1; i < vlib_get_n_threads (); i++)
    {
      vlib_main_t *ovm = vlib_get_main_by_index (i);
      max_vector_rate = clib_max (max_vector_rate,
                                  (f64) vlib_last_vectors_per_main_loop (ovm));
    }

  vlib_worker_threads[0].barrier_sync_count++;

  /* Enforce minimum barrier open time to minimize packet loss */
  ASSERT (vm->barrier_no_close_before <= (now + BARRIER_MINIMUM_OPEN_LIMIT));

  /*
   * If any worker thread seems busy, which we define
   * as a vector rate above 10, we enforce the barrier hold-down timer
   */
  if (max_vector_rate > 10.0)
    {
      while (1)
        {
          now = vlib_time_now (vm);
          /* Barrier hold-down timer expired? */
          if (now >= vm->barrier_no_close_before)
            break;
          /* Remaining wait longer than twice the limit: warn and give up */
          if ((vm->barrier_no_close_before - now)
              > (2.0 * BARRIER_MINIMUM_OPEN_LIMIT))
            {
              clib_warning
                ("clock change: would have waited for %.4f seconds",
                 (vm->barrier_no_close_before - now));
              break;
            }
        }
    }
  /* Record time of closure */
  t_open = now - vm->barrier_epoch;
  vm->barrier_epoch = now;

  deadline = now + BARRIER_SYNC_TIMEOUT;

  /* Close the barrier, then spin until all workers have arrived */
  *vlib_worker_threads->wait_at_barrier = 1;
  while (*vlib_worker_threads->workers_at_barrier != count)
    {
      if ((now = vlib_time_now (vm)) > deadline)
        {
          fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
          os_panic ();
        }
    }

  /* Elapsed time since the closure recorded in barrier_epoch above */
  t_closed = now - vm->barrier_epoch;

  barrier_trace_sync (t_entry, t_open, t_closed);

}
1378
/*
 * Open the worker barrier and wait until every worker has left it.
 *
 * No-op with fewer than two threads. Must run on thread 0 (asserted).
 * Calls nest: only the call that brings recursion_level back to 0
 * actually releases, inner calls just record a trace entry.
 *
 * If a node runtime update was requested, the main-thread part is done
 * here under the stats segment lock, each worker is asked to refork, and
 * this function waits for all reforks before unlocking. Finally it
 * computes barrier_no_close_before, the earliest time the barrier may be
 * closed again. Panics if workers fail to leave the barrier or to finish
 * their refork within BARRIER_SYNC_TIMEOUT.
 *
 * @param vm  main-thread vlib_main_t.
 */
void
vlib_worker_thread_barrier_release (vlib_main_t * vm)
{
  vlib_global_main_t *vgm = vlib_get_global_main ();
  f64 deadline;
  f64 now;
  f64 minimum_open;
  f64 t_entry;
  f64 t_closed_total;
  f64 t_update_main = 0.0;
  int refork_needed = 0;

  if (vlib_get_n_threads () < 2)
    return;

  ASSERT (vlib_get_thread_index () == 0);


  now = vlib_time_now (vm);
  t_entry = now - vm->barrier_epoch;

  /* Nested release: only trace, the outermost call opens the barrier */
  if (--vlib_worker_threads[0].recursion_level > 0)
    {
      barrier_trace_release_rec (t_entry);
      return;
    }

  /* Update (all) node runtimes before releasing the barrier, if needed */
  if (vgm->need_vlib_worker_thread_node_runtime_update)
    {
      /*
       * Lock stat segment here, so we're safe when
       * rebuilding the stat segment node clones from the
       * stat thread...
       */
      vlib_stats_segment_lock ();

      /* Do stats elements on main thread */
      worker_thread_node_runtime_update_internal ();
      vgm->need_vlib_worker_thread_node_runtime_update = 0;

      /* Do per thread rebuilds in parallel */
      refork_needed = 1;
      /* One outstanding refork per worker; waited on further down */
      clib_atomic_fetch_add (vlib_worker_threads->node_reforks_required,
                             (vlib_get_n_threads () - 1));
      now = vlib_time_now (vm);
      t_update_main = now - vm->barrier_epoch;
    }

  deadline = now + BARRIER_SYNC_TIMEOUT;

  /*
   * Note when we let go of the barrier.
   * Workers can use this to derive a reasonably accurate
   * time offset. See vlib_time_now(...)
   */
  vm->time_last_barrier_release = vlib_time_now (vm);
  CLIB_MEMORY_STORE_BARRIER ();

  /* Open the barrier, then spin until every worker has left it */
  *vlib_worker_threads->wait_at_barrier = 0;

  while (*vlib_worker_threads->workers_at_barrier > 0)
    {
      if ((now = vlib_time_now (vm)) > deadline)
        {
          fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
          os_panic ();
        }
    }

  /* Wait for reforks before continuing */
  if (refork_needed)
    {
      now = vlib_time_now (vm);

      deadline = now + BARRIER_SYNC_TIMEOUT;

      while (*vlib_worker_threads->node_reforks_required > 0)
        {
          if ((now = vlib_time_now (vm)) > deadline)
            {
              fformat (stderr, "%s: worker thread refork deadlock\n",
                       __FUNCTION__);
              os_panic ();
            }
        }
      /* Pairs with the vlib_stats_segment_lock () taken above */
      vlib_stats_segment_unlock ();
    }

  t_closed_total = now - vm->barrier_epoch;

  /* Minimum open time: closed time times the factor, capped at the limit */
  minimum_open = t_closed_total * BARRIER_MINIMUM_OPEN_FACTOR;

  if (minimum_open > BARRIER_MINIMUM_OPEN_LIMIT)
    {
      minimum_open = BARRIER_MINIMUM_OPEN_LIMIT;
    }

  vm->barrier_no_close_before = now + minimum_open;

  /* Record barrier epoch (used to enforce minimum open time) */
  vm->barrier_epoch = now;

  barrier_trace_release (t_entry, t_closed_total, t_update_main);

  if (PREDICT_FALSE (vec_len (vm->barrier_perf_callbacks) != 0))
    clib_call_callbacks (vm->barrier_perf_callbacks, vm,
                         vm->clib_time.last_cpu_time, 1 /* leave */ );
}
1488
1489 static void
1490 vlib_worker_sync_rpc (void *args)
1491 {
1492   ASSERT (vlib_thread_is_main_w_barrier ());
1493   vlib_worker_threads->wait_before_barrier = 0;
1494 }
1495
/*
 * Park the calling thread "before the barrier" until all workers are
 * accounted for.
 *
 * No-op when there are no workers. If the barrier is not already
 * requested and no other thread has posted a request yet, an RPC to the
 * main thread (vlib_worker_sync_rpc) is queued and flushed. The caller
 * then spins until wait_at_barrier is set, registers itself in
 * workers_before_barrier, and spins until every worker is either at the
 * barrier or before it. Pair with vlib_workers_continue ().
 */
void
vlib_workers_sync (void)
{
  if (PREDICT_FALSE (!vlib_num_workers ()))
    return;

  /* The atomic swap lets exactly one thread post the RPC request */
  if (!(*vlib_worker_threads->wait_at_barrier) &&
      !clib_atomic_swap_rel_n (&vlib_worker_threads->wait_before_barrier, 1))
    {
      /* Sent as the RPC payload; vlib_worker_sync_rpc does not read it */
      u32 thread_index = vlib_get_thread_index ();
      vlib_rpc_call_main_thread (vlib_worker_sync_rpc, (u8 *) &thread_index,
                                 sizeof (thread_index));
      vlib_worker_flush_pending_rpc_requests (vlib_get_main ());
    }

  /* Wait until main thread asks for barrier */
  while (!(*vlib_worker_threads->wait_at_barrier))
    ;

  /* Stop before barrier and make sure all threads are either
   * at worker barrier or the barrier before it */
  clib_atomic_fetch_add (&vlib_worker_threads->workers_before_barrier, 1);
  while (vlib_num_workers () > (*vlib_worker_threads->workers_at_barrier +
                                vlib_worker_threads->workers_before_barrier))
    ;
}
1522
/*
 * Leave the "before the barrier" stage.
 *
 * No-op when there are no workers. Announces that this thread has
 * finished its pre-barrier work, spins until done_work_before_barrier
 * has caught up with workers_before_barrier, then removes itself from
 * both counters.
 */
void
vlib_workers_continue (void)
{
  if (PREDICT_FALSE (!vlib_num_workers ()))
    return;

  clib_atomic_fetch_add (&vlib_worker_threads->done_work_before_barrier, 1);

  /* Wait until all workers are done with work before barrier */
  while (vlib_worker_threads->done_work_before_barrier <
         vlib_worker_threads->workers_before_barrier)
    ;

  /* Undo this thread's contribution to both counters */
  clib_atomic_fetch_add (&vlib_worker_threads->done_work_before_barrier, -1);
  clib_atomic_fetch_add (&vlib_worker_threads->workers_before_barrier, -1);
}
1539
1540 /**
1541  * Wait until each of the workers has been once around the track
1542  */
1543 void
1544 vlib_worker_wait_one_loop (void)
1545 {
1546   vlib_global_main_t *vgm = vlib_get_global_main ();
1547   ASSERT (vlib_get_thread_index () == 0);
1548
1549   if (vlib_get_n_threads () < 2)
1550     return;
1551
1552   if (vlib_worker_thread_barrier_held ())
1553     return;
1554
1555   u32 *counts = 0;
1556   u32 ii;
1557
1558   vec_validate (counts, vlib_get_n_threads () - 1);
1559
1560   /* record the current loop counts */
1561   vec_foreach_index (ii, vgm->vlib_mains)
1562     counts[ii] = vgm->vlib_mains[ii]->main_loop_count;
1563
1564   /* spin until each changes, apart from the main thread, or we'd be
1565    * a while */
1566   for (ii = 1; ii < vec_len (counts); ii++)
1567     {
1568       while (counts[ii] == vgm->vlib_mains[ii]->main_loop_count)
1569         CLIB_PAUSE ();
1570     }
1571
1572   vec_free (counts);
1573   return;
1574 }
1575
1576 void
1577 vlib_worker_flush_pending_rpc_requests (vlib_main_t *vm)
1578 {
1579   vlib_main_t *vm_global = vlib_get_first_main ();
1580
1581   ASSERT (vm != vm_global);
1582
1583   clib_spinlock_lock_if_init (&vm_global->pending_rpc_lock);
1584   vec_append (vm_global->pending_rpc_requests, vm->pending_rpc_requests);
1585   vec_reset_length (vm->pending_rpc_requests);
1586   clib_spinlock_unlock_if_init (&vm_global->pending_rpc_lock);
1587 }
1588
1589 void
1590 vlib_worker_thread_fn (void *arg)
1591 {
1592   vlib_global_main_t *vgm = vlib_get_global_main ();
1593   vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
1594   vlib_main_t *vm = vlib_get_main ();
1595   clib_error_t *e;
1596
1597   ASSERT (vm->thread_index == vlib_get_thread_index ());
1598
1599   vlib_worker_thread_init (w);
1600   clib_time_init (&vm->clib_time);
1601   clib_mem_set_heap (w->thread_mheap);
1602
1603   vm->worker_init_functions_called = hash_create (0, 0);
1604
1605   e = vlib_call_init_exit_functions_no_sort (
1606     vm, &vgm->worker_init_function_registrations, 1 /* call_once */,
1607     0 /* is_global */);
1608   if (e)
1609     clib_error_report (e);
1610
1611   vlib_worker_loop (vm);
1612 }
1613
/* Thread registration for the "workers" thread type (short name "wk");
 * its thread function is vlib_worker_thread_fn. */
VLIB_REGISTER_THREAD (worker_thread_reg, static) = {
  .name = "workers",
  .short_name = "wk",
  .function = vlib_worker_thread_fn,
};
1619
1620 extern clib_march_fn_registration
1621   *vlib_frame_queue_dequeue_with_aux_fn_march_fn_registrations;
1622 extern clib_march_fn_registration
1623   *vlib_frame_queue_dequeue_fn_march_fn_registrations;
1624 u32
1625 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts)
1626 {
1627   vlib_thread_main_t *tm = vlib_get_thread_main ();
1628   vlib_main_t *vm = vlib_get_main ();
1629   vlib_frame_queue_main_t *fqm;
1630   vlib_frame_queue_t *fq;
1631   vlib_node_t *node;
1632   int i;
1633   u32 num_threads;
1634
1635   if (frame_queue_nelts == 0)
1636     frame_queue_nelts = FRAME_QUEUE_MAX_NELTS;
1637
1638   num_threads = 1 /* main thread */  + tm->n_threads;
1639   ASSERT (frame_queue_nelts >= 8 + num_threads);
1640
1641   vec_add2 (tm->frame_queue_mains, fqm, 1);
1642
1643   node = vlib_get_node (vm, fqm->node_index);
1644   ASSERT (node);
1645   if (node->aux_offset)
1646     {
1647       fqm->frame_queue_dequeue_fn =
1648         CLIB_MARCH_FN_VOID_POINTER (vlib_frame_queue_dequeue_with_aux_fn);
1649     }
1650   else
1651     {
1652       fqm->frame_queue_dequeue_fn =
1653         CLIB_MARCH_FN_VOID_POINTER (vlib_frame_queue_dequeue_fn);
1654     }
1655
1656   fqm->node_index = node_index;
1657   fqm->frame_queue_nelts = frame_queue_nelts;
1658
1659   vec_validate (fqm->vlib_frame_queues, tm->n_vlib_mains - 1);
1660   vec_set_len (fqm->vlib_frame_queues, 0);
1661   for (i = 0; i < tm->n_vlib_mains; i++)
1662     {
1663       fq = vlib_frame_queue_alloc (frame_queue_nelts);
1664       vec_add1 (fqm->vlib_frame_queues, fq);
1665     }
1666
1667   return (fqm - tm->frame_queue_mains);
1668 }
1669
1670 void
1671 vlib_process_signal_event_mt_helper (vlib_process_signal_event_mt_args_t *
1672                                      args)
1673 {
1674   ASSERT (vlib_get_thread_index () == 0);
1675   vlib_process_signal_event (vlib_get_main (), args->node_index,
1676                              args->type_opaque, args->data);
1677 }
1678
1679 void *rpc_call_main_thread_cb_fn;
1680
1681 void
1682 vlib_rpc_call_main_thread (void *callback, u8 * args, u32 arg_size)
1683 {
1684   if (rpc_call_main_thread_cb_fn)
1685     {
1686       void (*fp) (void *, u8 *, u32) = rpc_call_main_thread_cb_fn;
1687       (*fp) (callback, args, arg_size);
1688     }
1689   else
1690     clib_warning ("BUG: rpc_call_main_thread_cb_fn NULL!");
1691 }
1692
1693 clib_error_t *
1694 threads_init (vlib_main_t * vm)
1695 {
1696   const vlib_thread_main_t *tm = vlib_get_thread_main ();
1697
1698   if (tm->main_lcore == ~0 && tm->n_vlib_mains > 1)
1699     return clib_error_return (0, "Configuration error, a main core must "
1700                                  "be specified when using worker threads");
1701
1702   return 0;
1703 }
1704
1705 VLIB_INIT_FUNCTION (threads_init);
1706
1707 static clib_error_t *
1708 show_clock_command_fn (vlib_main_t * vm,
1709                        unformat_input_t * input, vlib_cli_command_t * cmd)
1710 {
1711   int verbose = 0;
1712   clib_timebase_t _tb, *tb = &_tb;
1713
1714   (void) unformat (input, "verbose %=", &verbose, 1);
1715
1716   clib_timebase_init (tb, 0 /* GMT */ , CLIB_TIMEBASE_DAYLIGHT_NONE,
1717                       &vm->clib_time);
1718
1719   vlib_cli_output (vm, "%U, %U GMT", format_clib_time, &vm->clib_time,
1720                    verbose, format_clib_timebase_time,
1721                    clib_timebase_now (tb));
1722
1723   vlib_cli_output (vm, "Time last barrier release %.9f",
1724                    vm->time_last_barrier_release);
1725
1726   foreach_vlib_main ()
1727     {
1728       vlib_cli_output (vm, "%d: %U", this_vlib_main->thread_index,
1729                        format_clib_time, &this_vlib_main->clib_time, verbose);
1730
1731       vlib_cli_output (vm, "Thread %d offset %.9f error %.9f",
1732                        this_vlib_main->thread_index,
1733                        this_vlib_main->time_offset,
1734                        vm->time_last_barrier_release -
1735                          this_vlib_main->time_last_barrier_release);
1736     }
1737   return 0;
1738 }
1739
/* CLI registration: "show clock" is handled by show_clock_command_fn */
VLIB_CLI_COMMAND (f_command, static) =
{
  .path = "show clock",
  .short_help = "show clock",
  .function = show_clock_command_fn,
};
1746
1747 vlib_thread_main_t *
1748 vlib_get_thread_main_not_inline (void)
1749 {
1750   return vlib_get_thread_main ();
1751 }
1752
1753 /*
1754  * fd.io coding-style-patch-verification: ON
1755  *
1756  * Local Variables:
1757  * eval: (c-set-style "gnu")
1758  * End:
1759  */