deb_dpdk.git: new upstream version 17.11-rc3
examples/performance-thread/common/lthread_sched.c

/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Some portions of this software are derived from
 * https://github.com/halayli/lthread, which carries the following license.
 *
 * Copyright (C) 2012, Hasan Alayli <halayli@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#define RTE_MEM 1

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include <limits.h>
#include <inttypes.h>
#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sched.h>

#include <rte_prefetch.h>
#include <rte_per_lcore.h>
#include <rte_atomic.h>
#include <rte_atomic_64.h>
#include <rte_log.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>

#include "lthread_api.h"
#include "lthread_int.h"
#include "lthread_sched.h"
#include "lthread_objcache.h"
#include "lthread_timer.h"
#include "lthread_mutex.h"
#include "lthread_cond.h"
#include "lthread_tls.h"
#include "lthread_diag.h"

/*
 * This file implements the lthread scheduler.
 * The scheduler is the function lthread_run(), which must be run as the
 * main loop of an EAL thread.
 *
 * Currently, once a scheduler is created it cannot be destroyed: when a
 * scheduler shuts down it is assumed that the application is terminating.
 */
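
/*
 * Illustrative usage (a sketch, not part of this file): how lthread_run()
 * is typically launched on every EAL lcore from main().  The helper names
 * (initial_lthread, sched_main) are hypothetical, and the EAL launch calls
 * are assumed to come from the usual headers (rte_launch.h, rte_lcore.h).
 *
 *	static void initial_lthread(void *arg __rte_unused)
 *	{
 *		// first lthread on this scheduler; spawn further lthreads here
 *	}
 *
 *	static int sched_main(void *arg __rte_unused)
 *	{
 *		struct lthread *lt;
 *
 *		// create an initial lthread, then enter the scheduler loop;
 *		// lthread_run() returns only when this scheduler shuts down
 *		lthread_create(&lt, -1, initial_lthread, NULL);
 *		lthread_run();
 *		return 0;
 *	}
 *
 *	// in main(), after rte_eal_init():
 *	//	lthread_num_schedulers_set(rte_lcore_count());
 *	//	rte_eal_mp_remote_launch(sched_main, NULL, CALL_MASTER);
 *	//	rte_eal_mp_wait_lcore();
 */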

static rte_atomic16_t num_schedulers;
static rte_atomic16_t active_schedulers;

/* one scheduler per lcore */
RTE_DEFINE_PER_LCORE(struct lthread_sched *, this_sched) = NULL;

struct lthread_sched *schedcore[LTHREAD_MAX_LCORES];

/* diagnostics callback and event mask (see lthread_diag.h) */
diag_callback diag_cb;

uint64_t diag_mask;


/* constructor, run before main() to initialise the scheduler bookkeeping */
RTE_INIT(lthread_sched_ctor)
{
        memset(schedcore, 0, sizeof(schedcore));
        rte_atomic16_init(&num_schedulers);
        rte_atomic16_set(&num_schedulers, 1);
        rte_atomic16_init(&active_schedulers);
        rte_atomic16_set(&active_schedulers, 0);
        diag_cb = NULL;
}


enum sched_alloc_phase {
        SCHED_ALLOC_OK,
        SCHED_ALLOC_QNODE_POOL,
        SCHED_ALLOC_READY_QUEUE,
        SCHED_ALLOC_PREADY_QUEUE,
        SCHED_ALLOC_LTHREAD_CACHE,
        SCHED_ALLOC_STACK_CACHE,
        SCHED_ALLOC_PERLT_CACHE,
        SCHED_ALLOC_TLS_CACHE,
        SCHED_ALLOC_COND_CACHE,
        SCHED_ALLOC_MUTEX_CACHE,
};

static int
_lthread_sched_alloc_resources(struct lthread_sched *new_sched)
{
        int alloc_status;

        do {
                /* Initialize per scheduler queue node pool */
                alloc_status = SCHED_ALLOC_QNODE_POOL;
                new_sched->qnode_pool =
                        _qnode_pool_create("qnode pool", LTHREAD_PREALLOC);
                if (new_sched->qnode_pool == NULL)
                        break;

                /* Initialize per scheduler local ready queue */
                alloc_status = SCHED_ALLOC_READY_QUEUE;
                new_sched->ready = _lthread_queue_create("ready queue");
                if (new_sched->ready == NULL)
                        break;

                /* Initialize per scheduler local peer ready queue */
                alloc_status = SCHED_ALLOC_PREADY_QUEUE;
                new_sched->pready = _lthread_queue_create("pready queue");
                if (new_sched->pready == NULL)
                        break;

                /* Initialize per scheduler local free lthread cache */
                alloc_status = SCHED_ALLOC_LTHREAD_CACHE;
                new_sched->lthread_cache =
                        _lthread_objcache_create("lthread cache",
                                                sizeof(struct lthread),
                                                LTHREAD_PREALLOC);
                if (new_sched->lthread_cache == NULL)
                        break;

                /* Initialize per scheduler local free stack cache */
                alloc_status = SCHED_ALLOC_STACK_CACHE;
                new_sched->stack_cache =
                        _lthread_objcache_create("stack_cache",
                                                sizeof(struct lthread_stack),
                                                LTHREAD_PREALLOC);
                if (new_sched->stack_cache == NULL)
                        break;

                /* Initialize per scheduler local free per lthread data cache */
                alloc_status = SCHED_ALLOC_PERLT_CACHE;
                new_sched->per_lthread_cache =
                        _lthread_objcache_create("per_lt cache",
                                                RTE_PER_LTHREAD_SECTION_SIZE,
                                                LTHREAD_PREALLOC);
                if (new_sched->per_lthread_cache == NULL)
                        break;

                /* Initialize per scheduler local free tls cache */
                alloc_status = SCHED_ALLOC_TLS_CACHE;
                new_sched->tls_cache =
                        _lthread_objcache_create("TLS cache",
                                                sizeof(struct lthread_tls),
                                                LTHREAD_PREALLOC);
                if (new_sched->tls_cache == NULL)
                        break;

                /* Initialize per scheduler local free cond var cache */
                alloc_status = SCHED_ALLOC_COND_CACHE;
                new_sched->cond_cache =
                        _lthread_objcache_create("cond cache",
                                                sizeof(struct lthread_cond),
                                                LTHREAD_PREALLOC);
                if (new_sched->cond_cache == NULL)
                        break;

                /* Initialize per scheduler local free mutex cache */
                alloc_status = SCHED_ALLOC_MUTEX_CACHE;
                new_sched->mutex_cache =
                        _lthread_objcache_create("mutex cache",
                                                sizeof(struct lthread_mutex),
                                                LTHREAD_PREALLOC);
                if (new_sched->mutex_cache == NULL)
                        break;

                alloc_status = SCHED_ALLOC_OK;
        } while (0);

        /* roll back on any failure */
        switch (alloc_status) {
        case SCHED_ALLOC_MUTEX_CACHE:
                _lthread_objcache_destroy(new_sched->cond_cache);
                /* fall through */
        case SCHED_ALLOC_COND_CACHE:
                _lthread_objcache_destroy(new_sched->tls_cache);
                /* fall through */
        case SCHED_ALLOC_TLS_CACHE:
                _lthread_objcache_destroy(new_sched->per_lthread_cache);
                /* fall through */
        case SCHED_ALLOC_PERLT_CACHE:
                _lthread_objcache_destroy(new_sched->stack_cache);
                /* fall through */
        case SCHED_ALLOC_STACK_CACHE:
                _lthread_objcache_destroy(new_sched->lthread_cache);
                /* fall through */
        case SCHED_ALLOC_LTHREAD_CACHE:
                _lthread_queue_destroy(new_sched->pready);
                /* fall through */
        case SCHED_ALLOC_PREADY_QUEUE:
                _lthread_queue_destroy(new_sched->ready);
                /* fall through */
        case SCHED_ALLOC_READY_QUEUE:
                _qnode_pool_destroy(new_sched->qnode_pool);
                /* fall through */
        case SCHED_ALLOC_QNODE_POOL:
                /* fall through */
        case SCHED_ALLOC_OK:
                break;
        }
        return alloc_status;
}
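
/*
 * The function above relies on a common C idiom: a do { } while (0) block
 * that breaks out at the first failed allocation, followed by a switch whose
 * fall-through cases release, in reverse order, only the resources that were
 * already allocated.  A minimal generic sketch of the same pattern (the
 * resource names here are hypothetical):
 *
 *	enum { ALLOC_OK, ALLOC_A, ALLOC_B };
 *	int status;
 *
 *	do {
 *		status = ALLOC_A;
 *		a = alloc_a();
 *		if (a == NULL)
 *			break;
 *
 *		status = ALLOC_B;
 *		b = alloc_b();
 *		if (b == NULL)
 *			break;
 *
 *		status = ALLOC_OK;
 *	} while (0);
 *
 *	switch (status) {
 *	case ALLOC_B:
 *		free_a(a);
 *		// fall through
 *	case ALLOC_A:
 *	case ALLOC_OK:
 *		break;
 *	}
 */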


/*
 * Create a scheduler on the current lcore
 */
struct lthread_sched *_lthread_sched_create(size_t stack_size)
{
        int status;
        struct lthread_sched *new_sched;
        unsigned lcoreid = rte_lcore_id();

        RTE_ASSERT(stack_size <= LTHREAD_MAX_STACK_SIZE);

        if (stack_size == 0)
                stack_size = LTHREAD_MAX_STACK_SIZE;

        new_sched =
             rte_calloc_socket(NULL, 1, sizeof(struct lthread_sched),
                                RTE_CACHE_LINE_SIZE,
                                rte_socket_id());
        if (new_sched == NULL) {
                RTE_LOG(CRIT, LTHREAD,
                        "Failed to allocate memory for scheduler\n");
                return NULL;
        }

        _lthread_key_pool_init();

        new_sched->stack_size = stack_size;
        new_sched->birth = rte_rdtsc();
        THIS_SCHED = new_sched;

        status = _lthread_sched_alloc_resources(new_sched);
        if (status != SCHED_ALLOC_OK) {
                RTE_LOG(CRIT, LTHREAD,
                        "Failed to allocate resources for scheduler code = %d\n",
                        status);
                rte_free(new_sched);
                return NULL;
        }

        bzero(&new_sched->ctx, sizeof(struct ctx));

        new_sched->lcore_id = lcoreid;

        schedcore[lcoreid] = new_sched;

        new_sched->run_flag = 1;

        DIAG_EVENT(new_sched, LT_DIAG_SCHED_CREATE, rte_lcore_id(), 0);

        rte_wmb();
        return new_sched;
}

/*
 * Set the number of schedulers in the system
 */
int lthread_num_schedulers_set(int num)
{
        rte_atomic16_set(&num_schedulers, num);
        return (int)rte_atomic16_read(&num_schedulers);
}
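
/*
 * Illustrative usage (a sketch): the count is normally set once in main(),
 * before any scheduler is launched, so that the start/stop synchronisation
 * in this file sees the correct target value, e.g.
 *
 *	lthread_num_schedulers_set(rte_lcore_count());
 */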

/*
 * Return the number of active schedulers
 */
int lthread_active_schedulers(void)
{
        return (int)rte_atomic16_read(&active_schedulers);
}


/**
 * Shut down the scheduler running on the specified lcore
 */
void lthread_scheduler_shutdown(unsigned lcoreid)
{
        uint64_t coreid = (uint64_t) lcoreid;

        if (coreid < LTHREAD_MAX_LCORES) {
                if (schedcore[coreid] != NULL)
                        schedcore[coreid]->run_flag = 0;
        }
}

/**
 * Shut down all schedulers
 */
void lthread_scheduler_shutdown_all(void)
{
        uint64_t i;

        /*
         * Allow time for all schedulers to have started.
         * Note we use sched_yield() rather than pthread_yield() to allow
         * for the possibility of a pthread wrapper on lthread_yield(),
         * something that is not possible unless the scheduler is running.
         */
        while (rte_atomic16_read(&active_schedulers) <
               rte_atomic16_read(&num_schedulers))
                sched_yield();

        for (i = 0; i < LTHREAD_MAX_LCORES; i++) {
                if (schedcore[i] != NULL)
                        schedcore[i]->run_flag = 0;
        }
}
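
/*
 * Illustrative teardown (a sketch): once the workload is complete, a
 * supervising lthread (or the main lcore) can request an orderly exit of
 * every scheduler; each lthread_run() loop then returns when its run flag
 * is cleared and its queues have drained.
 *
 *	// e.g. from a supervising lthread when all work is done
 *	lthread_scheduler_shutdown_all();
 *
 *	// back in main(), after the remote launches:
 *	//	rte_eal_mp_wait_lcore();  // returns once every lthread_run() exits
 */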

/*
 * Resume a suspended lthread
 */
static __rte_always_inline void
_lthread_resume(struct lthread *lt);
static inline void _lthread_resume(struct lthread *lt)
{
        struct lthread_sched *sched = THIS_SCHED;
        struct lthread_stack *s;
        uint64_t state = lt->state;
#if LTHREAD_DIAG
        int init = 0;
#endif

        sched->current_lthread = lt;

        if (state & (BIT(ST_LT_CANCELLED) | BIT(ST_LT_EXITED))) {
                /* if detached we can free the thread now */
                if (state & BIT(ST_LT_DETACH)) {
                        _lthread_free(lt);
                        sched->current_lthread = NULL;
                        return;
                }
        }

        if (state & BIT(ST_LT_INIT)) {
                /* first time this thread has been run */
                /* assign thread to this scheduler */
                lt->sched = THIS_SCHED;

                /* allocate stack */
                s = _stack_alloc();

                lt->stack_container = s;
                _lthread_set_stack(lt, s->stack, s->stack_size);

                /* allocate memory for TLS used by this thread */
                _lthread_tls_alloc(lt);

                lt->state = BIT(ST_LT_READY);
#if LTHREAD_DIAG
                init = 1;
#endif
        }

        DIAG_EVENT(lt, LT_DIAG_LTHREAD_RESUMED, init, lt);

        /* switch to the new thread */
        ctx_switch(&lt->ctx, &sched->ctx);

        /* If posting to a queue that could be read by another lcore
         * we defer the queue write till now, to ensure the context has been
         * saved before the other core tries to resume it.
         * This applies to blocking on mutex, cond, and to set_affinity.
         */
        if (lt->pending_wr_queue != NULL) {
                struct lthread_queue *dest = lt->pending_wr_queue;

                lt->pending_wr_queue = NULL;

                /* queue the current thread to the specified queue */
                _lthread_queue_insert_mp(dest, lt);
        }

        sched->current_lthread = NULL;
}
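
/*
 * The deferred queue write above is what blocking primitives rely on: the
 * lthread records its destination queue in pending_wr_queue and suspends,
 * and the scheduler publishes it to that queue only after the context has
 * been saved.  A simplified sketch of the caller side (the helper names are
 * illustrative, not the exact mutex/cond code):
 *
 *	struct lthread *lt = THIS_LTHREAD;
 *
 *	// ask the scheduler to post us to the wait queue once our
 *	// context is saved, then switch back into lthread_run()
 *	lt->pending_wr_queue = some_wait_queue;
 *	_suspend();
 *
 *	// execution resumes here only after another scheduler has popped
 *	// us from some_wait_queue and called _lthread_resume() on us
 */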

/*
 * Handle sleep timer expiry
 */
void
_sched_timer_cb(struct rte_timer *tim, void *arg)
{
        struct lthread *lt = (struct lthread *) arg;
        uint64_t state = lt->state;

        DIAG_EVENT(lt, LT_DIAG_LTHREAD_TMR_EXPIRED, &lt->tim, 0);

        rte_timer_stop(tim);

        if (lt->state & BIT(ST_LT_CANCELLED))
                (THIS_SCHED)->nb_blocked_threads--;

        lt->state = state | BIT(ST_LT_EXPIRED);
        _lthread_resume(lt);
        lt->state = state & CLEARBIT(ST_LT_EXPIRED);
}
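
/*
 * This callback is the scheduler half of the sleep/timeout API: a call such
 * as lthread_sleep() arms an rte_timer for the current lthread and suspends
 * it, and the expiry path above resumes it with ST_LT_EXPIRED set so the
 * caller can distinguish a timeout from a normal wakeup.  From the
 * application side the interaction is simply (a sketch; units as declared
 * for lthread_sleep() in lthread_api.h):
 *
 *	static void worker(void *arg __rte_unused)
 *	{
 *		for (;;) {
 *			do_work();		// hypothetical application work
 *			lthread_sleep(1000000);	// suspend, resumed by this timer
 *		}
 *	}
 */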


/*
 * Return 1 if the scheduler is finished and can exit, or 0 if there is
 * still pending work.
 */
static inline int _lthread_sched_isdone(struct lthread_sched *sched)
{
        return (sched->run_flag == 0) &&
                        (_lthread_queue_empty(sched->ready)) &&
                        (_lthread_queue_empty(sched->pready)) &&
                        (sched->nb_blocked_threads == 0);
}

/*
 * Wait for all schedulers to start
 */
static inline void _lthread_schedulers_sync_start(void)
{
        rte_atomic16_inc(&active_schedulers);

        /* wait for lthread schedulers
         * Note we use sched_yield() rather than pthread_yield() to allow
         * for the possibility of a pthread wrapper on lthread_yield(),
         * something that is not possible unless the scheduler is running.
         */
        while (rte_atomic16_read(&active_schedulers) <
               rte_atomic16_read(&num_schedulers))
                sched_yield();

}

/*
 * Wait for all schedulers to stop
 */
static inline void _lthread_schedulers_sync_stop(void)
{
        rte_atomic16_dec(&active_schedulers);
        rte_atomic16_dec(&num_schedulers);

        /* wait for schedulers
         * Note we use sched_yield() rather than pthread_yield() to allow
         * for the possibility of a pthread wrapper on lthread_yield(),
         * something that is not possible unless the scheduler is running.
         */
        while (rte_atomic16_read(&active_schedulers) > 0)
                sched_yield();

}


/*
 * Run the lthread scheduler.
 * This loop is the heart of the system.
 */
void lthread_run(void)
{

        struct lthread_sched *sched = THIS_SCHED;
        struct lthread *lt = NULL;

        RTE_LOG(INFO, LTHREAD,
                "starting scheduler %p on lcore %u phys core %u\n",
                sched, rte_lcore_id(),
                rte_lcore_index(rte_lcore_id()));

        /* if more than one, wait for all schedulers to start */
        _lthread_schedulers_sync_start();


        /*
         * This is the main scheduling loop.
         * So long as there are tasks in existence we run this loop.
         * On each pass we check:
         *   expired timers,
         *   the local ready queue,
         *   and the peer ready queue,
         *
         * and resume lthreads ad infinitum.
         */
        while (!_lthread_sched_isdone(sched)) {

                rte_timer_manage();

                lt = _lthread_queue_poll(sched->ready);
                if (lt != NULL)
                        _lthread_resume(lt);
                lt = _lthread_queue_poll(sched->pready);
                if (lt != NULL)
                        _lthread_resume(lt);
        }


        /* if more than one, wait for all schedulers to stop */
        _lthread_schedulers_sync_stop();

        (THIS_SCHED) = NULL;

        RTE_LOG(INFO, LTHREAD,
                "stopping scheduler %p on lcore %u phys core %u\n",
                sched, rte_lcore_id(),
                rte_lcore_index(rte_lcore_id()));
        fflush(stdout);
}

/*
 * Return the scheduler for the specified lcore
 */
struct lthread_sched *_lthread_sched_get(unsigned int lcore_id)
{
        struct lthread_sched *res = NULL;

        if (lcore_id < LTHREAD_MAX_LCORES)
                res = schedcore[lcore_id];

        return res;
}

/*
 * Migrate the current thread to another scheduler running
 * on the specified lcore.
 */
int lthread_set_affinity(unsigned lcoreid)
{
        struct lthread *lt = THIS_LTHREAD;
        struct lthread_sched *dest_sched;

        if (unlikely(lcoreid >= LTHREAD_MAX_LCORES))
                return POSIX_ERRNO(EINVAL);

        DIAG_EVENT(lt, LT_DIAG_LTHREAD_AFFINITY, lcoreid, 0);

        dest_sched = schedcore[lcoreid];

        if (unlikely(dest_sched == NULL))
                return POSIX_ERRNO(EINVAL);

        if (likely(dest_sched != THIS_SCHED)) {
                lt->sched = dest_sched;
                lt->pending_wr_queue = dest_sched->pready;
                _affinitize();
                return 0;
        }
        return 0;
}
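
/*
 * Illustrative usage (a sketch): an lthread may migrate itself to the
 * scheduler of another lcore; when the call returns 0 the lthread is
 * already running on the destination scheduler.
 *
 *	static void rebalance(void *arg)
 *	{
 *		unsigned target = (unsigned)(uintptr_t)arg;
 *
 *		if (lthread_set_affinity(target) != 0)
 *			return;		// no scheduler on that lcore
 *
 *		// from here on, this lthread executes on lcore 'target'
 *	}
 */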