New upstream version 17.08
[deb_dpdk.git] / test / test / test_ring_perf.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34
35 #include <stdio.h>
36 #include <inttypes.h>
37 #include <rte_ring.h>
38 #include <rte_cycles.h>
39 #include <rte_launch.h>
40 #include <rte_pause.h>
41
42 #include "test.h"
43
44 /*
45  * Ring
46  * ====
47  *
48  * Measures performance of various operations using rdtsc
49  *  * Empty ring dequeue
50  *  * Enqueue/dequeue of bursts in 1 thread
51  *  * Enqueue/dequeue of bursts in 2 threads
52  */
53
/* Name under which the test ring is created and looked up. */
#define RING_NAME "RING_PERF"
/* Element count requested when the test ring is created. */
#define RING_SIZE 4096
/* Capacity of the on-stack object arrays handed to the ring calls. */
#define MAX_BURST 32

/*
 * Burst sizes exercised by the bulk and burst tests.
 * The array is volatile so the sizes are read at run time instead of being
 * treated as compile-time constants.
 */
static const volatile unsigned int bulk_sizes[] = { 8, 32 };

/* Ring instance shared by every test in this file. */
static struct rte_ring *r;
66
/* Two lcore ids chosen to run the two sides of a paired test. */
struct lcore_pair {
        unsigned int c1;
        unsigned int c2;
};

/*
 * Number of worker functions that have reached their start barrier.
 * File-scope statics are zero-initialised, so this starts at 0.
 */
static volatile unsigned int lcore_count;
72
73 /**** Functions to analyse our core mask to get cores for different tests ***/
74
75 static int
76 get_two_hyperthreads(struct lcore_pair *lcp)
77 {
78         unsigned id1, id2;
79         unsigned c1, c2, s1, s2;
80         RTE_LCORE_FOREACH(id1) {
81                 /* inner loop just re-reads all id's. We could skip the first few
82                  * elements, but since number of cores is small there is little point
83                  */
84                 RTE_LCORE_FOREACH(id2) {
85                         if (id1 == id2)
86                                 continue;
87                         c1 = lcore_config[id1].core_id;
88                         c2 = lcore_config[id2].core_id;
89                         s1 = lcore_config[id1].socket_id;
90                         s2 = lcore_config[id2].socket_id;
91                         if ((c1 == c2) && (s1 == s2)){
92                                 lcp->c1 = id1;
93                                 lcp->c2 = id2;
94                                 return 0;
95                         }
96                 }
97         }
98         return 1;
99 }
100
101 static int
102 get_two_cores(struct lcore_pair *lcp)
103 {
104         unsigned id1, id2;
105         unsigned c1, c2, s1, s2;
106         RTE_LCORE_FOREACH(id1) {
107                 RTE_LCORE_FOREACH(id2) {
108                         if (id1 == id2)
109                                 continue;
110                         c1 = lcore_config[id1].core_id;
111                         c2 = lcore_config[id2].core_id;
112                         s1 = lcore_config[id1].socket_id;
113                         s2 = lcore_config[id2].socket_id;
114                         if ((c1 != c2) && (s1 == s2)){
115                                 lcp->c1 = id1;
116                                 lcp->c2 = id2;
117                                 return 0;
118                         }
119                 }
120         }
121         return 1;
122 }
123
124 static int
125 get_two_sockets(struct lcore_pair *lcp)
126 {
127         unsigned id1, id2;
128         unsigned s1, s2;
129         RTE_LCORE_FOREACH(id1) {
130                 RTE_LCORE_FOREACH(id2) {
131                         if (id1 == id2)
132                                 continue;
133                         s1 = lcore_config[id1].socket_id;
134                         s2 = lcore_config[id2].socket_id;
135                         if (s1 != s2){
136                                 lcp->c1 = id1;
137                                 lcp->c2 = id2;
138                                 return 0;
139                         }
140                 }
141         }
142         return 1;
143 }
144
145 /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
146 static void
147 test_empty_dequeue(void)
148 {
149         const unsigned iter_shift = 26;
150         const unsigned iterations = 1<<iter_shift;
151         unsigned i = 0;
152         void *burst[MAX_BURST];
153
154         const uint64_t sc_start = rte_rdtsc();
155         for (i = 0; i < iterations; i++)
156                 rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
157         const uint64_t sc_end = rte_rdtsc();
158
159         const uint64_t mc_start = rte_rdtsc();
160         for (i = 0; i < iterations; i++)
161                 rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
162         const uint64_t mc_end = rte_rdtsc();
163
164         printf("SC empty dequeue: %.2F\n",
165                         (double)(sc_end-sc_start) / iterations);
166         printf("MC empty dequeue: %.2F\n",
167                         (double)(mc_end-mc_start) / iterations);
168 }
169
/*
 * Argument block exchanged with one enqueue or dequeue worker.
 * The launcher sets the burst size before starting the worker; the worker
 * writes back the average cycle counts it measured for the single-
 * (sp/sc) and multi- (mp/mc) variants of the ring call.
 */
struct thread_params {
        unsigned size;  /* input: the burst size */
        double spsc;    /* output: single producer/consumer timing */
        double mpmc;    /* output: multi producer/consumer timing */
};
178
179 /*
180  * Function that uses rdtsc to measure timing for ring enqueue. Needs pair
181  * thread running dequeue_bulk function
182  */
183 static int
184 enqueue_bulk(void *p)
185 {
186         const unsigned iter_shift = 23;
187         const unsigned iterations = 1<<iter_shift;
188         struct thread_params *params = p;
189         const unsigned size = params->size;
190         unsigned i;
191         void *burst[MAX_BURST] = {0};
192
193         if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
194                 while(lcore_count != 2)
195                         rte_pause();
196
197         const uint64_t sp_start = rte_rdtsc();
198         for (i = 0; i < iterations; i++)
199                 while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
200                         rte_pause();
201         const uint64_t sp_end = rte_rdtsc();
202
203         const uint64_t mp_start = rte_rdtsc();
204         for (i = 0; i < iterations; i++)
205                 while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
206                         rte_pause();
207         const uint64_t mp_end = rte_rdtsc();
208
209         params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
210         params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
211         return 0;
212 }
213
214 /*
215  * Function that uses rdtsc to measure timing for ring dequeue. Needs pair
216  * thread running enqueue_bulk function
217  */
218 static int
219 dequeue_bulk(void *p)
220 {
221         const unsigned iter_shift = 23;
222         const unsigned iterations = 1<<iter_shift;
223         struct thread_params *params = p;
224         const unsigned size = params->size;
225         unsigned i;
226         void *burst[MAX_BURST] = {0};
227
228         if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
229                 while(lcore_count != 2)
230                         rte_pause();
231
232         const uint64_t sc_start = rte_rdtsc();
233         for (i = 0; i < iterations; i++)
234                 while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
235                         rte_pause();
236         const uint64_t sc_end = rte_rdtsc();
237
238         const uint64_t mc_start = rte_rdtsc();
239         for (i = 0; i < iterations; i++)
240                 while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
241                         rte_pause();
242         const uint64_t mc_end = rte_rdtsc();
243
244         params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
245         params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
246         return 0;
247 }
248
249 /*
250  * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
251  * used to measure ring perf between hyperthreads, cores and sockets.
252  */
253 static void
254 run_on_core_pair(struct lcore_pair *cores,
255                 lcore_function_t f1, lcore_function_t f2)
256 {
257         struct thread_params param1 = {0}, param2 = {0};
258         unsigned i;
259         for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
260                 lcore_count = 0;
261                 param1.size = param2.size = bulk_sizes[i];
262                 if (cores->c1 == rte_get_master_lcore()) {
263                         rte_eal_remote_launch(f2, &param2, cores->c2);
264                         f1(&param1);
265                         rte_eal_wait_lcore(cores->c2);
266                 } else {
267                         rte_eal_remote_launch(f1, &param1, cores->c1);
268                         rte_eal_remote_launch(f2, &param2, cores->c2);
269                         rte_eal_wait_lcore(cores->c1);
270                         rte_eal_wait_lcore(cores->c2);
271                 }
272                 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
273                                 param1.spsc + param2.spsc);
274                 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
275                                 param1.mpmc + param2.mpmc);
276         }
277 }
278
279 /*
280  * Test function that determines how long an enqueue + dequeue of a single item
281  * takes on a single lcore. Result is for comparison with the bulk enq+deq.
282  */
283 static void
284 test_single_enqueue_dequeue(void)
285 {
286         const unsigned iter_shift = 24;
287         const unsigned iterations = 1<<iter_shift;
288         unsigned i = 0;
289         void *burst = NULL;
290
291         const uint64_t sc_start = rte_rdtsc();
292         for (i = 0; i < iterations; i++) {
293                 rte_ring_sp_enqueue(r, burst);
294                 rte_ring_sc_dequeue(r, &burst);
295         }
296         const uint64_t sc_end = rte_rdtsc();
297
298         const uint64_t mc_start = rte_rdtsc();
299         for (i = 0; i < iterations; i++) {
300                 rte_ring_mp_enqueue(r, burst);
301                 rte_ring_mc_dequeue(r, &burst);
302         }
303         const uint64_t mc_end = rte_rdtsc();
304
305         printf("SP/SC single enq/dequeue: %"PRIu64"\n",
306                         (sc_end-sc_start) >> iter_shift);
307         printf("MP/MC single enq/dequeue: %"PRIu64"\n",
308                         (mc_end-mc_start) >> iter_shift);
309 }
310
311 /*
312  * Test that does both enqueue and dequeue on a core using the burst() API calls
313  * instead of the bulk() calls used in other tests. Results should be the same
314  * as for the bulk function called on a single lcore.
315  */
316 static void
317 test_burst_enqueue_dequeue(void)
318 {
319         const unsigned iter_shift = 23;
320         const unsigned iterations = 1<<iter_shift;
321         unsigned sz, i = 0;
322         void *burst[MAX_BURST] = {0};
323
324         for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
325                 const uint64_t sc_start = rte_rdtsc();
326                 for (i = 0; i < iterations; i++) {
327                         rte_ring_sp_enqueue_burst(r, burst,
328                                         bulk_sizes[sz], NULL);
329                         rte_ring_sc_dequeue_burst(r, burst,
330                                         bulk_sizes[sz], NULL);
331                 }
332                 const uint64_t sc_end = rte_rdtsc();
333
334                 const uint64_t mc_start = rte_rdtsc();
335                 for (i = 0; i < iterations; i++) {
336                         rte_ring_mp_enqueue_burst(r, burst,
337                                         bulk_sizes[sz], NULL);
338                         rte_ring_mc_dequeue_burst(r, burst,
339                                         bulk_sizes[sz], NULL);
340                 }
341                 const uint64_t mc_end = rte_rdtsc();
342
343                 uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
344                 uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];
345
346                 printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
347                                 sc_avg);
348                 printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
349                                 mc_avg);
350         }
351 }
352
353 /* Times enqueue and dequeue on a single lcore */
354 static void
355 test_bulk_enqueue_dequeue(void)
356 {
357         const unsigned iter_shift = 23;
358         const unsigned iterations = 1<<iter_shift;
359         unsigned sz, i = 0;
360         void *burst[MAX_BURST] = {0};
361
362         for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
363                 const uint64_t sc_start = rte_rdtsc();
364                 for (i = 0; i < iterations; i++) {
365                         rte_ring_sp_enqueue_bulk(r, burst,
366                                         bulk_sizes[sz], NULL);
367                         rte_ring_sc_dequeue_bulk(r, burst,
368                                         bulk_sizes[sz], NULL);
369                 }
370                 const uint64_t sc_end = rte_rdtsc();
371
372                 const uint64_t mc_start = rte_rdtsc();
373                 for (i = 0; i < iterations; i++) {
374                         rte_ring_mp_enqueue_bulk(r, burst,
375                                         bulk_sizes[sz], NULL);
376                         rte_ring_mc_dequeue_bulk(r, burst,
377                                         bulk_sizes[sz], NULL);
378                 }
379                 const uint64_t mc_end = rte_rdtsc();
380
381                 double sc_avg = ((double)(sc_end-sc_start) /
382                                 (iterations * bulk_sizes[sz]));
383                 double mc_avg = ((double)(mc_end-mc_start) /
384                                 (iterations * bulk_sizes[sz]));
385
386                 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
387                                 sc_avg);
388                 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
389                                 mc_avg);
390         }
391 }
392
393 static int
394 test_ring_perf(void)
395 {
396         struct lcore_pair cores;
397         r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
398         if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL)
399                 return -1;
400
401         printf("### Testing single element and burst enq/deq ###\n");
402         test_single_enqueue_dequeue();
403         test_burst_enqueue_dequeue();
404
405         printf("\n### Testing empty dequeue ###\n");
406         test_empty_dequeue();
407
408         printf("\n### Testing using a single lcore ###\n");
409         test_bulk_enqueue_dequeue();
410
411         if (get_two_hyperthreads(&cores) == 0) {
412                 printf("\n### Testing using two hyperthreads ###\n");
413                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
414         }
415         if (get_two_cores(&cores) == 0) {
416                 printf("\n### Testing using two physical cores ###\n");
417                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
418         }
419         if (get_two_sockets(&cores) == 0) {
420                 printf("\n### Testing using two NUMA nodes ###\n");
421                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
422         }
423         return 0;
424 }
425
426 REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);