/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include <stdio.h>
#include <inttypes.h>
#include <rte_ring.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>

#include "test.h"

/*
 * Ring
 * ====
 *
 * Measures performance of various operations using rdtsc
 *  * Empty ring dequeue
 *  * Enqueue/dequeue of bursts in 1 thread
 *  * Enqueue/dequeue of bursts in 2 threads
 */

#define RING_NAME "RING_PERF"
#define RING_SIZE 4096
#define MAX_BURST 32

/*
 * the sizes to enqueue and dequeue in testing
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };

struct lcore_pair {
        unsigned c1, c2;
};

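/*
 * Rendezvous counter: each of the two worker lcores atomically increments it
 * and then spins until both have arrived, so the timed loops start together.
 */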
static volatile unsigned lcore_count = 0;

/**** Functions to analyse our core mask to get cores for different tests ***/

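/* Find two lcore ids that are hyperthread siblings: same physical core id on the same socket. */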
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
        unsigned id1, id2;
        unsigned c1, c2, s1, s2;
        RTE_LCORE_FOREACH(id1) {
                /* The inner loop just re-reads all ids. We could skip the first
                 * few elements, but since the number of cores is small there is
                 * little point.
                 */
                RTE_LCORE_FOREACH(id2) {
                        if (id1 == id2)
                                continue;
                        c1 = lcore_config[id1].core_id;
                        c2 = lcore_config[id2].core_id;
                        s1 = lcore_config[id1].socket_id;
                        s2 = lcore_config[id2].socket_id;
                        if ((c1 == c2) && (s1 == s2)) {
                                lcp->c1 = id1;
                                lcp->c2 = id2;
                                return 0;
                        }
                }
        }
        return 1;
}

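/* Find two lcore ids on different physical cores of the same socket. */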
static int
get_two_cores(struct lcore_pair *lcp)
{
        unsigned id1, id2;
        unsigned c1, c2, s1, s2;
        RTE_LCORE_FOREACH(id1) {
                RTE_LCORE_FOREACH(id2) {
                        if (id1 == id2)
                                continue;
                        c1 = lcore_config[id1].core_id;
                        c2 = lcore_config[id2].core_id;
                        s1 = lcore_config[id1].socket_id;
                        s2 = lcore_config[id2].socket_id;
                        if ((c1 != c2) && (s1 == s2)) {
                                lcp->c1 = id1;
                                lcp->c2 = id2;
                                return 0;
                        }
                }
        }
        return 1;
}

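/* Find two lcore ids on different sockets (NUMA nodes). */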
static int
get_two_sockets(struct lcore_pair *lcp)
{
        unsigned id1, id2;
        unsigned s1, s2;
        RTE_LCORE_FOREACH(id1) {
                RTE_LCORE_FOREACH(id2) {
                        if (id1 == id2)
                                continue;
                        s1 = lcore_config[id1].socket_id;
                        s2 = lcore_config[id2].socket_id;
                        if (s1 != s2) {
                                lcp->c1 = id1;
                                lcp->c2 = id2;
                                return 0;
                        }
                }
        }
        return 1;
}

/* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
static void
test_empty_dequeue(struct rte_ring *r)
{
        const unsigned iter_shift = 26;
        const unsigned iterations = 1<<iter_shift;
        unsigned i = 0;
        void *burst[MAX_BURST];

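        /*
         * The ring is empty here, so every dequeue attempt fails immediately;
         * each iteration therefore measures only the early-exit cost.
         */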
        const uint64_t sc_start = rte_rdtsc();
        for (i = 0; i < iterations; i++)
                rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
        const uint64_t sc_end = rte_rdtsc();

        const uint64_t mc_start = rte_rdtsc();
        for (i = 0; i < iterations; i++)
                rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
        const uint64_t mc_end = rte_rdtsc();

        printf("SC empty dequeue: %.2F\n",
                        (double)(sc_end-sc_start) / iterations);
        printf("MC empty dequeue: %.2F\n",
                        (double)(mc_end-mc_start) / iterations);
}

/*
 * The separate enqueue and dequeue threads each take one parameter and
 * return two. Input = burst size, output = cycle average for sp/sc & mp/mc.
 */
struct thread_params {
        struct rte_ring *r;
        unsigned size;        /* input value, the burst size */
        double spsc, mpmc;    /* output value, the single or multi timings */
};

/*
 * Function that uses rdtsc to measure timing for ring enqueue. Needs a paired
 * thread running the dequeue_bulk function.
 */
static int
enqueue_bulk(void *p)
{
        const unsigned iter_shift = 23;
        const unsigned iterations = 1<<iter_shift;
        struct thread_params *params = p;
        struct rte_ring *r = params->r;
        const unsigned size = params->size;
        unsigned i;
        void *burst[MAX_BURST] = {0};

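        /* Rendezvous with the dequeue lcore so both timed loops start together. */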
        if (__sync_add_and_fetch(&lcore_count, 1) != 2)
                while (lcore_count != 2)
                        rte_pause();

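        /*
         * A return value of 0 means the ring was full (the dequeue thread has
         * fallen behind), so pause and retry until the burst fits.
         */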
        const uint64_t sp_start = rte_rdtsc();
        for (i = 0; i < iterations; i++)
                while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
                        rte_pause();
        const uint64_t sp_end = rte_rdtsc();

        const uint64_t mp_start = rte_rdtsc();
        for (i = 0; i < iterations; i++)
                while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
                        rte_pause();
        const uint64_t mp_end = rte_rdtsc();

        params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
        params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
        return 0;
}

/*
 * Function that uses rdtsc to measure timing for ring dequeue. Needs a paired
 * thread running the enqueue_bulk function.
 */
static int
dequeue_bulk(void *p)
{
        const unsigned iter_shift = 23;
        const unsigned iterations = 1<<iter_shift;
        struct thread_params *params = p;
        struct rte_ring *r = params->r;
        const unsigned size = params->size;
        unsigned i;
        void *burst[MAX_BURST] = {0};

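        /* Same rendezvous as in enqueue_bulk before the timed loops begin. */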
        if (__sync_add_and_fetch(&lcore_count, 1) != 2)
                while (lcore_count != 2)
                        rte_pause();

        const uint64_t sc_start = rte_rdtsc();
        for (i = 0; i < iterations; i++)
                while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
                        rte_pause();
        const uint64_t sc_end = rte_rdtsc();

        const uint64_t mc_start = rte_rdtsc();
        for (i = 0; i < iterations; i++)
                while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
                        rte_pause();
        const uint64_t mc_end = rte_rdtsc();

        params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
        params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
        return 0;
}

/*
 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
 * Used to measure ring perf between hyperthreads, cores and sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
                lcore_function_t f1, lcore_function_t f2)
{
        struct thread_params param1 = {0}, param2 = {0};
        unsigned i;
        for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
                lcore_count = 0;
                param1.size = param2.size = bulk_sizes[i];
                param1.r = param2.r = r;
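                /*
                 * The master lcore cannot be the target of a remote launch, so
                 * when it is one of the pair it runs its half of the test directly.
                 */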
                if (cores->c1 == rte_get_master_lcore()) {
                        rte_eal_remote_launch(f2, &param2, cores->c2);
                        f1(&param1);
                        rte_eal_wait_lcore(cores->c2);
                } else {
                        rte_eal_remote_launch(f1, &param1, cores->c1);
                        rte_eal_remote_launch(f2, &param2, cores->c2);
                        rte_eal_wait_lcore(cores->c1);
                        rte_eal_wait_lcore(cores->c2);
                }
                printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
                                param1.spsc + param2.spsc);
                printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
                                param1.mpmc + param2.mpmc);
        }
}

/*
 * Test function that determines how long an enqueue + dequeue of a single item
 * takes on a single lcore. Result is for comparison with the bulk enq+deq.
 */
static void
test_single_enqueue_dequeue(struct rte_ring *r)
{
        const unsigned iter_shift = 24;
        const unsigned iterations = 1<<iter_shift;
        unsigned i = 0;
        void *burst = NULL;

        const uint64_t sc_start = rte_rdtsc();
        for (i = 0; i < iterations; i++) {
                rte_ring_sp_enqueue(r, burst);
                rte_ring_sc_dequeue(r, &burst);
        }
        const uint64_t sc_end = rte_rdtsc();

        const uint64_t mc_start = rte_rdtsc();
        for (i = 0; i < iterations; i++) {
                rte_ring_mp_enqueue(r, burst);
                rte_ring_mc_dequeue(r, &burst);
        }
        const uint64_t mc_end = rte_rdtsc();

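        /*
         * Shifting right by iter_shift divides by the iteration count, giving
         * the average cycles for one enqueue + dequeue pair.
         */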
        printf("SP/SC single enq/dequeue: %"PRIu64"\n",
                        (sc_end-sc_start) >> iter_shift);
        printf("MP/MC single enq/dequeue: %"PRIu64"\n",
                        (mc_end-mc_start) >> iter_shift);
}

/*
 * Test that does both enqueue and dequeue on a core using the burst() API calls
 * instead of the bulk() calls used in other tests. Results should be the same
 * as for the bulk function called on a single lcore.
 */
static void
test_burst_enqueue_dequeue(struct rte_ring *r)
{
        const unsigned iter_shift = 23;
        const unsigned iterations = 1<<iter_shift;
        unsigned sz, i = 0;
        void *burst[MAX_BURST] = {0};

        for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
                const uint64_t sc_start = rte_rdtsc();
                for (i = 0; i < iterations; i++) {
                        rte_ring_sp_enqueue_burst(r, burst,
                                        bulk_sizes[sz], NULL);
                        rte_ring_sc_dequeue_burst(r, burst,
                                        bulk_sizes[sz], NULL);
                }
                const uint64_t sc_end = rte_rdtsc();

                const uint64_t mc_start = rte_rdtsc();
                for (i = 0; i < iterations; i++) {
                        rte_ring_mp_enqueue_burst(r, burst,
                                        bulk_sizes[sz], NULL);
                        rte_ring_mc_dequeue_burst(r, burst,
                                        bulk_sizes[sz], NULL);
                }
                const uint64_t mc_end = rte_rdtsc();

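                /*
                 * Average cycles per element: the shift divides by the number of
                 * iterations and the result is then divided by the burst size.
                 */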
                uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
                uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];

                printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
                                sc_avg);
                printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
                                mc_avg);
        }
}

/* Times enqueue and dequeue on a single lcore */
static void
test_bulk_enqueue_dequeue(struct rte_ring *r)
{
        const unsigned iter_shift = 23;
        const unsigned iterations = 1<<iter_shift;
        unsigned sz, i = 0;
        void *burst[MAX_BURST] = {0};

        for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
                const uint64_t sc_start = rte_rdtsc();
                for (i = 0; i < iterations; i++) {
                        rte_ring_sp_enqueue_bulk(r, burst,
                                        bulk_sizes[sz], NULL);
                        rte_ring_sc_dequeue_bulk(r, burst,
                                        bulk_sizes[sz], NULL);
                }
                const uint64_t sc_end = rte_rdtsc();

                const uint64_t mc_start = rte_rdtsc();
                for (i = 0; i < iterations; i++) {
                        rte_ring_mp_enqueue_bulk(r, burst,
                                        bulk_sizes[sz], NULL);
                        rte_ring_mc_dequeue_bulk(r, burst,
                                        bulk_sizes[sz], NULL);
                }
                const uint64_t mc_end = rte_rdtsc();

                double sc_avg = ((double)(sc_end-sc_start) /
                                (iterations * bulk_sizes[sz]));
                double mc_avg = ((double)(mc_end-mc_start) /
                                (iterations * bulk_sizes[sz]));

                printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
                                sc_avg);
                printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
                                mc_avg);
        }
}

static int
test_ring_perf(void)
{
        struct lcore_pair cores;
        struct rte_ring *r = NULL;

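        /*
         * flags == 0 creates a default multi-producer/multi-consumer ring; the
         * sp/sc and mp/mc variants used in the test functions above are
         * selected explicitly per call.
         */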
        r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
        if (r == NULL)
                return -1;

        printf("### Testing single element and burst enq/deq ###\n");
        test_single_enqueue_dequeue(r);
        test_burst_enqueue_dequeue(r);

        printf("\n### Testing empty dequeue ###\n");
        test_empty_dequeue(r);

        printf("\n### Testing using a single lcore ###\n");
        test_bulk_enqueue_dequeue(r);

        if (get_two_hyperthreads(&cores) == 0) {
                printf("\n### Testing using two hyperthreads ###\n");
                run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
        }
        if (get_two_cores(&cores) == 0) {
                printf("\n### Testing using two physical cores ###\n");
                run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
        }
        if (get_two_sockets(&cores) == 0) {
                printf("\n### Testing using two NUMA nodes ###\n");
                run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
        }
        rte_ring_free(r);
        return 0;
}

REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);