New upstream version 17.11-rc3
[deb_dpdk.git] / test / test / test_memcpy_perf.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include <sys/time.h>
39
40 #include <rte_common.h>
41 #include <rte_cycles.h>
42 #include <rte_random.h>
43 #include <rte_malloc.h>
44
45 #include <rte_memcpy.h>
46
47 #include "test.h"
48
/*
 * Set this to the maximum buffer size you want to test. If it is 0, the
 * explicit list in buf_sizes[] below is used. Otherwise buf_sizes[] is filled
 * at run time with every size from 0 to TEST_VALUE_RANGE - 1.
 */
#define TEST_VALUE_RANGE        0

/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
/* Fixed table: it is only ever read, so keep it const. */
static const size_t buf_sizes[] = {
        1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
        129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
        449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
        2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
};
/* MUST be at least as large as the largest size listed above */
#define SMALL_BUFFER_SIZE       8192
#else /* TEST_VALUE_RANGE != 0 */
/* Written by perf_test() before any test runs, hence not const. */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE       TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
69
70
/*
 * Size of each "uncached" buffer. Uncached accesses are measured by copying
 * to/from a random location inside a buffer of this size. Reduce the value if
 * buffer allocation fails.
 */
#define LARGE_BUFFER_SIZE       (100 * 1024 * 1024)

/*
 * Number of copies timed per measurement, and the number of copies issued
 * between two timestamp reads.
 */
#define TEST_ITERATIONS         1000000
#define TEST_BATCH_SIZE         100

/* Data is aligned on this many bytes (power of 2) */
#if defined(RTE_MACHINE_CPUFLAG_AVX512F)
#define ALIGNMENT_UNIT          64
#elif defined(RTE_MACHINE_CPUFLAG_AVX2)
#define ALIGNMENT_UNIT          32
#else
#define ALIGNMENT_UNIT          16
#endif

/*
 * Buffers used by the performance tests. The large pair is addressed at
 * random offsets to defeat the cache; the small pair is reused at a fixed
 * offset so that it stays cached.
 */
static uint8_t *large_buf_read;
static uint8_t *large_buf_write;
static uint8_t *small_buf_read;
static uint8_t *small_buf_write;
98
99 /* Initialise data buffers. */
100 static int
101 init_buffers(void)
102 {
103         unsigned i;
104
105         large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
106         if (large_buf_read == NULL)
107                 goto error_large_buf_read;
108
109         large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
110         if (large_buf_write == NULL)
111                 goto error_large_buf_write;
112
113         small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
114         if (small_buf_read == NULL)
115                 goto error_small_buf_read;
116
117         small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
118         if (small_buf_write == NULL)
119                 goto error_small_buf_write;
120
121         for (i = 0; i < LARGE_BUFFER_SIZE; i++)
122                 large_buf_read[i] = rte_rand();
123         for (i = 0; i < SMALL_BUFFER_SIZE; i++)
124                 small_buf_read[i] = rte_rand();
125
126         return 0;
127
128 error_small_buf_write:
129         rte_free(small_buf_read);
130 error_small_buf_read:
131         rte_free(large_buf_write);
132 error_large_buf_write:
133         rte_free(large_buf_read);
134 error_large_buf_read:
135         printf("ERROR: not enough memory\n");
136         return -1;
137 }
138
139 /* Cleanup data buffers */
140 static void
141 free_buffers(void)
142 {
143         rte_free(large_buf_read);
144         rte_free(large_buf_write);
145         rte_free(small_buf_read);
146         rte_free(small_buf_write);
147 }
148
149 /*
150  * Get a random offset into large array, with enough space needed to perform
151  * max copy size. Offset is aligned, uoffset is used for unalignment setting.
152  */
153 static inline size_t
154 get_rand_offset(size_t uoffset)
155 {
156         return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
157                         ~(ALIGNMENT_UNIT - 1)) + uoffset;
158 }
159
160 /* Fill in source and destination addresses. */
161 static inline void
162 fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
163                                  size_t *src_addr, int is_src_cached, size_t src_uoffset)
164 {
165         unsigned int i;
166
167         for (i = 0; i < TEST_BATCH_SIZE; i++) {
168                 dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset);
169                 src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset);
170         }
171 }
172
173 /*
174  * WORKAROUND: For some reason the first test doing an uncached write
175  * takes a very long time (~25 times longer than is expected). So we do
176  * it once without timing.
177  */
178 static void
179 do_uncached_write(uint8_t *dst, int is_dst_cached,
180                                   const uint8_t *src, int is_src_cached, size_t size)
181 {
182         unsigned i, j;
183         size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];
184
185         for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
186                 fill_addr_arrays(dst_addrs, is_dst_cached, 0,
187                                                  src_addrs, is_src_cached, 0);
188                 for (j = 0; j < TEST_BATCH_SIZE; j++) {
189                         rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
190                 }
191         }
192 }
193
/*
 * Run a single memcpy performance test and print its result.
 *
 * The same workload is timed twice, once with rte_memcpy() and once with
 * memcpy(). Offsets for each batch are generated before the timestamp is
 * taken, so address generation is not part of the measurement. The output is
 * the average ticks per copy for rte_memcpy(), the average for memcpy(), and
 * the relative difference of rte_memcpy() against memcpy() in percent.
 *
 * This is a macro to ensure that if the "size" parameter is a constant it
 * won't be converted to a variable. All parameters are parenthesised so that
 * expression arguments expand safely. Arguments must not use the names of the
 * macro's own locals (iter, t, dst_addrs, src_addrs, start_time, total_time,
 * total_time2).
 */
#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset,                   \
                         src, is_src_cached, src_uoffset, size)             \
do {                                                                        \
    unsigned int iter, t;                                                   \
    size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];          \
    uint64_t start_time, total_time = 0;                                    \
    uint64_t total_time2 = 0;                                               \
    for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) {    \
        fill_addr_arrays(dst_addrs, (is_dst_cached), (dst_uoffset),         \
                         src_addrs, (is_src_cached), (src_uoffset));        \
        start_time = rte_rdtsc();                                           \
        for (t = 0; t < TEST_BATCH_SIZE; t++)                               \
            rte_memcpy((dst) + dst_addrs[t], (src) + src_addrs[t], (size)); \
        total_time += rte_rdtsc() - start_time;                             \
    }                                                                       \
    for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) {    \
        fill_addr_arrays(dst_addrs, (is_dst_cached), (dst_uoffset),         \
                         src_addrs, (is_src_cached), (src_uoffset));        \
        start_time = rte_rdtsc();                                           \
        for (t = 0; t < TEST_BATCH_SIZE; t++)                               \
            memcpy((dst) + dst_addrs[t], (src) + src_addrs[t], (size));     \
        total_time2 += rte_rdtsc() - start_time;                            \
    }                                                                       \
    printf("%3.0f -", (double)total_time  / TEST_ITERATIONS);                 \
    printf("%3.0f",   (double)total_time2 / TEST_ITERATIONS);                 \
    printf("(%6.2f%%) ", ((double)total_time - total_time2)*100/total_time2); \
} while (0)
225
/*
 * Run aligned memcpy tests for each cached/uncached permutation.
 *
 * Prints one table row: the size (prefixed with 'C' when the compiler sees it
 * as a compile-time constant), followed by the four measurements in the order
 * cache->cache, cache->mem, mem->cache, mem->mem. No misalignment is applied
 * to either side. The parameter is parenthesised so the cast and the nested
 * macro always see the whole expression.
 */
#define ALL_PERF_TESTS_FOR_SIZE(n)                                       \
do {                                                                     \
    if (__builtin_constant_p(n))                                         \
        printf("\nC%6u", (unsigned)(n));                                 \
    else                                                                 \
        printf("\n%7u", (unsigned)(n));                                  \
    SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, (n));  \
    SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, (n));  \
} while (0)
238
/*
 * Run unaligned memcpy tests for each cached/uncached permutation.
 *
 * Same row layout as the aligned variant, but the destination is offset by 1
 * byte and the source by 5 bytes from an aligned address. The parameter is
 * parenthesised so the cast and the nested macro always see the whole
 * expression.
 */
#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n)                             \
do {                                                                     \
    if (__builtin_constant_p(n))                                         \
        printf("\nC%6u", (unsigned)(n));                                 \
    else                                                                 \
        printf("\n%7u", (unsigned)(n));                                  \
    SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, (n));  \
    SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, (n));  \
} while (0)
251
/*
 * Invoke TEST_CONSTANT once for each compile-time-constant copy length.
 * TEST_CONSTANT must be defined by the user of this macro.
 */
#define ALL_PERF_TEST_FOR_CONSTANT                                      \
do {                                                                    \
    TEST_CONSTANT(6U);                                                  \
    TEST_CONSTANT(64U);                                                 \
    TEST_CONSTANT(128U);                                                \
    TEST_CONSTANT(192U);                                                \
    TEST_CONSTANT(256U);                                                \
    TEST_CONSTANT(512U);                                                \
    TEST_CONSTANT(768U);                                                \
    TEST_CONSTANT(1024U);                                               \
    TEST_CONSTANT(1536U);                                               \
} while (0)
259
260 /* Run all memcpy tests for aligned constant cases */
261 static inline void
262 perf_test_constant_aligned(void)
263 {
264 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE
265         ALL_PERF_TEST_FOR_CONSTANT;
266 #undef TEST_CONSTANT
267 }
268
269 /* Run all memcpy tests for unaligned constant cases */
270 static inline void
271 perf_test_constant_unaligned(void)
272 {
273 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED
274         ALL_PERF_TEST_FOR_CONSTANT;
275 #undef TEST_CONSTANT
276 }
277
278 /* Run all memcpy tests for aligned variable cases */
279 static inline void
280 perf_test_variable_aligned(void)
281 {
282         unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
283         unsigned i;
284         for (i = 0; i < n; i++) {
285                 ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
286         }
287 }
288
289 /* Run all memcpy tests for unaligned variable cases */
290 static inline void
291 perf_test_variable_unaligned(void)
292 {
293         unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
294         unsigned i;
295         for (i = 0; i < n; i++) {
296                 ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
297         }
298 }
299
300 /* Run all memcpy tests */
301 static int
302 perf_test(void)
303 {
304         int ret;
305         struct timeval tv_begin, tv_end;
306         double time_aligned, time_unaligned;
307         double time_aligned_const, time_unaligned_const;
308
309         ret = init_buffers();
310         if (ret != 0)
311                 return ret;
312
313 #if TEST_VALUE_RANGE != 0
314         /* Set up buf_sizes array, if required */
315         unsigned i;
316         for (i = 0; i < TEST_VALUE_RANGE; i++)
317                 buf_sizes[i] = i;
318 #endif
319
320         /* See function comment */
321         do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE);
322
323         printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n"
324                    "======= ================= ================= ================= =================\n"
325                    "   Size   Cache to cache     Cache to mem      Mem to cache        Mem to mem\n"
326                    "(bytes)          (ticks)          (ticks)           (ticks)           (ticks)\n"
327                    "------- ----------------- ----------------- ----------------- -----------------");
328
329         printf("\n================================= %2dB aligned =================================",
330                 ALIGNMENT_UNIT);
331         /* Do aligned tests where size is a variable */
332         gettimeofday(&tv_begin, NULL);
333         perf_test_variable_aligned();
334         gettimeofday(&tv_end, NULL);
335         time_aligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
336                 + ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
337         printf("\n------- ----------------- ----------------- ----------------- -----------------");
338         /* Do aligned tests where size is a compile-time constant */
339         gettimeofday(&tv_begin, NULL);
340         perf_test_constant_aligned();
341         gettimeofday(&tv_end, NULL);
342         time_aligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
343                 + ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
344         printf("\n================================== Unaligned ==================================");
345         /* Do unaligned tests where size is a variable */
346         gettimeofday(&tv_begin, NULL);
347         perf_test_variable_unaligned();
348         gettimeofday(&tv_end, NULL);
349         time_unaligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
350                 + ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
351         printf("\n------- ----------------- ----------------- ----------------- -----------------");
352         /* Do unaligned tests where size is a compile-time constant */
353         gettimeofday(&tv_begin, NULL);
354         perf_test_constant_unaligned();
355         gettimeofday(&tv_end, NULL);
356         time_unaligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
357                 + ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
358         printf("\n======= ================= ================= ================= =================\n\n");
359
360         printf("Test Execution Time (seconds):\n");
361         printf("Aligned variable copy size   = %8.3f\n", time_aligned);
362         printf("Aligned constant copy size   = %8.3f\n", time_aligned_const);
363         printf("Unaligned variable copy size = %8.3f\n", time_unaligned);
364         printf("Unaligned constant copy size = %8.3f\n", time_unaligned_const);
365         free_buffers();
366
367         return 0;
368 }
369
/*
 * Test-command entry point: runs the full performance suite and maps any
 * non-zero result to -1.
 */
static int
test_memcpy_perf(void)
{
        return (perf_test() == 0) ? 0 : -1;
}

REGISTER_TEST_COMMAND(memcpy_perf_autotest, test_memcpy_perf);