From d5045e68a782d484e3f0e54edb4a88dc3dfac291 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 6 Apr 2022 21:16:37 +0200 Subject: [PATCH] vppinfra: introduce clib_perfmon Type: improvement Change-Id: I85a90774eb313020435c9bc2297c1bdf23d52efc Signed-off-by: Damjan Marion --- src/vppinfra/CMakeLists.txt | 4 + src/vppinfra/clib.h | 3 + src/vppinfra/perfmon/bundle_core_power.c | 48 +++++++ src/vppinfra/perfmon/bundle_default.c | 64 +++++++++ src/vppinfra/perfmon/perfmon.c | 230 +++++++++++++++++++++++++++++++ src/vppinfra/perfmon/perfmon.h | 117 ++++++++++++++++ src/vppinfra/vector/test/ip_csum.c | 26 ++-- src/vppinfra/vector/test/sha2.c | 7 +- src/vppinfra/vector/test/test.c | 163 +++------------------- src/vppinfra/vector/test/test.h | 30 ++-- src/vppinfra/vector/test/toeplitz.c | 54 ++++---- 11 files changed, 530 insertions(+), 216 deletions(-) create mode 100644 src/vppinfra/perfmon/bundle_core_power.c create mode 100644 src/vppinfra/perfmon/bundle_default.c create mode 100644 src/vppinfra/perfmon/perfmon.c create mode 100644 src/vppinfra/perfmon/perfmon.h diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt index 8f7129aa488..d7445b0982f 100644 --- a/src/vppinfra/CMakeLists.txt +++ b/src/vppinfra/CMakeLists.txt @@ -71,6 +71,9 @@ set(VPPINFRA_SRCS mhash.c mpcap.c pcap.c + perfmon/bundle_default.c + perfmon/bundle_core_power.c + perfmon/perfmon.c pmalloc.c pool.c ptclosure.c @@ -155,6 +158,7 @@ set(VPPINFRA_HEADERS os.h pcap.h pcap_funcs.h + perfmon/perfmon.h pmalloc.h pool.h ptclosure.h diff --git a/src/vppinfra/clib.h b/src/vppinfra/clib.h index 2db611d969b..746cb511bbe 100644 --- a/src/vppinfra/clib.h +++ b/src/vppinfra/clib.h @@ -95,6 +95,9 @@ /* Make a string from the macro's argument */ #define CLIB_STRING_MACRO(x) #x +#define CLIB_STRING_ARRAY(...)
\ + (char *[]) { __VA_ARGS__, 0 } + /* sanitizers */ #ifdef __has_feature #if __has_feature(address_sanitizer) diff --git a/src/vppinfra/perfmon/bundle_core_power.c b/src/vppinfra/perfmon/bundle_core_power.c new file mode 100644 index 00000000000..6a30cdfdde4 --- /dev/null +++ b/src/vppinfra/perfmon/bundle_core_power.c @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc. + */ + +#ifdef __x86_64__ + +#include +#include +#include + +static u8 * +format_perfmon_bundle_core_power (u8 *s, va_list *args) +{ + clib_perfmon_ctx_t __clib_unused *ctx = va_arg (*args, clib_perfmon_ctx_t *); + clib_perfmon_capture_t *c = va_arg (*args, clib_perfmon_capture_t *); + u32 col = va_arg (*args, int); + u64 *d = c->data; + + switch (col) + { + case 0: + return format (s, "%7.1f %%", (f64) 100 * d[1] / d[0]); + case 1: + return format (s, "%7.1f %%", (f64) 100 * d[2] / d[0]); + case 2: + return format (s, "%7.1f %%", (f64) 100 * d[3] / d[0]); + default: + return s; + } +} + +#define PERF_INTEL_CODE(event, umask) ((event) | (umask) << 8) + +CLIB_PERFMON_BUNDLE (core_power) = { + .name = "core-power", + .desc = + "Core cycles where the core was running under specific turbo schedule.", + .type = PERF_TYPE_RAW, + .config[0] = PERF_INTEL_CODE (0x3c, 0x00), + .config[1] = PERF_INTEL_CODE (0x28, 0x07), + .config[2] = PERF_INTEL_CODE (0x28, 0x18), + .config[3] = PERF_INTEL_CODE (0x28, 0x20), + .n_events = 4, + .format_fn = format_perfmon_bundle_core_power, + .column_headers = CLIB_STRING_ARRAY ("Level 0", "Level 1", "Level 2"), +}; + +#endif diff --git a/src/vppinfra/perfmon/bundle_default.c b/src/vppinfra/perfmon/bundle_default.c new file mode 100644 index 00000000000..b5282c51740 --- /dev/null +++ b/src/vppinfra/perfmon/bundle_default.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc. 
+ */ + +#include +#include +#include +/* Format cell 'col' of one capture; args: ctx, capture, column index. */ +static u8 * +format_perfmon_bundle_default (u8 *s, va_list *args) +{ + clib_perfmon_ctx_t *ctx = va_arg (*args, clib_perfmon_ctx_t *); + clib_perfmon_capture_t *c = va_arg (*args, clib_perfmon_capture_t *); + u32 col = va_arg (*args, int); + u64 *d = c->data; + + switch (col) + { + case 0: + if (ctx->ref_clock > 0) + return format (s, "%8.1f", (f64) d[0] / d[1] * (ctx->ref_clock / 1e9)); + else + return s; + case 1: + return format (s, "%5.2f", (f64) d[2] / d[0]); + case 2: + if (c->n_ops > 1) + return format (s, "%8.2f", (f64) d[0] / c->n_ops); + else + return format (s, "%8lu", d[0]); /* d[] is u64: needs %lu, not %u */ + case 3: + if (c->n_ops > 1) + return format (s, "%8.2f", (f64) d[2] / c->n_ops); + else + return format (s, "%8lu", d[2]); + case 4: + if (c->n_ops > 1) + return format (s, "%9.2f", (f64) d[3] / c->n_ops); + else + return format (s, "%9lu", d[3]); + case 5: + if (c->n_ops > 1) + return format (s, "%10.2f", (f64) d[4] / c->n_ops); + else + return format (s, "%10lu", d[4]); + default: + return s; + } +} + +CLIB_PERFMON_BUNDLE (default) = { + .name = "default", + .desc = "IPC, Clocks/Operation, Instr/Operation, Branch Total & Miss", + .type = PERF_TYPE_HARDWARE, + .config[0] = PERF_COUNT_HW_CPU_CYCLES, + .config[1] = PERF_COUNT_HW_REF_CPU_CYCLES, + .config[2] = PERF_COUNT_HW_INSTRUCTIONS, + .config[3] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + .config[4] = PERF_COUNT_HW_BRANCH_MISSES, + .n_events = 5, + .format_fn = format_perfmon_bundle_default, + .column_headers = CLIB_STRING_ARRAY ("Freq", "IPC", "Clks/Op", "Inst/Op", + "Brnch/Op", "BrMiss/Op"), +}; diff --git a/src/vppinfra/perfmon/perfmon.c b/src/vppinfra/perfmon/perfmon.c new file mode 100644 index 00000000000..9ec90b88d67 --- /dev/null +++ b/src/vppinfra/perfmon/perfmon.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc.
+ */ + +#include +#include +#include +#include + +clib_perfmon_main_t clib_perfmon_main; +/* Bind ctx to the named bundle and open its perf events; 0 on success. */ +__clib_export clib_error_t * +clib_perfmon_init_by_bundle_name (clib_perfmon_ctx_t *ctx, char *fmt, ...) +{ + clib_perfmon_main_t *pm = &clib_perfmon_main; + clib_perfmon_bundle_t *b = 0; + int group_fd = -1; + clib_error_t *err = 0; + va_list va; + char *bundle_name; + + struct perf_event_attr pe = { + .size = sizeof (struct perf_event_attr), + .disabled = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .pinned = 1, + .exclusive = 1, + .read_format = (PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING), + }; + + va_start (va, fmt); + bundle_name = (char *) va_format (0, fmt, &va); + va_end (va); + vec_add1 (bundle_name, 0); + clib_memset_u8 (ctx, 0, sizeof (clib_perfmon_ctx_t)); /* 'done:' frees ctx */ + + for (clib_perfmon_bundle_reg_t *r = pm->bundle_regs; r; r = r->next) + { + if (strncmp (r->bundle->name, bundle_name, vec_len (bundle_name) - 1)) + continue; + b = r->bundle; + break; + } + + if (b == 0) + { + err = clib_error_return (0, "Unknown bundle '%s'", bundle_name); + goto done; + } + + vec_validate_init_empty (ctx->fds, b->n_events - 1, -1); + ctx->bundle = b; + + for (int i = 0; i < b->n_events; i++) + { + pe.config = b->config[i]; + pe.type = b->type; + int fd = syscall (__NR_perf_event_open, &pe, /* pid */ 0, /* cpu */ -1, + /* group_fd */ group_fd, /* flags */ 0); + if (fd < 0) + { + err = clib_error_return_unix (0, "perf_event_open[%u]", i); + goto done; + } + + if (ctx->debug) + fformat (stderr, "perf event %u open, fd %d\n", i, fd); + + if (group_fd == -1) + { + group_fd = fd; + pe.pinned = 0; + pe.exclusive = 0; + } + + ctx->fds[i] = fd; + } + + ctx->group_fd = group_fd; + ctx->data = vec_new (u64, 3 + b->n_events); + ctx->ref_clock = os_cpu_clock_frequency (); + vec_validate (ctx->capture_groups, 0); + +done: + if (err) + clib_perfmon_free (ctx); + + vec_free (bundle_name); + return err; +} + +__clib_export void +clib_perfmon_free
(clib_perfmon_ctx_t *ctx) +{ + clib_perfmon_clear (ctx); + vec_free (ctx->captures); + vec_free (ctx->capture_groups); + + for (int i = 0; i < vec_len (ctx->fds); i++) + if (ctx->fds[i] > -1) + close (ctx->fds[i]); + vec_free (ctx->fds); + vec_free (ctx->data); +} + +__clib_export void +clib_perfmon_clear (clib_perfmon_ctx_t *ctx) +{ + for (int i = 0; i < vec_len (ctx->captures); i++) + vec_free (ctx->captures[i].desc); + vec_reset_length (ctx->captures); + for (int i = 0; i < vec_len (ctx->capture_groups); i++) + vec_free (ctx->capture_groups[i].name); + vec_reset_length (ctx->capture_groups); +} + +__clib_export u64 * +clib_perfmon_capture (clib_perfmon_ctx_t *ctx, u32 n_ops, char *fmt, ...) +{ + u32 read_size = (ctx->bundle->n_events + 3) * sizeof (u64); + clib_perfmon_capture_t *c; + u64 d[CLIB_PERFMON_MAX_EVENTS + 3]; + va_list va; + + if ((read (ctx->group_fd, d, read_size) != read_size)) + { + if (ctx->debug) + fformat (stderr, "reading of %u bytes failed, %s (%d)\n", read_size, + strerror (errno), errno); + return 0; + } + + if (ctx->debug) + { + fformat (stderr, "read events: %lu enabled: %lu running: %lu ", d[0], + d[1], d[2]); + fformat (stderr, "data: [%lu", d[3]); + for (int i = 1; i < ctx->bundle->n_events; i++) + fformat (stderr, ", %lu", d[i + 3]); + fformat (stderr, "]\n"); + } + + vec_add2 (ctx->captures, c, 1); + + va_start (va, fmt); + c->desc = va_format (0, fmt, &va); + va_end (va); + + c->n_ops = n_ops; + c->group = vec_len (ctx->capture_groups) - 1; + c->time_enabled = d[1]; + c->time_running = d[2]; + for (int i = 0; i < CLIB_PERFMON_MAX_EVENTS; i++) + c->data[i] = d[i + 3]; + + return ctx->data + vec_len (ctx->data) - ctx->bundle->n_events; +} + +__clib_export void +clib_perfmon_capture_group (clib_perfmon_ctx_t *ctx, char *fmt, ...) 
+{ + clib_perfmon_capture_group_t *cg; + va_list va; + + /* vector is empty after clib_perfmon_clear (): no last element to test */ + cg = vec_len (ctx->capture_groups) ? vec_end (ctx->capture_groups) - 1 : 0; + if (cg == 0 || cg->name != 0) + vec_add2 (ctx->capture_groups, cg, 1); + + va_start (va, fmt); + cg->name = va_format (0, fmt, &va); + va_end (va); + ASSERT (cg->name); +} + +__clib_export void +clib_perfmon_warmup (clib_perfmon_ctx_t *ctx) +{ + for (u64 i = 0; i < (u64) ctx->ref_clock; i++) + asm inline("" : : "r"(i * i) : "memory"); +} + +__clib_export u8 * +format_perfmon_bundle (u8 *s, va_list *args) +{ + clib_perfmon_ctx_t *ctx = va_arg (*args, clib_perfmon_ctx_t *); + clib_perfmon_capture_t *c; + clib_perfmon_capture_group_t *cg = 0; + char **hdr = ctx->bundle->column_headers; + table_t _t = {}, *t = &_t; + u32 n_row = 0, col = 0; + + table_add_header_row (t, 0); + + for (char **h = ctx->bundle->column_headers; h[0]; h++) + n_row++; + + vec_foreach (c, ctx->captures) + { + if (cg != ctx->capture_groups + c->group) + { + cg = ctx->capture_groups + c->group; + table_format_cell (t, col, -1, "%v", cg->name); + table_set_cell_align (t, col, -1, TTAA_LEFT); + table_set_cell_fg_color (t, col, -1, TTAC_BRIGHT_RED); + + table_format_cell (t, col, 0, "Ops"); + table_set_cell_fg_color (t, col, 0, TTAC_BRIGHT_YELLOW); + + for (int i = 0; i < n_row; i++) + { + table_format_cell (t, col, i + 1, "%s", hdr[i]); + table_set_cell_fg_color (t, col, i + 1, TTAC_BRIGHT_YELLOW); + } + col++; + } + table_format_cell (t, col, -1, "%v", c->desc); + table_format_cell (t, col, 0, "%7u", c->n_ops); + for (int i = 0; i < n_row; i++) + table_format_cell (t, col, i + 1, "%U", ctx->bundle->format_fn, ctx, c, + i); + col++; + } + + s = format (s, "%U", format_table, t); + table_free (t); + return s; +} diff --git a/src/vppinfra/perfmon/perfmon.h b/src/vppinfra/perfmon/perfmon.h new file mode 100644 index 00000000000..0d09dc6fb23 --- /dev/null +++ b/src/vppinfra/perfmon/perfmon.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc.
+ */ + +#ifndef included_perfmon_perfmon_h +#define included_perfmon_perfmon_h + +#include +#ifdef __linux__ +#include +#include +#endif + +#define CLIB_PERFMON_MAX_EVENTS 7 +typedef struct +{ + char *name; + char *desc; + u64 config[CLIB_PERFMON_MAX_EVENTS]; + u32 type; + u8 n_events; + format_function_t *format_fn; + char **column_headers; +} clib_perfmon_bundle_t; + +typedef struct +{ + u64 time_enabled; + u64 time_running; + u64 data[CLIB_PERFMON_MAX_EVENTS]; + u8 *desc; + u32 n_ops; + u32 group; +} clib_perfmon_capture_t; + +typedef struct +{ + u8 *name; + u32 start; +} clib_perfmon_capture_group_t; + +typedef struct +{ + int group_fd; + int *fds; + clib_perfmon_bundle_t *bundle; + u64 *data; + u8 debug : 1; + u32 n_captures; + clib_perfmon_capture_t *captures; + clib_perfmon_capture_group_t *capture_groups; + f64 ref_clock; +} clib_perfmon_ctx_t; + +typedef struct clib_perfmon_bundle_reg +{ + clib_perfmon_bundle_t *bundle; + struct clib_perfmon_bundle_reg *next; +} clib_perfmon_bundle_reg_t; + +typedef struct +{ + clib_perfmon_bundle_reg_t *bundle_regs; +} clib_perfmon_main_t; + +extern clib_perfmon_main_t clib_perfmon_main; + +static_always_inline void +clib_perfmon_ioctl (int fd, u32 req) +{ +#ifdef __x86_64__ + asm inline("syscall" + : + : "D"(fd), "S"(req), "a"(__NR_ioctl), "d"(PERF_IOC_FLAG_GROUP) + : "rcx", "r11" /* registers modified by kernel */); +#else + ioctl (fd, req, PERF_IOC_FLAG_GROUP); +#endif +} + +clib_error_t *clib_perfmon_init_by_bundle_name (clib_perfmon_ctx_t *ctx, + char *fmt, ...); +void clib_perfmon_free (clib_perfmon_ctx_t *ctx); +void clib_perfmon_warmup (clib_perfmon_ctx_t *ctx); +void clib_perfmon_clear (clib_perfmon_ctx_t *ctx); +u64 *clib_perfmon_capture (clib_perfmon_ctx_t *ctx, u32 n_ops, char *fmt, ...); +void clib_perfmon_capture_group (clib_perfmon_ctx_t *ctx, char *fmt, ...); +format_function_t format_perfmon_bundle; + +static_always_inline void +clib_perfmon_reset (clib_perfmon_ctx_t *ctx) +{ + clib_perfmon_ioctl 
(ctx->group_fd, PERF_EVENT_IOC_RESET); +} +static_always_inline void +clib_perfmon_enable (clib_perfmon_ctx_t *ctx) +{ + clib_perfmon_ioctl (ctx->group_fd, PERF_EVENT_IOC_ENABLE); +} +static_always_inline void +clib_perfmon_disable (clib_perfmon_ctx_t *ctx) +{ + clib_perfmon_ioctl (ctx->group_fd, PERF_EVENT_IOC_DISABLE); +} + +#define CLIB_PERFMON_BUNDLE(x) \ + static clib_perfmon_bundle_reg_t clib_perfmon_bundle_reg_##x; \ + static clib_perfmon_bundle_t clib_perfmon_bundle_##x; \ + static void __clib_constructor clib_perfmon_bundle_reg_fn_##x (void) \ + { \ + clib_perfmon_bundle_reg_##x.bundle = &clib_perfmon_bundle_##x; \ + clib_perfmon_bundle_reg_##x.next = clib_perfmon_main.bundle_regs; \ + clib_perfmon_main.bundle_regs = &clib_perfmon_bundle_reg_##x; \ + } \ + static clib_perfmon_bundle_t clib_perfmon_bundle_##x + +#endif diff --git a/src/vppinfra/vector/test/ip_csum.c b/src/vppinfra/vector/test/ip_csum.c index cb33c036120..17a606523f2 100644 --- a/src/vppinfra/vector/test/ip_csum.c +++ b/src/vppinfra/vector/test/ip_csum.c @@ -115,48 +115,48 @@ done: } void __test_perf_fn -perftest_ip4_hdr (int fd, test_perf_t *tp) +perftest_ip4_hdr (test_perf_t *tp) { u32 n = tp->n_ops; u8 *data = test_mem_alloc_and_splat (20, n, (void *) &test1); u16 *res = test_mem_alloc (n * sizeof (u16)); - test_perf_event_enable (fd); + test_perf_event_enable (tp); for (int i = 0; i < n; i++) res[i] = clib_ip_csum (data + i * 20, 20); - test_perf_event_disable (fd); + test_perf_event_disable (tp); test_mem_free (data); test_mem_free (res); } void __test_perf_fn -perftest_tcp_payload (int fd, test_perf_t *tp) +perftest_tcp_payload (test_perf_t *tp) { u32 n = tp->n_ops; volatile uword *lenp = &tp->arg0; u8 *data = test_mem_alloc_and_splat (20, n, (void *) &test1); u16 *res = test_mem_alloc (n * sizeof (u16)); - test_perf_event_enable (fd); + test_perf_event_enable (tp); for (int i = 0; i < n; i++) res[i] = clib_ip_csum (data + i * lenp[0], lenp[0]); - test_perf_event_disable (fd); + 
test_perf_event_disable (tp); test_mem_free (data); test_mem_free (res); } void __test_perf_fn -perftest_byte (int fd, test_perf_t *tp) +perftest_byte (test_perf_t *tp) { volatile uword *np = &tp->n_ops; u8 *data = test_mem_alloc_and_fill_inc_u8 (*np, 0, 0); u16 *res = test_mem_alloc (sizeof (u16)); - test_perf_event_enable (fd); + test_perf_event_enable (tp); res[0] = clib_ip_csum (data, np[0]); - test_perf_event_disable (fd); + test_perf_event_disable (tp); test_mem_free (data); test_mem_free (res); @@ -166,16 +166,14 @@ REGISTER_TEST (clib_ip_csum) = { .name = "clib_ip_csum", .fn = test_clib_ip_csum, .perf_tests = PERF_TESTS ( - { .name = "ip4_hdr", - .op_name = "IP4Hdr", + { .name = "fixed size (per IPv4 Header)", .n_ops = 1024, .fn = perftest_ip4_hdr }, - { .name = "tcp_paylaad", - .op_name = "1460Byte", + { .name = "fixed size (per 1460 byte block)", .n_ops = 16, .arg0 = 1460, .fn = perftest_tcp_payload }, - { .name = "byte", .op_name = "Byte", .n_ops = 16384, .fn = perftest_byte } + { .name = "variable size (per byte)", .n_ops = 16384, .fn = perftest_byte } ), }; diff --git a/src/vppinfra/vector/test/sha2.c b/src/vppinfra/vector/test/sha2.c index 58fb2e74401..81365792063 100644 --- a/src/vppinfra/vector/test/sha2.c +++ b/src/vppinfra/vector/test/sha2.c @@ -293,7 +293,7 @@ check_digest (clib_error_t *err, int tc, u8 *calculated, const u8 *expected, return err; \ } \ \ - void __test_perf_fn perftest_sha##bits##_byte (int fd, test_perf_t *tp) \ + void __test_perf_fn perftest_sha##bits##_byte (test_perf_t *tp) \ { \ volatile uword *np = &tp->n_ops; \ volatile uword *kl = &tp->arg0; \ @@ -302,9 +302,9 @@ check_digest (clib_error_t *err, int tc, u8 *calculated, const u8 *expected, u8 *data = test_mem_alloc_and_fill_inc_u8 (*np, 0, 0); \ u8 *digest = test_mem_alloc (64); \ \ - test_perf_event_enable (fd); \ + test_perf_event_enable (tp); \ clib_hmac_sha##bits (key, *kl, data, *np, digest); \ - test_perf_event_disable (fd); \ + test_perf_event_disable (tp); \ \ 
test_mem_free (key); \ test_mem_free (data); \ @@ -314,7 +314,6 @@ check_digest (clib_error_t *err, int tc, u8 *calculated, const u8 *expected, .name = "clib_hmac_sha" #bits, \ .fn = test_clib_hmac_sha##bits, \ .perf_tests = PERF_TESTS ({ .name = "byte", \ - .op_name = "Byte", \ .n_ops = 16384, \ .arg0 = 20, \ .fn = perftest_sha##bits##_byte }) \ diff --git a/src/vppinfra/vector/test/test.c b/src/vppinfra/vector/test/test.c index 51b6bbf4bb2..dc5651c47cc 100644 --- a/src/vppinfra/vector/test/test.c +++ b/src/vppinfra/vector/test/test.c @@ -53,61 +53,7 @@ test_funct (test_main_t *tm) return 0; } -#define TEST_PERF_MAX_EVENTS 7 -typedef struct -{ - char *name; - char *desc; - u64 config[TEST_PERF_MAX_EVENTS]; - u32 type; - u8 n_events; - format_function_t *format_fn; -} test_perf_event_bundle_t; - -static u8 * -format_test_perf_bundle_default (u8 *s, va_list *args) -{ - test_main_t *tm = &test_main; - test_perf_event_bundle_t __clib_unused *b = - va_arg (*args, test_perf_event_bundle_t *); - test_perf_t *tp = va_arg (*args, test_perf_t *); - u64 *data = va_arg (*args, u64 *); - - if (tm->ref_clock > 0) - { - if (data) - s = format (s, "%8.1f", tm->ref_clock * data[0] / data[1] / 1e9); - else - s = format (s, "%8s", "Freq"); - } - - if (data) - s = format (s, "%5.2f", (f64) data[2] / data[0]); - else - s = format (s, "%5s", "IPC"); - - if (data) - s = format (s, "%8.2f", (f64) data[0] / tp->n_ops); - else - s = format (s, "%8s", "Clks/Op"); - - if (data) - s = format (s, "%8.2f", (f64) data[2] / tp->n_ops); - else - s = format (s, "%8s", "Inst/Op"); - - if (data) - s = format (s, "%9.2f", (f64) data[3] / tp->n_ops); - else - s = format (s, "%9s", "Brnch/Op"); - - if (data) - s = format (s, "%10.2f", (f64) data[4] / tp->n_ops); - else - s = format (s, "%10s", "BrMiss/Op"); - return s; -} - +#if 0 static u8 * format_test_perf_bundle_core_power (u8 *s, va_list *args) { @@ -134,19 +80,6 @@ format_test_perf_bundle_core_power (u8 *s, va_list *args) return s; } 
-test_perf_event_bundle_t perf_bundles[] = { - { - .name = "default", - .desc = "IPC, Clocks/Operatiom, Instr/Operation, Branch Total & Miss", - .type = PERF_TYPE_HARDWARE, - .config[0] = PERF_COUNT_HW_CPU_CYCLES, - .config[1] = PERF_COUNT_HW_REF_CPU_CYCLES, - .config[2] = PERF_COUNT_HW_INSTRUCTIONS, - .config[3] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, - .config[4] = PERF_COUNT_HW_BRANCH_MISSES, - .n_events = 5, - .format_fn = format_test_perf_bundle_default, - } #ifdef __x86_64__ #define PERF_INTEL_CODE(event, umask) ((event) | (umask) << 8) , @@ -165,69 +98,21 @@ test_perf_event_bundle_t perf_bundles[] = { } #endif }; +#endif #ifdef __linux__ clib_error_t * test_perf (test_main_t *tm) { clib_error_t *err = 0; - test_perf_event_bundle_t *b = 0; - int group_fd = -1, fds[TEST_PERF_MAX_EVENTS]; - u64 count[TEST_PERF_MAX_EVENTS + 3] = {}; - struct perf_event_attr pe = { - .size = sizeof (struct perf_event_attr), - .disabled = 1, - .exclude_kernel = 1, - .exclude_hv = 1, - .pinned = 1, - .exclusive = 1, - .read_format = (PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING), - }; - - for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++) - fds[i] = -1; - - tm->ref_clock = os_cpu_clock_frequency (); - - if (tm->bundle) - { - for (int i = 0; i < ARRAY_LEN (perf_bundles); i++) - if (strncmp ((char *) tm->bundle, perf_bundles[i].name, - vec_len (tm->bundle)) == 0) - { - b = perf_bundles + i; - break; - } - if (b == 0) - return clib_error_return (0, "Unknown bundle '%s'", tm->bundle); - } - else - b = perf_bundles; + clib_perfmon_ctx_t _ctx, *ctx = &_ctx; - for (int i = 0; i < b->n_events; i++) - { - pe.config = b->config[i]; - pe.type = b->type; - int fd = syscall (__NR_perf_event_open, &pe, /* pid */ 0, /* cpu */ -1, - /* group_fd */ group_fd, /* flags */ 0); - if (fd < 0) - { - err = clib_error_return_unix (0, "perf_event_open"); - goto done; - } + if ((err = clib_perfmon_init_by_bundle_name ( + ctx, "%s", tm->bundle ? 
(char *) tm->bundle : "default"))) + return err; - if (group_fd == -1) - { - group_fd = fd; - pe.pinned = 0; - pe.exclusive = 0; - } - fds[i] = fd; - } fformat (stdout, "Warming up...\n"); - for (u64 i = 0; i < (u64) tm->ref_clock; i++) - asm inline("" : : "r"(i * i) : "memory"); + clib_perfmon_warmup (ctx); for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++) { @@ -246,31 +131,16 @@ test_perf (test_main_t *tm) test_perf_t *pt = r->perf_tests; if (tm->filter && strstr (r->name, (char *) tm->filter) == 0) goto next; - fformat (stdout, "%-22s%-12s%U\n", r->name, "OpType", - b->format_fn, b, pt, 0UL); + + clib_perfmon_capture_group (ctx, "%s", r->name); do { - u32 read_size = (b->n_events + 3) * sizeof (u64); for (int i = 0; i < tm->repeat; i++) { - test_perf_event_reset (group_fd); - pt->fn (group_fd, pt); - if ((read (group_fd, &count, read_size) != read_size)) - { - err = clib_error_return_unix (0, "read"); - goto done; - } - if (count[1] != count[2]) - clib_warning ( - "perf counters were not running all the time." -#ifdef __x86_64__ - "\nConsider turning NMI watchdog off ('sysctl -w " - "kernel.nmi_watchdog=0')." -#endif - ); - fformat (stdout, " %-20s%-12s%U\n", pt->name, - pt->op_name ? 
pt->op_name : "", b->format_fn, b, - pt, count + 3); + pt->fd = ctx->group_fd; + clib_perfmon_reset (ctx); + pt->fn (pt); + clib_perfmon_capture (ctx, pt->n_ops, "%0s", pt->name); } } while ((++pt)->fn); @@ -278,12 +148,11 @@ test_perf (test_main_t *tm) next: r = r->next; } + fformat (stdout, "%U\n", format_perfmon_bundle, ctx); + clib_perfmon_clear (ctx); } -done: - for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++) - if (fds[i] != -1) - close (fds[i]); + clib_perfmon_free (ctx); return err; } #endif diff --git a/src/vppinfra/vector/test/test.h b/src/vppinfra/vector/test/test.h index 4511bf31fa9..7d54d80c6ec 100644 --- a/src/vppinfra/vector/test/test.h +++ b/src/vppinfra/vector/test/test.h @@ -6,6 +6,7 @@ #define included_test_test_h #include +#include #ifdef __linux__ #include #include @@ -14,10 +15,11 @@ typedef clib_error_t *(test_fn_t) (clib_error_t *); struct test_perf_; -typedef void (test_perf_fn_t) (int fd, struct test_perf_ *tp); +typedef void (test_perf_fn_t) (struct test_perf_ *tp); typedef struct test_perf_ { + int fd; u64 n_ops; union { @@ -34,7 +36,6 @@ typedef struct test_perf_ u64 arg2; void *ptr2; }; - char *op_name; char *name; test_perf_fn_t *fn; } test_perf_t; @@ -83,32 +84,19 @@ extern test_main_t test_main; } static_always_inline void -test_perf_event_ioctl (int fd, u32 req) +test_perf_event_reset (test_perf_t *t) { -#ifdef __x86_64__ - asm inline("syscall" - : - : "D"(fd), "S"(req), "a"(__NR_ioctl), "d"(PERF_IOC_FLAG_GROUP) - : "rcx", "r11" /* registers modified by kernel */); -#else - ioctl (fd, req, PERF_IOC_FLAG_GROUP); -#endif -} - -static_always_inline void -test_perf_event_reset (int fd) -{ - test_perf_event_ioctl (fd, PERF_EVENT_IOC_RESET); + clib_perfmon_ioctl (t->fd, PERF_EVENT_IOC_RESET); } static_always_inline void -test_perf_event_enable (int fd) +test_perf_event_enable (test_perf_t *t) { - test_perf_event_ioctl (fd, PERF_EVENT_IOC_ENABLE); + clib_perfmon_ioctl (t->fd, PERF_EVENT_IOC_ENABLE); } static_always_inline void 
-test_perf_event_disable (int fd) +test_perf_event_disable (test_perf_t *t) { - test_perf_event_ioctl (fd, PERF_EVENT_IOC_DISABLE); + clib_perfmon_ioctl (t->fd, PERF_EVENT_IOC_DISABLE); } void *test_mem_alloc (uword size); diff --git a/src/vppinfra/vector/test/toeplitz.c b/src/vppinfra/vector/test/toeplitz.c index d425a443eec..fbe4275f9fa 100644 --- a/src/vppinfra/vector/test/toeplitz.c +++ b/src/vppinfra/vector/test/toeplitz.c @@ -259,17 +259,17 @@ done: } void __test_perf_fn -perftest_fixed_12byte (int fd, test_perf_t *tp) +perftest_fixed_12byte (test_perf_t *tp) { u32 n = tp->n_ops; u8 *data = test_mem_alloc_and_splat (12, n, (void *) &ip4_tests[0].key); u8 *res = test_mem_alloc (4 * n); clib_toeplitz_hash_key_t *k = clib_toeplitz_hash_key_init (0, 0); - test_perf_event_enable (fd); + test_perf_event_enable (tp); for (int i = 0; i < n; i++) ((u32 *) res)[i] = clib_toeplitz_hash (k, data + i * 12, 12); - test_perf_event_disable (fd); + test_perf_event_disable (tp); clib_toeplitz_hash_key_free (k); test_mem_free (data); @@ -277,17 +277,17 @@ perftest_fixed_12byte (int fd, test_perf_t *tp) } void __test_perf_fn -perftest_fixed_36byte (int fd, test_perf_t *tp) +perftest_fixed_36byte (test_perf_t *tp) { u32 n = tp->n_ops; u8 *data = test_mem_alloc_and_splat (36, n, (void *) &ip6_tests[0].key); u8 *res = test_mem_alloc (4 * n); clib_toeplitz_hash_key_t *k = clib_toeplitz_hash_key_init (0, 0); - test_perf_event_enable (fd); + test_perf_event_enable (tp); for (int i = 0; i < n; i++) ((u32 *) res)[i] = clib_toeplitz_hash (k, data + i * 36, 36); - test_perf_event_disable (fd); + test_perf_event_disable (tp); clib_toeplitz_hash_key_free (k); test_mem_free (data); @@ -295,7 +295,7 @@ perftest_fixed_36byte (int fd, test_perf_t *tp) } void __test_perf_fn -perftest_variable_size (int fd, test_perf_t *tp) +perftest_variable_size (test_perf_t *tp) { u32 key_len, n_keys, n = tp->n_ops; u8 *key, *data = test_mem_alloc (n); @@ -309,9 +309,9 @@ perftest_variable_size (int fd, 
test_perf_t *tp) clib_toeplitz_hash_key_free (k); k = clib_toeplitz_hash_key_init (key, key_len * n_keys); - test_perf_event_enable (fd); + test_perf_event_enable (tp); res[0] = clib_toeplitz_hash (k, data, n); - test_perf_event_disable (fd); + test_perf_event_disable (tp); clib_toeplitz_hash_key_free (k); test_mem_free (data); @@ -322,16 +322,13 @@ perftest_variable_size (int fd, test_perf_t *tp) REGISTER_TEST (clib_toeplitz_hash) = { .name = "clib_toeplitz_hash", .fn = test_clib_toeplitz_hash, - .perf_tests = PERF_TESTS ({ .name = "fixed_12", - .op_name = "12B Tuple", + .perf_tests = PERF_TESTS ({ .name = "fixed (per 12 byte tuple)", .n_ops = 1024, .fn = perftest_fixed_12byte }, - { .name = "fixed_36", - .op_name = "36B Tuple", + { .name = "fixed (per 36 byte tuple)", .n_ops = 1024, .fn = perftest_fixed_36byte }, - { .name = "variable_size", - .op_name = "Byte", + { .name = "variable size (per byte)", .n_ops = 16384, .fn = perftest_variable_size }), }; @@ -442,7 +439,7 @@ done: } void __test_perf_fn -perftest_fixed_12byte_x4 (int fd, test_perf_t *tp) +perftest_fixed_12byte_x4 (test_perf_t *tp) { u32 n = tp->n_ops / 4; u8 *d0 = test_mem_alloc_and_splat (12, n, (void *) &ip4_tests[0].key); @@ -455,11 +452,11 @@ perftest_fixed_12byte_x4 (int fd, test_perf_t *tp) u32 *h3 = test_mem_alloc (4 * n); clib_toeplitz_hash_key_t *k = clib_toeplitz_hash_key_init (0, 0); - test_perf_event_enable (fd); + test_perf_event_enable (tp); for (int i = 0; i < n; i++) clib_toeplitz_hash_x4 (k, d0 + i * 12, d1 + i * 12, d2 + i * 12, d3 + i * 12, h0 + i, h1 + i, h2 + i, h3 + i, 12); - test_perf_event_disable (fd); + test_perf_event_disable (tp); clib_toeplitz_hash_key_free (k); test_mem_free (d0); @@ -473,7 +470,7 @@ perftest_fixed_12byte_x4 (int fd, test_perf_t *tp) } void __test_perf_fn -perftest_fixed_36byte_x4 (int fd, test_perf_t *tp) +perftest_fixed_36byte_x4 (test_perf_t *tp) { u32 n = tp->n_ops / 4; u8 *d0 = test_mem_alloc_and_splat (36, n, (void *) &ip4_tests[0].key); @@ -486,11 
+483,11 @@ perftest_fixed_36byte_x4 (int fd, test_perf_t *tp) u32 *h3 = test_mem_alloc (4 * n); clib_toeplitz_hash_key_t *k = clib_toeplitz_hash_key_init (0, 0); - test_perf_event_enable (fd); + test_perf_event_enable (tp); for (int i = 0; i < n; i++) clib_toeplitz_hash_x4 (k, d0 + i * 36, d1 + i * 36, d2 + i * 36, d3 + i * 36, h0 + i, h1 + i, h2 + i, h3 + i, 36); - test_perf_event_disable (fd); + test_perf_event_disable (tp); clib_toeplitz_hash_key_free (k); test_mem_free (d0); @@ -504,7 +501,7 @@ perftest_fixed_36byte_x4 (int fd, test_perf_t *tp) } void __test_perf_fn -perftest_variable_size_x4 (int fd, test_perf_t *tp) +perftest_variable_size_x4 (test_perf_t *tp) { u32 key_len, n_keys, n = tp->n_ops / 4; u8 *key; @@ -525,9 +522,9 @@ perftest_variable_size_x4 (int fd, test_perf_t *tp) clib_toeplitz_hash_key_free (k); k = clib_toeplitz_hash_key_init (key, key_len * n_keys); - test_perf_event_enable (fd); + test_perf_event_enable (tp); clib_toeplitz_hash_x4 (k, d0, d1, d2, d3, h0, h1, h2, h3, n); - test_perf_event_disable (fd); + test_perf_event_disable (tp); clib_toeplitz_hash_key_free (k); test_mem_free (key); @@ -544,16 +541,13 @@ perftest_variable_size_x4 (int fd, test_perf_t *tp) REGISTER_TEST (clib_toeplitz_hash_x4) = { .name = "clib_toeplitz_hash_x4", .fn = test_clib_toeplitz_hash_x4, - .perf_tests = PERF_TESTS ({ .name = "fixed_12", - .op_name = "12B Tuple", + .perf_tests = PERF_TESTS ({ .name = "fixed (per 12 byte tuple)", .n_ops = 1024, .fn = perftest_fixed_12byte_x4 }, - { .name = "fixed_36", - .op_name = "36B Tuple", + { .name = "fixed (per 36 byte tuple)", .n_ops = 1024, .fn = perftest_fixed_36byte_x4 }, - { .name = "variable_size", - .op_name = "Byte", + { .name = "variable size (per byte)", .n_ops = 16384, .fn = perftest_variable_size_x4 }), }; -- 2.16.6