perfmon: topdown backend bound core bundle 39/35139/2
author Ray Kinsella <mdr@ashroe.eu>
Fri, 28 Jan 2022 08:56:57 +0000 (08:56 +0000)
committer Damjan Marion <dmarion@me.com>
Sun, 30 Jan 2022 14:43:34 +0000 (14:43 +0000)
Add a bundle to measure topdown backend-bound core cycles; it indicates
whether any given execution port has contention.

Type: improvement

Signed-off-by: Ray Kinsella <mdr@ashroe.eu>
Change-Id: I37d1b38c101ac42d51c10fa4452b822d34b729c9

src/plugins/perfmon/CMakeLists.txt
src/plugins/perfmon/intel/bundle/backend_bound_core.c [new file with mode: 0644]
src/plugins/perfmon/intel/core.h

index e262984..05c280f 100644 (file)
@@ -24,6 +24,7 @@ add_vpp_plugin(perfmon
   intel/core.c
   intel/uncore.c
   intel/bundle/backend_bound_mem.c
+  intel/bundle/backend_bound_core.c
   intel/bundle/inst_and_clock.c
   intel/bundle/load_blocks.c
   intel/bundle/mem_bw.c
diff --git a/src/plugins/perfmon/intel/bundle/backend_bound_core.c b/src/plugins/perfmon/intel/bundle/backend_bound_core.c
new file mode 100644 (file)
index 0000000..1690523
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <perfmon/perfmon.h>
+#include <perfmon/intel/core.h>
+
+/*
+ * Indices into perfmon_node_stats_t.value[] for this bundle. Enumerators
+ * are numbered implicitly from 0 and must stay in the same order as the
+ * .events[] slots of the bundle registration in this file.
+ */
+enum
+{
+  PORT0,       /* 0: events[0], uops dispatched on port 0 */
+  PORT1,       /* 1: events[1], uops dispatched on port 1 */
+  PORT5,       /* 2: events[2], uops dispatched on port 5 */
+  PORT6,       /* 3: events[3], uops dispatched on port 6 */
+  PORT2_3,     /* 4: events[4], uops dispatched on ports 2 and 3 */
+  PORT4_9,     /* 5: events[5], uops dispatched on ports 4 and 9 */
+  PORT7_8,     /* 6: events[6], uops dispatched on ports 7 and 8 */
+  DISTRIBUTED, /* 7: events[7], CPU_CLK_UNHALTED.DISTRIBUTED cycles */
+};
+
+/*
+ * Format one table cell of the "td-backend-core" bundle.
+ *
+ * Variadic arguments, read in this order:
+ *   perfmon_node_stats_t *ss - counter values collected for one node
+ *   int row                  - cell selector:
+ *                                0    cycles per packet
+ *                                1-4  % utilisation of port 0 / 1 / 5 / 6
+ *                                5    % load  (ports 2,3 - two ports)
+ *                                6    % store (ports 4,9,7,8 - four ports)
+ *
+ * Returns s with the formatted cell appended; s is returned untouched when
+ * the node has processed no packets. Any other row value prints as 0%.
+ */
+static u8 *
+format_intel_backend_bound_core (u8 *s, va_list *args)
+{
+  perfmon_node_stats_t *ss = va_arg (*args, perfmon_node_stats_t *);
+  int row = va_arg (*args, int);
+  f64 uops = 0, n_ports = 1, sv = 0;
+
+  if (!ss->n_packets)
+    return s;
+
+  if (0 == row)
+    {
+      /* divide in floating point, like the percentage rows below, so the
+       * quotient is not truncated before it reaches the formatter */
+      sv = ss->value[DISTRIBUTED] / (f64) ss->n_packets;
+
+      s = format (s, "%.0f", sv);
+      return s;
+    }
+
+  /* pick the uop count and the number of ports it was collected over */
+  switch (row)
+    {
+    case 1:
+      uops = ss->value[PORT0];
+      break;
+    case 2:
+      uops = ss->value[PORT1];
+      break;
+    case 3:
+      uops = ss->value[PORT5];
+      break;
+    case 4:
+      uops = ss->value[PORT6];
+      break;
+    case 5:
+      uops = ss->value[PORT2_3];
+      n_ports = 2;
+      break;
+    case 6:
+      uops = ss->value[PORT4_9] + ss->value[PORT7_8];
+      n_ports = 4;
+      break;
+    default:
+      /* unknown row: leave uops at 0 so the cell reads 0% */
+      break;
+    }
+
+  /* no cycles counted: report 0% instead of dividing by zero */
+  if (ss->value[DISTRIBUTED])
+    sv = uops / (n_ports * (f64) ss->value[DISTRIBUTED]);
+
+  sv = clib_max (sv * 100, 0);
+  s = format (s, "%04.1f", sv);
+
+  return s;
+}
+
+/*
+ * CPU requirements for this bundle: a single entry pairing the
+ * clib_cpu_supports_avx512_bitalg check with the per-node bundle type.
+ * NOTE(review): the AVX512 BITALG check is presumably a proxy for the
+ * core generation that implements these port events - confirm.
+ */
+static perfmon_cpu_supports_t backend_bound_core_cpu_supports[] = {
+  { clib_cpu_supports_avx512_bitalg, PERFMON_BUNDLE_TYPE_NODE },
+};
+
+/*
+ * Registration of the "td-backend-core" bundle on the "intel-core" source.
+ *
+ * The order of .events[] must match the index enum at the top of this file
+ * (PORT0, PORT1, PORT5, PORT6, PORT2_3, PORT4_9, PORT7_8, DISTRIBUTED):
+ * format_intel_backend_bound_core reads the counters by those indices.
+ * The seven column headers correspond to rows 0-6 of that formatter.
+ *
+ * NOTE(review): the trailing 0xFF comments presumably record which
+ * counters each event may be scheduled on - confirm.
+ */
+PERFMON_REGISTER_BUNDLE (intel_core_backend_bound_core) = {
+  .name = "td-backend-core",
+  .description = "Topdown BackEnd-bound Core - % cycles core resources busy",
+  .source = "intel-core",
+  .events[0] = INTEL_CORE_E_UOPS_DISPATCHED_PORT_0,      /* 0xFF */
+  .events[1] = INTEL_CORE_E_UOPS_DISPATCHED_PORT_1,      /* 0xFF */
+  .events[2] = INTEL_CORE_E_UOPS_DISPATCHED_PORT_5,      /* 0xFF */
+  .events[3] = INTEL_CORE_E_UOPS_DISPATCHED_PORT_6,      /* 0xFF */
+  .events[4] = INTEL_CORE_E_UOPS_DISPATCHED_PORT_2_3,    /* 0xFF */
+  .events[5] = INTEL_CORE_E_UOPS_DISPATCHED_PORT_4_9,    /* 0xFF */
+  .events[6] = INTEL_CORE_E_UOPS_DISPATCHED_PORT_7_8,    /* 0xFF */
+  .events[7] = INTEL_CORE_E_CPU_CLK_UNHALTED_DISTRIBUTED, /* 0xFF */
+  .n_events = 8,
+  .format_fn = format_intel_backend_bound_core,
+  .cpu_supports = backend_bound_core_cpu_supports,
+  .n_cpu_supports = ARRAY_LEN (backend_bound_core_cpu_supports),
+  .column_headers = PERFMON_STRINGS ("Clocks/Packet", "%Port0", "%Port1",
+                                    "%Port5", "%Port6", "%Load", "%Store"),
+};
index 31daf27..971dc34 100644 (file)
   _ (0x9C, 0x01, 0, 0, 0, 0x00, IDQ_UOPS_NOT_DELIVERED, CORE,                 \
      "Uops not delivered to Resource Allocation Table (RAT) per thread when " \
      "backend of the machine is not stalled")                                 \
+  _ (0xA1, 0x01, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_0,                      \
+     "Number of uops executed on port 0")                                     \
+  _ (0xA1, 0x02, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_1,                      \
+     "Number of uops executed on port 1")                                     \
+  _ (0xA1, 0x04, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_2_3,                    \
+     "Number of uops executed on port 2 and 3")                               \
+  _ (0xA1, 0x10, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_4_9,                    \
+     "Number of uops executed on port 4 and 9")                               \
+  _ (0xA1, 0x20, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_5,                      \
+     "Number of uops executed on port 5")                                     \
+  _ (0xA1, 0x40, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_6,                      \
+     "Number of uops executed on port 6")                                     \
+  _ (0xA1, 0x80, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_7_8,                    \
+     "Number of uops executed on port 7 and 8")                               \
   _ (0xA2, 0x08, 0, 0, 0, 0x00, RESOURCE_STALLS, SB,                          \
      "Counts allocation stall cycles caused by the store buffer (SB) being "  \
      "full. This counts cycles that the pipeline back-end blocked uop "       \
      "Counts the total number when the front end is resteered, mainly when "  \
      "the BPU cannot provide a correct prediction and this is corrected by "  \
      "other branch handling mechanisms at the front end.")                    \
+  _ (0xEC, 0x02, 0, 0, 0, 0x00, CPU_CLK_UNHALTED, DISTRIBUTED,                \
+     "Cycle counts are evenly distributed between active threads in the "     \
+     " Core")                                                                 \
   _ (0xF0, 0x40, 0, 0, 0, 0x00, L2_TRANS, L2_WB,                              \
      "L2 writebacks that access L2 cache")                                    \
   _ (0xF1, 0x1F, 0, 0, 0, 0x00, L2_LINES_IN, ALL,                             \