perfmon: add membw-bound bundle 88/33688/4
author Ray Kinsella <mdr@ashroe.eu>
Wed, 9 Jun 2021 10:35:26 +0000 (11:35 +0100)
committer Damjan Marion <dmarion@me.com>
Wed, 8 Sep 2021 14:30:03 +0000 (14:30 +0000)
Added memory bandwidth boundedness bundle, closely related to cache-hierarchy.
This bundle works on ICX only, due to an ICX specific counter.

Type: improvement

Signed-off-by: Ray Kinsella <mdr@ashroe.eu>
Change-Id: Id385bd5f4e645ac020774e311c623afb64b79b1e

src/plugins/perfmon/CMakeLists.txt
src/plugins/perfmon/intel/bundle/membw_bound.c [new file with mode: 0644]
src/plugins/perfmon/intel/core.h
src/plugins/perfmon/perfmon.h

index af0bd3c..6b8e7c8 100644 (file)
@@ -24,6 +24,7 @@ add_vpp_plugin(perfmon
   table.c
   intel/core.c
   intel/uncore.c
+  intel/bundle/membw_bound.c
   intel/bundle/inst_and_clock.c
   intel/bundle/load_blocks.c
   intel/bundle/mem_bw.c
diff --git a/src/plugins/perfmon/intel/bundle/membw_bound.c b/src/plugins/perfmon/intel/bundle/membw_bound.c
new file mode 100644 (file)
index 0000000..2e4b4aa
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <perfmon/perfmon.h>
+#include <perfmon/intel/core.h>
+
+/* Format callback: appends the per-packet average of counter 'row' to s.
+ * Reads a perfmon_node_stats_t * and an int row from args; s is returned
+ * untouched when n_packets is zero. */
+static u8 *
+format_intel_membw_bound (u8 *s, va_list *args)
+{
+  perfmon_node_stats_t *ss = va_arg (*args, perfmon_node_stats_t *);
+  int row = va_arg (*args, int);
+
+  if (!ss->n_packets)
+    return s;
+
+  /* Divide in floating point so the average is rounded by "%5.0f" rather
+   * than truncated beforehand, should the counters be integer typed. */
+  return format (s, "%5.0f", (f64) ss->value[row] / ss->n_packets);
+}
+
+/* CPU feature gate for the bundle: a single entry pairing the
+ * clib_cpu_supports_avx512_bitalg check with PERFMON_BUNDLE_TYPE_NODE.
+ * NOTE(review): presumably the AVX512 BITALG check serves as a stand-in
+ * for ICX detection - confirm. */
+static perfmon_cpu_supports_t membw_bound_cpu_supports[] = {
+  { clib_cpu_supports_avx512_bitalg, PERFMON_BUNDLE_TYPE_NODE },
+};
+
+/*
+ * Registers the "membw-bound" bundle on the "intel-core" source: seven core
+ * events whose values are rendered by format_intel_membw_bound.
+ * column_headers lists one label per event, in the same order as events[].
+ * NOTE(review): the trailing "CMask" remarks differ from the sixth field of
+ * the matching CYCLE_ACTIVITY entries in intel/core.h (e.g. 0x04 for
+ * CYCLES_NO_EXECUTE); presumably they describe something else, such as the
+ * usable counters - confirm.
+ */
+PERFMON_REGISTER_BUNDLE (intel_core_membw_bound) = {
+  .name = "membw-bound",
+  .description = "memory bandwidth boundedness",
+  .source = "intel-core",
+  .events[0] = INTEL_CORE_E_CPU_CLK_UNHALTED_THREAD_P,       /* FIXED */
+  .events[1] = INTEL_CORE_E_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE, /*CMask: 0xFF*/
+  .events[2] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_MEM_ANY,    /*CMask: 0xFF*/
+  .events[3] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_L1D_MISS,   /*CMask: 0xF*/
+  .events[4] = INTEL_CORE_E_L1D_PEND_MISS_FB_FULL,           /*CMask: 0xF*/
+  .events[5] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_L3_MISS,    /*CMask: 0xF*/
+  .events[6] = INTEL_CORE_E_SQ_MISC_SQ_FULL,                 /*CMask: 0xF*/
+  .n_events = 7,
+  .format_fn = format_intel_membw_bound,
+  .cpu_supports = membw_bound_cpu_supports,
+  .n_cpu_supports = ARRAY_LEN (membw_bound_cpu_supports),
+  .column_headers = PERFMON_STRINGS ("Cycles/Packet", "Cycles Stall/Packet",
+                                    "Mem Stall/Packet",
+                                    "L1D Miss Stall/Packet", "FB Full/Packet",
+                                    "L3 Miss Stall/Packet", "SQ Full/Packet"),
+};
index a6a5269..0e29022 100644 (file)
   _ (0x9C, 0x01, 0, 0, 0, 0x00, IDQ_UOPS_NOT_DELIVERED, CORE,                 \
      "Uops not delivered to Resource Allocation Table (RAT) per thread when " \
      "backend of the machine is not stalled")                                 \
+  _ (0xA2, 0x08, 0, 0, 0, 0x00, RESOURCE_STALLS, SB,                          \
+     "Counts allocation stall cycles caused by the store buffer (SB) being "  \
+     "full. This counts cycles that the pipeline back-end blocked uop "       \
+     "delivery "                                                              \
+     "from the front-end.")                                                   \
+  _ (0xA3, 0x04, 0, 0, 0, 0x04, CYCLE_ACTIVITY, CYCLES_NO_EXECUTE,            \
+     "This event counts cycles during which no instructions were executed in" \
+     " the execution stage of the pipeline.")                                 \
+  _ (0xA3, 0x05, 0, 0, 0, 0x05, CYCLE_ACTIVITY, STALLS_L2_MISS,               \
+     "Execution stalls while L2 cache miss demand load is outstanding")       \
+  _ (0xA3, 0x06, 0, 0, 0, 0x06, CYCLE_ACTIVITY, STALLS_L3_MISS,               \
+     "Execution stalls while L3 cache miss demand load is outstanding")       \
+  _ (0xA3, 0x0C, 0, 0, 0, 0x0C, CYCLE_ACTIVITY, STALLS_L1D_MISS,              \
+     "Execution stalls while L1 cache miss demand load is outstanding")       \
+  _ (0xA3, 0x14, 0, 0, 0, 0x14, CYCLE_ACTIVITY, STALLS_MEM_ANY,               \
+     "Execution stalls while memory subsystem has an outstanding load.")      \
   _ (0xC0, 0x00, 0, 0, 0, 0x00, INST_RETIRED, ANY_P,                          \
      "Number of instructions retired. General Counter - architectural event") \
   _ (0xC2, 0x02, 0, 0, 0, 0x00, UOPS_RETIRED, RETIRE_SLOTS,                   \
      "L2 writebacks that access L2 cache")                                    \
   _ (0xF1, 0x1F, 0, 0, 0, 0x00, L2_LINES_IN, ALL,                             \
      "L2 cache lines filling L2")                                             \
+  _ (0xF4, 0x04, 0, 0, 0, 0x00, SQ_MISC, SQ_FULL,                             \
+     "Counts the cycles for which the thread is active and the superQ cannot" \
+     " take any more entries.")                                               \
   _ (0xFE, 0x02, 0, 0, 0, 0x00, IDI_MISC, WB_UPGRADE,                         \
      "Counts number of cache lines that are allocated and written back to L3" \
      " with the intention that they are more likely to be reused shortly")    \
index 967b92e..ffcf2fd 100644 (file)
@@ -23,7 +23,7 @@
 #include <vppinfra/cpu.h>
 #include <vlib/vlib.h>
 
-#define PERF_MAX_EVENTS 7 /* 3 fixed and 4 programmable */
+#define PERF_MAX_EVENTS 8 /* 4 fixed and 8 programmable on ICX */
 
 typedef enum
 {