From 3ff919f8eb9fa7eb98887f029be7f817de7a1303 Mon Sep 17 00:00:00 2001
From: Peter Mikus <pmikus@cisco.com>
Date: Tue, 19 Nov 2019 12:00:57 +0000
Subject: [PATCH] Telemetry: Add more operational data

+ Add both NDR and PDR telemetry capture
+ Speedup sockets
+ Adjust privileges

Signed-off-by: Peter Mikus <pmikus@cisco.com>
Change-Id: Ia6fd5d405e6fb410651d8b705c921653753aea10
---
 resources/libraries/python/PapiExecutor.py         |  16 ++-
 resources/libraries/python/VPPUtil.py              |  16 ++-
 resources/libraries/python/VppCounters.py          |  46 ++++---
 .../robot/performance/performance_utils.robot      | 139 +++++----------------
 .../libraries/robot/shared/test_teardown.robot     |   2 +
 resources/tools/presentation/input_data_parser.py  |   3 +-
 6 files changed, 84 insertions(+), 138 deletions(-)

diff --git a/resources/libraries/python/PapiExecutor.py b/resources/libraries/python/PapiExecutor.py
index cbb3e28603..8308303b8a 100644
--- a/resources/libraries/python/PapiExecutor.py
+++ b/resources/libraries/python/PapiExecutor.py
@@ -89,7 +89,6 @@ class PapiSocketExecutor:
     The reconnection is logged at WARN level, so it is prominently shown
     in log.html, so we can see how frequently it happens.
 
-    TODO: Support sockets in NFs somehow.
     TODO: Support handling of retval!=0 without try/except in caller.
 
     Note: Use only with "with" statement, e.g.:
@@ -222,6 +221,7 @@ class PapiSocketExecutor:
         :returns: self
         :rtype: PapiSocketExecutor
         """
+        time_enter = time.time()
         # Parsing takes longer than connecting, prepare instance before tunnel.
         vpp_instance = self.vpp_instance
         node = self._node
@@ -244,16 +244,11 @@ class PapiSocketExecutor:
         # Even if ssh can perhaps reuse this file,
         # we need to remove it for readiness detection to work correctly.
         run([u"rm", u"-rvf", self._local_vpp_socket])
-        # On VIRL, the ssh user is not added to "vpp" group,
-        # so we need to change remote socket file access rights.
-        exec_cmd_no_error(
-            node, u"chmod o+rwx " + self._remote_vpp_socket, sudo=True
-        )
-        # We use sleep command. The ssh command will exit in 10 second,
+        # We use sleep command. The ssh command will exit in 30 seconds,
         # unless a local socket connection is established,
         # in which case the ssh command will exit only when
         # the ssh connection is closed again (via control socket).
-        # The log level is to supress "Warning: Permanently added" messages.
+        # The log level is to suppress "Warning: Permanently added" messages.
         ssh_cmd = [
             u"ssh", u"-S", ssh_socket, u"-M",
             u"-o", u"LogLevel=ERROR", u"-o", u"UserKnownHostsFile=/dev/null",
@@ -261,7 +256,7 @@ class PapiSocketExecutor:
             u"-o", u"ExitOnForwardFailure=yes",
             u"-L", self._local_vpp_socket + u":" + self._remote_vpp_socket,
             u"-p", str(node[u"port"]), node[u"username"] + u"@" + node[u"host"],
-            u"sleep", u"10"
+            u"sleep", u"30"
         ]
         priv_key = node.get(u"priv_key")
         if priv_key:
@@ -311,6 +306,9 @@ class PapiSocketExecutor:
                 break
         else:
             raise RuntimeError(u"Failed to connect to VPP over a socket.")
+        logger.trace(
+            f"Establishing socket connection took {time.time()-time_enter}s"
+        )
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
diff --git a/resources/libraries/python/VPPUtil.py b/resources/libraries/python/VPPUtil.py
index 7dabb4fc61..865775f995 100644
--- a/resources/libraries/python/VPPUtil.py
+++ b/resources/libraries/python/VPPUtil.py
@@ -116,6 +116,18 @@ class VPPUtil:
         cmd = u"command -v vpp"
         exec_cmd_no_error(node, cmd, message=u"VPP is not installed!")
 
+    @staticmethod
+    def adjust_privileges(node):
+        """Recursively chmod o+rwx /run/vpp to control VPP without sudo.
+
+        :param node: Topology node.
+        :type node: dict
+        """
+        cmd = u"chmod -R o+rwx /run/vpp"
+        exec_cmd_no_error(
+            node, cmd, sudo=True, message=u"Failed to adjust privileges!",
+            retries=120)
+
     @staticmethod
     def verify_vpp_started(node):
         """Verify that VPP is started on the specified topology node.
@@ -137,7 +149,7 @@ class VPPUtil:
     @staticmethod
     def verify_vpp(node):
         """Verify that VPP is installed and started on the specified topology
-        node.
+        node. Adjust privileges so user can connect without sudo.
 
         :param node: Topology node.
         :type node: dict
@@ -147,6 +159,8 @@ class VPPUtil:
         try:
             # Verify responsiveness of vppctl.
             VPPUtil.verify_vpp_started(node)
+            # Adjust privileges.
+            VPPUtil.adjust_privileges(node)
             # Verify responsiveness of PAPI.
             VPPUtil.show_log(node)
             VPPUtil.vpp_show_version(node)
diff --git a/resources/libraries/python/VppCounters.py b/resources/libraries/python/VppCounters.py
index e6bb51ef4e..411302ae58 100644
--- a/resources/libraries/python/VppCounters.py
+++ b/resources/libraries/python/VppCounters.py
@@ -114,7 +114,7 @@ class VppCounters:
                     )
 
     @staticmethod
-    def vpp_show_runtime_counters_on_all_duts(nodes):
+    def vpp_show_runtime_on_all_duts(nodes):
         """Clear VPP runtime counters on all DUTs.
 
         :param nodes: VPP nodes.
@@ -125,8 +125,8 @@ class VppCounters:
                 VppCounters.vpp_show_runtime(node)
 
     @staticmethod
-    def vpp_show_hardware_verbose(node):
-        """Run "show hardware-interfaces verbose" debug CLI command.
+    def vpp_show_interface(node):
+        """Run "show interface" debug CLI command.
 
         :param node: Node to run command on.
         :type node: dict
@@ -148,6 +148,17 @@ class VppCounters:
             node, u"show memory verbose api-segment stats-segment main-heap"
         )
 
+    @staticmethod
+    def vpp_show_memory_on_all_duts(nodes):
+        """Call vpp_show_memory() for every node whose type is DUT.
+
+        :param nodes: Nodes to iterate; only those of type DUT are queried.
+        :type nodes: dict
+        """
+        for node in nodes.values():
+            if node[u"type"] == NodeType.DUT:
+                VppCounters.vpp_show_memory(node)
+
     @staticmethod
     def vpp_clear_runtime(node):
         """Run "clear runtime" CLI command.
@@ -160,7 +171,7 @@ class VppCounters:
         )
 
     @staticmethod
-    def vpp_clear_runtime_counters_on_all_duts(nodes):
+    def vpp_clear_runtime_on_all_duts(nodes):
         """Run "clear runtime" CLI command on all DUTs.
 
         :param nodes: VPP nodes.
@@ -171,8 +182,8 @@ class VppCounters:
                 VppCounters.vpp_clear_runtime(node)
 
     @staticmethod
-    def vpp_clear_hardware_counters(node):
-        """Run "clear hardware" CLI command.
+    def vpp_clear_interfaces(node):
+        """Run "clear interfaces" CLI command.
 
         :param node: Node to run command on.
         :type node: dict
@@ -180,22 +191,22 @@ class VppCounters:
         :rtype: dict
         """
         PapiSocketExecutor.run_cli_cmd_on_all_sockets(
-            node, u"clear hardware", log=False
+            node, u"clear interfaces", log=False
         )
 
     @staticmethod
-    def vpp_clear_hardware_counters_on_all_duts(nodes):
-        """Clear hardware counters on all DUTs.
+    def vpp_clear_interfaces_on_all_duts(nodes):
+        """Clear interfaces on all DUTs.
 
         :param nodes: VPP nodes.
         :type nodes: dict
         """
         for node in nodes.values():
             if node[u"type"] == NodeType.DUT:
-                VppCounters.vpp_clear_hardware_counters(node)
+                VppCounters.vpp_clear_interfaces(node)
 
     @staticmethod
-    def vpp_clear_errors_counters(node):
+    def vpp_clear_errors(node):
         """Run "clear errors" CLI command.
 
         :param node: Node to run command on.
@@ -206,7 +217,7 @@ class VppCounters:
         )
 
     @staticmethod
-    def vpp_clear_error_counters_on_all_duts(nodes):
+    def vpp_clear_errors_on_all_duts(nodes):
         """Clear VPP errors counters on all DUTs.
 
         :param nodes: VPP nodes.
@@ -214,7 +225,7 @@ class VppCounters:
         """
         for node in nodes.values():
             if node[u"type"] == NodeType.DUT:
-                VppCounters.vpp_clear_errors_counters(node)
+                VppCounters.vpp_clear_errors(node)
 
     @staticmethod
     def show_vpp_statistics(node):
@@ -224,9 +235,7 @@ class VppCounters:
         :type node: dict
         """
         VppCounters.vpp_show_errors(node)
-        VppCounters.vpp_show_hardware_verbose(node)
-        VppCounters.vpp_show_runtime(node)
-        VppCounters.vpp_show_memory(node)
+        VppCounters.vpp_show_interface(node)
 
     @staticmethod
     def show_statistics_on_all_duts(nodes):
@@ -246,9 +255,8 @@ class VppCounters:
         :param node: VPP node.
         :type node: dict
         """
-        VppCounters.vpp_clear_errors_counters(node)
-        VppCounters.vpp_clear_hardware_counters(node)
-        VppCounters.vpp_clear_runtime(node)
+        VppCounters.vpp_clear_errors(node)
+        VppCounters.vpp_clear_interfaces(node)
 
     @staticmethod
     def clear_statistics_on_all_duts(nodes):
diff --git a/resources/libraries/robot/performance/performance_utils.robot b/resources/libraries/robot/performance/performance_utils.robot
index eac7fe0075..f5e5913fa3 100644
--- a/resources/libraries/robot/performance/performance_utils.robot
+++ b/resources/libraries/robot/performance/performance_utils.robot
@@ -15,10 +15,7 @@
 | Library | Collections
 | Library | resources.libraries.python.topology.Topology
 | Library | resources.libraries.python.NodePath
-| Library | resources.libraries.python.DpdkUtil
 | Library | resources.libraries.python.InterfaceUtil
-| Library | resources.libraries.python.KubernetesUtils
-| Library | resources.libraries.python.VhostUser
 | Library | resources.libraries.python.TrafficGenerator
 | Library | resources.libraries.python.TrafficGenerator.OptimizedSearch
 | Library | resources.libraries.python.TrafficGenerator.TGDropRateSearchImpl
@@ -39,11 +36,6 @@
 | | ... | reported result contains aggregate rates.
 | | ... | Currently, the min_rate value is hardcoded to match test teardowns.
 | |
-| | ... | TODO: Should the trial duration of the additional
-| | ... | measurements be configurable?
-| |
-| | ... | Some inputs are read from variables to streamline suites.
-| |
 | | ... | *Test (or broader scope) variables read:*
 | | ... | - traffic_profile - Name of module defining traffc for measurements.
 | | ... | Type: string
@@ -84,27 +76,16 @@
 | | Check NDRPDR interval validity | ${result.pdr_interval}
 | | ... | ${packet_loss_ratio}
 | | Check NDRPDR interval validity | ${result.ndr_interval}
-| | Perform additional measurements based on NDRPDR result
-| | ... | ${result} | ${frame_size} | ${traffic_profile}
-
-| Display Reconfig Test Message
-| | [Documentation]
-| | ... | Display the number of packets lost (bidirectionally)
-| | ... | due to reconfiguration under traffic.
-| |
-| | ... | *Arguments:*
-| | ... | - result - Result of bidirectional measurtement.
-| | ... | Type: ReceiveRateMeasurement
-| |
-| | ... | *Example:*
-| |
-| | ... | \| Display Reconfig Test Message \| \${result} \|
-| |
-| | [Arguments] | ${result}
-| |
-| | Set Test Message | Packets lost due to reconfig: ${result.loss_count}
-| | ${time_lost} = | Evaluate | ${result.loss_count} / ${result.target_tr}
-| | Set Test Message | ${\n}Implied time lost: ${time_lost} | append=yes
+| | ${rate_sum}= | Set Variable | ${result.ndr_interval.measured_low.target_tr}
+| | ${rate_per_stream}= | Evaluate | ${rate_sum} / float(${traffic_directions})
+| | Send traffic at specified rate
+| | ... | ${2.0} | ${rate_per_stream}pps | ${framesize} | ${traffic_profile}
+| | ... | traffic_directions=${traffic_directions}
+| | ${rate_sum}= | Set Variable | ${result.pdr_interval.measured_low.target_tr}
+| | ${rate_per_stream}= | Evaluate | ${rate_sum} / float(${traffic_directions})
+| | Send traffic at specified rate
+| | ... | ${2.0} | ${rate_per_stream}pps | ${framesize} | ${traffic_profile}
+| | ... | traffic_directions=${traffic_directions}
 
 | Find Throughput Using MLRsearch
 | | [Documentation]
@@ -113,11 +94,6 @@
 | | ... | Input rates are understood as uni-directional.
 | | ... | Currently, the min_rate value is hardcoded to match test teardowns.
 | |
-| | ... | TODO: Should the trial duration of the additional
-| | ... | measurements be configurable?
-| |
-| | ... | Some inputs are read from variables to streamline suites.
-| |
 | | ... | *Test (or broader scope) variables read:*
 | | ... | - traffic_profile - Name of module defining traffc for measurements.
 | | ... | Type: string
@@ -168,7 +144,6 @@
 | | ... | Input rates are understood as uni-directional,
 | | ... | reported result contains aggregate rates.
 | | ... | Currently, the min_rate value is hardcoded to match test teardowns.
-| | ... | Some inputs are read from variables to streamline suites.
 | |
 | | ... | *Test (or broader scope) variables read:*
 | | ... | - traffic_profile - Name of module defining traffc for measurements.
@@ -232,6 +207,25 @@
 | | Set Test Message | ${\n}LATENCY [min/avg/max/hdrh] per stream: ${latency}
 | | ... | append=yes
 
+| Display Reconfig Test Message
+| | [Documentation]
+| | ... | Display the number of packets lost (bidirectionally)
+| | ... | due to reconfiguration under traffic.
+| | ...
+| | ... | *Arguments:*
+| | ... | - result - Result of bidirectional measurement.
+| | ... |   Type: ReceiveRateMeasurement
+| | ...
+| | ... | *Example:*
+| | ...
+| | ... | \| Display Reconfig Test Message \| \${result} \|
+| | ...
+| | [Arguments] | ${result}
+| | ...
+| | Set Test Message | Packets lost due to reconfig: ${result.loss_count}
+| | ${time_lost} = | Evaluate | ${result.loss_count} / ${result.target_tr}
+| | Set Test Message | ${\n}Implied time lost: ${time_lost} | append=yes
+
 | Display result of NDRPDR search
 | | [Documentation]
 | | ... | Display result of NDR+PDR search, both quantities, both bounds,
@@ -275,8 +269,6 @@
 | | ... | Sum of measured rates over streams
 | | ... | Bandwidth is calculated as:
 | | ... | (Throughput * (L2 Frame Size + IPG) * 8)
-| | ... | TODO: Do we want to report some latency data,
-| | ... | even if not measured at the reported bounds?.
 | |
 | | ... | *Test (or broader scope) variables read:*
 | | ... | - frame_size - L2 Frame Size [B] or IMIX string. Type: int or str
@@ -330,69 +322,10 @@
 | | ... | ${message}${\n}${message_zero} | ${message}${\n}${message_other}
 | | Fail | ${message}
 
-| Perform additional measurements based on NDRPDR result
-| | [Documentation]
-| | ... | Perform any additional measurements which are not directly needed
-| | ... | for determining NDR nor PDR, but which are needed for gathering
-| | ... | additional data for debug purposes.
-| | ... | Currently, just "Traffic should pass with no loss" is called.
-| | ... | TODO: Move latency measurements from optimized search here.
-| |
-| | ... | *Arguments:*
-| | ... | - result - Measured result data per stream [pps]. Type: NdrPdrResult
-| | ... | - frame_size - L2 Frame Size [B] or IMIX string. Type: int or str
-| | ... | - traffic_profile - Topology profile. Type: string
-| | ... | - traffic_directions - Bi- (2) or uni- (1) directional traffic.
-| | ... | Type: int
-| |
-| | ... | *Example:*
-| | ... | \| Perform additional measurements based on NDRPDR result \
-| | ... | \| \${result} \| \${64} \| 3-node-IPv4 \| \${2} \|
-| |
-| | [Arguments] | ${result} | ${framesize} | ${traffic_profile}
-| | ... | ${traffic_directions}=${2}
-| |
-| | ${duration}= | Set Variable | ${2.0}
-| | ${rate_sum}= | Set Variable | ${result.ndr_interval.measured_low.target_tr}
-| | ${rate_per_stream}= | Evaluate | ${rate_sum} / float(${traffic_directions})
-| | Traffic should pass with no loss | ${duration} | ${rate_per_stream}pps
-| | ... | ${framesize} | ${traffic_profile} | fail_on_loss=${False}
-| | ... | traffic_directions=${traffic_directions}
-
-| Traffic should pass with no loss
-| | [Documentation]
-| | ... | Send traffic at specified rate. No packet loss is accepted at loss
-| | ... | evaluation.
-| |
-| | ... | *Arguments:*
-| | ... | - duration - Duration of traffic run [s]. Type: integer
-| | ... | - rate - Rate for sending packets. Type: string
-| | ... | - frame_size - L2 Frame Size [B] or IMIX_v4_1. Type: integer/string
-| | ... | - traffic_profile - Name of module defining traffc for measurements.
-| | ... | Type: string
-| | ... | - fail_on_loss - If True, the keyword fails if loss occurred.
-| | ... | Type: boolean
-| | ... | - traffic_directions - Bi- (2) or uni- (1) directional traffic.
-| | ... | Type: int
-| |
-| | ... | *Example:*
-| |
-| | ... | \| Traffic should pass with no loss \| \${10} \| 4.0mpps \| \${64} \
-| | ... | \| 3-node-IPv4 \| \${2} \|
-| |
-| | [Arguments] | ${duration} | ${rate} | ${frame_size} | ${traffic_profile}
-| | ... | ${fail_on_loss}=${True} | ${traffic_directions}=${2}
-| |
-| | Send traffic at specified rate | ${duration} | ${rate} | ${frame_size}
-| | ... | ${traffic_profile} | traffic_directions=${traffic_directions}
-| | Run Keyword If | ${fail_on_loss} | No traffic loss occurred
-
 | Traffic should pass with maximum rate
 | | [Documentation]
 | | ... | Send traffic at maximum rate.
 | |
-| | ... | Some inputs are read from variables to streamline suites.
-| |
 | | ... | *Test (or broader scope) variables read:*
 | | ... | - traffic_profile - Name of module defining traffc for measurements.
 | | ... | Type: string
@@ -424,8 +357,6 @@
 | | Set Test Message | ${\n}Maximum Receive Rate trial results
 | | Set Test Message | in packets per second: ${results}
 | | ... | append=yes
-| | # TODO: Should we also report the percentage relative to transmit rate,
-| | # so that people looking at console can decide how close to 100% it is?
 | | Run Keyword If | ${fail_no_traffic} | Fail if no traffic forwarded
 
 | Send traffic at specified rate
@@ -516,21 +447,17 @@
 | | ... | traffic_directions=${traffic_directions} | tx_port=${tx_port}
 | | ... | rx_port=${rx_port}
 | | Run Keyword If | ${dut_stats}==${True}
-| | ... | VPP clear runtime counters on all DUTs | ${nodes}
+| | ... | VPP clear runtime on all DUTs | ${nodes}
 | | Sleep | ${duration}
 | | Run Keyword If | ${dut_stats}==${True}
-| | ... | VPP show runtime counters on all DUTs | ${nodes}
+| | ... | VPP show runtime on all DUTs | ${nodes}
 | | Stop traffic on tg
 
 | Start Traffic on Background
 | | [Documentation]
 | | ... | Start traffic at specified rate then return control to Robot.
-| |
 | | ... | This keyword is useful if the test needs to do something
 | | ... | while traffic is running.
-| | ... | Just a wrapper around L1 keyword.
-| | ... |
-| | ... | TODO: How to make sure the traffic is stopped on any failure?
 | |
 | | ... | *Test (or broader scope) variables read:*
 | | ... | - traffic_profile - Name of module defining traffc for measurements.
@@ -562,10 +489,6 @@
 | | ... | Stop the running traffic, return measurement result.
 | | ... | For bidirectional traffic, the reported values are bi-directional.
 | |
-| | ... | Just a wrapper around L1 keyword.
-| | ... |
-| | ... | TODO: Tolerate if traffic was not started.
-| |
 | | ... | *Returns:*
 | | ... | - Measurement result. Type: ReceiveRateMeasurement
 | |
diff --git a/resources/libraries/robot/shared/test_teardown.robot b/resources/libraries/robot/shared/test_teardown.robot
index 96d3cd825b..fe0dda2f19 100644
--- a/resources/libraries/robot/shared/test_teardown.robot
+++ b/resources/libraries/robot/shared/test_teardown.robot
@@ -37,6 +37,8 @@
 | | ... | Get Core Files on All Nodes | ${nodes}
 | | Run Keyword If Test Failed
 | | ... | Verify VPP PID in Teardown
+| | Run Keyword If Test Failed
+| | ... | VPP Show Memory On All DUTs | ${nodes}
 | | FOR | ${action} | IN | @{actions}
 | | | Run Keyword | Additional Test Tear Down Action For ${action}
 | | END
diff --git a/resources/tools/presentation/input_data_parser.py b/resources/tools/presentation/input_data_parser.py
index 2f126f4c8b..aabb0c5d1b 100644
--- a/resources/tools/presentation/input_data_parser.py
+++ b/resources/tools/presentation/input_data_parser.py
@@ -922,7 +922,8 @@ class ExecutionChecker(ResultVisitor):
         :type test_kw: Keyword
         :returns: Nothing.
         """
-        if test_kw.name.count(u"Show Runtime Counters On All Duts"):
+        if (test_kw.name.count(u"Show Runtime On All Duts") or
+                test_kw.name.count(u"Show Runtime Counters On All Duts")):
             self._lookup_kw_nr += 1
             self._show_run_lookup_nr = 0
             self._msg_type = u"test-show-runtime"
-- 
2.16.6