STL traffic: Use the sleep+stop logic 84/31284/96
authorVratko Polak <vrpolak@cisco.com>
Thu, 25 Mar 2021 18:26:35 +0000 (19:26 +0100)
committerVratko Polak <vrpolak@cisco.com>
Fri, 26 Mar 2021 13:32:10 +0000 (14:32 +0100)
+ PPS limit for AWS set to 1.2 Mpps.
+ The logic is very similar to the one in the ASTF driver.
 + This helps for testbeds with high duration stretching (e.g. AWS).
 + Difference: No transaction scale, and we deal with floats.
+ Update loss counting to count unsent packets as lost.
 + Also count "unsent" transactions for other transaction types.
 + If nonzero, log the number of unsent packets/transactions.
+ Make STL and ASTF time overhead constant (called delay) configurable.
+ Subtract delay from approximated_duration, also for ASTF.

Change-Id: I6ee6aa6fba4f110ba1636e1b0ff76cac64383e33
Signed-off-by: Vratko Polak <vrpolak@cisco.com>
GPL/tools/trex/trex_astf_profile.py
GPL/tools/trex/trex_stl_profile.py
resources/libraries/python/Constants.py
resources/libraries/python/TrafficGenerator.py
resources/libraries/robot/performance/performance_utils.robot

index 3f244ec..d08fb61 100644 (file)
@@ -79,6 +79,7 @@ def simple_burst(
         latency,
         async_start=False,
         traffic_directions=2,
+        delay=0.0,
     ):
     """Send traffic and measure packet loss and latency.
 
@@ -98,8 +99,9 @@ def simple_burst(
     Duration details:
     Contrary to stateless mode, ASTF profiles typically limit the number
     of flows/transactions that can happen.
-    The caller is expected to set the duration parameter accordingly to
-    this limit and multiplier, including any overheads.
+    The caller is expected to set the duration parameter to an idealized
+    value, but set the delay argument when TRex is expected
+    to finish processing replies later (including a window for latency).
     See *_traffic_duration output fields for TRex's measurement
     of the real traffic duration (should be without any inactivity overheads).
     If traffic has not ended by the final time, the traffic
@@ -111,11 +113,7 @@ def simple_burst(
 
     :param profile_file: A python module with T-rex traffic profile.
     :param duration: Expected duration for all transactions to finish,
-        assuming only tolerable duration stretching happens.
-        This includes later start of later transactions
-        (according to TPS multiplier) and expected duration of each transaction.
-        Critically, this also includes any delay TRex shows when starting
-        traffic (but not the similar delay during stopping).
+        without any TRex-related delays, without even latency.
     :param framesize: Frame size.
     :param multiplier: Multiplier of profile CPS.
     :param port_0: Port 0 on the traffic generator.
@@ -123,6 +121,7 @@ def simple_burst(
     :param latency: With latency stats.
     :param async_start: Start the traffic and exit.
     :param traffic_directions: Bidirectional (2) or unidirectional (1) traffic.
+    :param delay: Time increase [s] for sleep duration.
     :type profile_file: str
     :type duration: float
     :type framesize: int or str
@@ -132,6 +131,7 @@ def simple_burst(
     :type latency: bool
     :type async_start: bool
     :type traffic_directions: int
+    :type delay: float
     """
     client = None
     total_received = 0
@@ -179,14 +179,12 @@ def simple_burst(
         # Choose CPS and start traffic.
         client.start(
             mult=multiplier,
-            # Increase the input duration slightly,
-            # to ensure it does not end before sleep&stop below happens.
-            duration=duration + 0.1 if duration > 0 else duration,
+            duration=duration,
             nc=True,
             latency_pps=int(multiplier) if latency else 0,
             client_mask=2**len(ports)-1,
         )
-        time_start = time.monotonic()
+        time_stop = time.monotonic() + duration + delay
 
         if async_start:
             # For async stop, we need to export the current snapshot.
@@ -196,8 +194,7 @@ def simple_burst(
                 xsnap1 = client.ports[port_1].get_xstats().reference_stats
                 print(f"Xstats snapshot 1: {xsnap1!r}")
         else:
-            time.sleep(duration)
-
+            time.sleep(duration + delay)
             # Do not block yet, the existing transactions may take long time
             # to finish. We need an action that is almost reset(),
             # but without clearing stats.
@@ -208,7 +205,7 @@ def simple_burst(
             client.stop(block=True)
 
             # Read the stats after the traffic stopped (or time up).
-            stats[time.monotonic() - time_start] = client.get_stats(
+            stats[time.monotonic() - time_stop] = client.get_stats(
                 ports=ports
             )
 
@@ -442,6 +439,10 @@ def main():
         u"--traffic_directions", type=int, default=2,
         help=u"Send bi- (2) or uni- (1) directional traffic."
     )
+    parser.add_argument(
+        u"--delay", required=True, type=float, default=0.0,
+        help=u"Allowed time overhead, sleep time is increased by this [s]."
+    )
 
     args = parser.parse_args()
 
@@ -460,6 +461,7 @@ def main():
         latency=args.latency,
         async_start=args.async_start,
         traffic_directions=args.traffic_directions,
+        delay=args.delay,
     )
 
 
index 8dbc046..4eedc80 100644 (file)
@@ -80,6 +80,7 @@ def simple_burst(
         async_start=False,
         traffic_directions=2,
         force=False,
+        delay=0.0,
     ):
     """Send traffic and measure packet loss and latency.
 
@@ -108,6 +109,7 @@ def simple_burst(
     :param async_start: Start the traffic and exit.
     :param traffic_directions: Bidirectional (2) or unidirectional (1) traffic.
     :param force: Force start regardless of ports state.
+    :param delay: Sleep overhead [s].
     :type profile_file: str
     :type framesize: int or str
     :type duration: float
@@ -118,6 +120,7 @@ def simple_burst(
     :type async_start: bool
     :type traffic_directions: int
     :type force: bool
+    :type delay: float
     """
     client = None
     total_rcvd = 0
@@ -201,18 +204,20 @@ def simple_burst(
                 xsnap1 = client.ports[1].get_xstats().reference_stats
                 print(f"Xstats snapshot 1: {xsnap1!r}")
         else:
-            # Block until done:
             time_start = time.monotonic()
-            client.wait_on_traffic(ports=ports, timeout=duration+30)
+            # wait_on_traffic fails if duration stretches by 30 seconds or more.
+            # TRex has some overhead, wait some more.
+            time.sleep(duration + delay)
+            client.stop()
             time_stop = time.monotonic()
-            approximated_duration = time_stop - time_start
-
+            approximated_duration = time_stop - time_start - delay
+            # Read the stats after the traffic stopped (or time up).
+            stats = client.get_stats()
             if client.get_warnings():
                 for warning in client.get_warnings():
                     print(warning)
-
-            # Read the stats after the test
-            stats = client.get_stats()
+            # Now finish the complete reset.
+            client.reset()
 
             print(u"##### Statistics #####")
             print(json.dumps(stats, indent=4, separators=(u",", u": ")))
@@ -314,6 +319,10 @@ def main():
         u"--force", action=u"store_true", default=False,
         help=u"Force start regardless of ports state."
     )
+    parser.add_argument(
+        u"--delay", required=True, type=float, default=0.0,
+        help=u"Delay assumed for traffic, sleep time is increased by this [s]."
+    )
 
     args = parser.parse_args()
 
@@ -333,6 +342,7 @@ def main():
         async_start=args.async_start,
         traffic_directions=args.traffic_directions,
         force=args.force,
+        delay=args.delay,
     )
 
 
index 8937e6c..14fb795 100644 (file)
@@ -234,7 +234,7 @@ class Constants:
     # Number of trials to execute in MRR test.
     PERF_TRIAL_MULTIPLICITY = get_int_from_env(u"PERF_TRIAL_MULTIPLICITY", 10)
 
-    # Duration of one trial in MRR test.
+    # Duration [s] of one trial in MRR test.
     PERF_TRIAL_DURATION = get_float_from_env(u"PERF_TRIAL_DURATION", 1.0)
 
     # Whether to use latency streams in main search trials.
@@ -244,6 +244,16 @@ class Constants:
     PERF_TRIAL_LATENCY_DURATION = get_float_from_env(
         u"PERF_TRIAL_LATENCY_DURATION", 5.0)
 
+    # For some testbeds TG takes longer than usual to start sending traffic.
+    # This constant [s] allows longer wait, without affecting
+    # the approximate duration. For example, use 0.098 for AWS.
+    PERF_TRIAL_STL_DELAY = get_float_from_env(u"PERF_TRIAL_STL_DELAY", 0.0)
+
+    # ASTF usually needs a different value for the delay.
+    PERF_TRIAL_ASTF_DELAY = get_float_from_env(
+        u"PERF_TRIAL_ASTF_DELAY", 0.112
+    )
+
     # Extended debug (incl. vpp packet trace, linux perf stat, ...).
     # Full list is available as suite variable (__init__.robot) or is
     # override by test.
@@ -300,7 +310,7 @@ class Constants:
         # Vxlan traffic will still show stretching at 36 Mpps (>12%),
         # but we do not care about those tests that much.
         u"Mellanox-CX556A": 36000000, # 148809523,
-        u"Amazon-Nitro-50G": 1500000,
+        u"Amazon-Nitro-50G": 1200000,
         u"virtual": 14880952,
     }
 
index c0d1b4e..e42ad48 100644 (file)
@@ -13,6 +13,7 @@
 
 """Performance testing traffic generator library."""
 
+import math
 import time
 
 from robot.api import logger
@@ -577,7 +578,6 @@ class TrafficGenerator(AbstractMeasurer):
             # so we can compare with what telemetry suggests
             # the real duration was.
             logger.debug(f"Expected duration {computed_duration}")
-            computed_duration += 0.1115
         if not self.duration_limit:
             return computed_duration, True
         limited_duration = min(computed_duration, self.duration_limit)
@@ -652,6 +652,9 @@ class TrafficGenerator(AbstractMeasurer):
         command_line.add_if(u"async_start", async_call)
         command_line.add_if(u"latency", self.use_latency)
         command_line.add_if(u"force", Constants.TREX_SEND_FORCE)
+        command_line.add_with_value(
+            u"delay", Constants.PERF_TRIAL_ASTF_DELAY
+        )
 
         self._start_time = time.monotonic()
         self._rate = multiplier
@@ -759,6 +762,7 @@ class TrafficGenerator(AbstractMeasurer):
         command_line.add_if(u"async_start", async_call)
         command_line.add_if(u"latency", self.use_latency)
         command_line.add_if(u"force", Constants.TREX_SEND_FORCE)
+        command_line.add_with_value(u"delay", Constants.PERF_TRIAL_STL_DELAY)
 
         # TODO: This is ugly. Handle parsing better.
         self._start_time = time.monotonic()
@@ -1244,16 +1248,27 @@ class TrafficGenerator(AbstractMeasurer):
         if not target_duration:
             target_duration = approximated_duration
         transmit_rate = self._rate
+        unsent = 0
         if self.transaction_type == u"packet":
             partial_attempt_count = self._sent
-            expected_attempt_count = self._sent
-            fail_count = self._loss
+            packet_rate = transmit_rate * self.ppta
+            # We have a float. TRex way of rounding it is not obvious.
+            # The biggest source of mismatch is Inter Stream Gap.
+            # So the code tolerates 10 usec of missing packets.
+            expected_attempt_count = (target_duration - 1e-5) * packet_rate
+            expected_attempt_count = math.ceil(expected_attempt_count)
+            # TRex can send more.
+            expected_attempt_count = max(expected_attempt_count, self._sent)
+            unsent = expected_attempt_count - self._sent
+            pass_count = self._received
+            fail_count = expected_attempt_count - pass_count
         elif self.transaction_type == u"udp_cps":
             if not self.transaction_scale:
                 raise RuntimeError(u"Add support for no-limit udp_cps.")
             partial_attempt_count = self._l7_data[u"client"][u"sent"]
             # We do not care whether TG is slow, it should have attempted all.
             expected_attempt_count = self.transaction_scale
+            unsent = expected_attempt_count - partial_attempt_count
             pass_count = self._l7_data[u"client"][u"received"]
             fail_count = expected_attempt_count - pass_count
         elif self.transaction_type == u"tcp_cps":
@@ -1263,6 +1278,7 @@ class TrafficGenerator(AbstractMeasurer):
             partial_attempt_count = ctca
             # We do not care whether TG is slow, it should have attempted all.
             expected_attempt_count = self.transaction_scale
+            unsent = expected_attempt_count - partial_attempt_count
             # From TCP point of view, server/connects counts full connections,
             # but we are testing NAT session so client/connects counts that
             # (half connections from TCP point of view).
@@ -1273,7 +1289,8 @@ class TrafficGenerator(AbstractMeasurer):
                 raise RuntimeError(u"Add support for no-limit udp_pps.")
             partial_attempt_count = self._sent
             expected_attempt_count = self.transaction_scale * self.ppta
-            fail_count = self._loss + (expected_attempt_count - self._sent)
+            unsent = expected_attempt_count - self._sent
+            fail_count = self._loss + unsent
         elif self.transaction_type == u"tcp_pps":
             if not self.transaction_scale:
                 raise RuntimeError(u"Add support for no-limit tcp_pps.")
@@ -1286,9 +1303,12 @@ class TrafficGenerator(AbstractMeasurer):
             # A simple workaround is to add absolute difference.
             # Probability of retransmissions exactly cancelling
             # packets unsent due to duration stretching is quite low.
-            fail_count = self._loss + abs(expected_attempt_count - self._sent)
+            unsent = abs(expected_attempt_count - self._sent)
+            fail_count = self._loss + unsent
         else:
             raise RuntimeError(f"Unknown parsing {self.transaction_type!r}")
+        if unsent:
+            logger.debug(f"Unsent packets/transactions: {unsent}")
         if fail_count < 0 and not self.negative_loss:
             fail_count = 0
         measurement = ReceiveRateMeasurement(
index 5942330..ef268ff 100644 (file)
 | | ${results} = | Create List
 | | FOR | ${i} | IN RANGE | ${trial_multiplicity}
 | | | Call Resetter
-| | | ${result}= | Send traffic on tg
+| | | ${result} = | Send traffic on tg
 | | | ... | duration=${trial_duration}
 | | | ... | rate=${rate}
 | | | ... | frame_size=${frame_size}