From 9f0ea4e6be697aa8ca08a6da596d7ca5892d2183 Mon Sep 17 00:00:00 2001 From: Vratko Polak Date: Thu, 25 Mar 2021 19:26:35 +0100 Subject: [PATCH] STL traffic: Use the sleep+stop logic + PPS limit for AWS set to 1.2 Mpps. + The logic is very similar to the one in the ASTF driver. + This helps for testbeds with high duration stretching (e.g. AWS). + Difference: No transaction scale, and we deal with floats. + Update loss counting to count unsent packets as lost. + Also count "unsent" transactions for other transaction types. + If nonzero, log the number of unsent packets/transactions. + Make STL and ASTF time overhead constant (called delay) configurable. + Subtract delay from approximated_duration, also for ASTF. Change-Id: I6ee6aa6fba4f110ba1636e1b0ff76cac64383e33 Signed-off-by: Vratko Polak --- GPL/tools/trex/trex_astf_profile.py | 30 ++++++++++++---------- GPL/tools/trex/trex_stl_profile.py | 24 ++++++++++++----- resources/libraries/python/Constants.py | 14 ++++++++-- resources/libraries/python/TrafficGenerator.py | 30 ++++++++++++++++++---- .../robot/performance/performance_utils.robot | 2 +- 5 files changed, 71 insertions(+), 29 deletions(-) diff --git a/GPL/tools/trex/trex_astf_profile.py b/GPL/tools/trex/trex_astf_profile.py index 3f244ecf09..d08fb61778 100644 --- a/GPL/tools/trex/trex_astf_profile.py +++ b/GPL/tools/trex/trex_astf_profile.py @@ -79,6 +79,7 @@ def simple_burst( latency, async_start=False, traffic_directions=2, + delay=0.0, ): """Send traffic and measure packet loss and latency. @@ -98,8 +99,9 @@ def simple_burst( Duration details: Contrary to stateless mode, ASTF profiles typically limit the number of flows/transactions that can happen. - The caller is expected to set the duration parameter accordingly to - this limit and multiplier, including any overheads. 
+ The caller is expected to set the duration parameter to an idealized value, + but set the delay argument when TRex is expected + to finish processing replies later (including a window for latency). See *_traffic_duration output fields for TRex's measurement of the real traffic duration (should be without any inactivity overheads). If traffic has not ended by the final time, the traffic @@ -111,11 +113,7 @@ def simple_burst( :param profile_file: A python module with T-rex traffic profile. :param duration: Expected duration for all transactions to finish, - assuming only tolerable duration stretching happens. - This includes later start of later transactions - (according to TPS multiplier) and expected duration of each transaction. - Critically, this also includes any delay TRex shows when starting - traffic (but not the similar delay during stopping). + without any TRex related delays, without even latency. :param framesize: Frame size. :param multiplier: Multiplier of profile CPS. :param port_0: Port 0 on the traffic generator. @@ -123,6 +121,7 @@ def simple_burst( :param latency: With latency stats. :param async_start: Start the traffic and exit. :param traffic_directions: Bidirectional (2) or unidirectional (1) traffic. + :param delay: Time increase [s] for sleep duration. :type profile_file: str :type duration: float :type framesize: int or str @@ -132,6 +131,7 @@ def simple_burst( :type latency: bool :type async_start: bool :type traffic_directions: int + :type delay: float """ client = None total_received = 0 @@ -179,14 +179,12 @@ def simple_burst( # Choose CPS and start traffic. client.start( mult=multiplier, - # Increase the input duration slightly, - # to ensure it does not end before sleep&stop below happens. 
- duration=duration + 0.1 if duration > 0 else duration, + duration=duration, nc=True, latency_pps=int(multiplier) if latency else 0, client_mask=2**len(ports)-1, ) - time_start = time.monotonic() + time_stop = time.monotonic() + duration + delay if async_start: # For async stop, we need to export the current snapshot. @@ -196,8 +194,7 @@ def simple_burst( xsnap1 = client.ports[port_1].get_xstats().reference_stats print(f"Xstats snapshot 1: {xsnap1!r}") else: - time.sleep(duration) - + time.sleep(duration + delay) # Do not block yet, the existing transactions may take long time # to finish. We need an action that is almost reset(), # but without clearing stats. @@ -208,7 +205,7 @@ def simple_burst( client.stop(block=True) # Read the stats after the traffic stopped (or time up). - stats[time.monotonic() - time_start] = client.get_stats( + stats[time.monotonic() - time_stop] = client.get_stats( ports=ports ) @@ -442,6 +439,10 @@ def main(): u"--traffic_directions", type=int, default=2, help=u"Send bi- (2) or uni- (1) directional traffic." ) + parser.add_argument( + u"--delay", required=True, type=float, default=0.0, + help=u"Allowed time overhead, sleep time is increased by this [s]." + ) args = parser.parse_args() @@ -460,6 +461,7 @@ def main(): latency=args.latency, async_start=args.async_start, traffic_directions=args.traffic_directions, + delay=args.delay, ) diff --git a/GPL/tools/trex/trex_stl_profile.py b/GPL/tools/trex/trex_stl_profile.py index 8dbc046482..4eedc80899 100644 --- a/GPL/tools/trex/trex_stl_profile.py +++ b/GPL/tools/trex/trex_stl_profile.py @@ -80,6 +80,7 @@ def simple_burst( async_start=False, traffic_directions=2, force=False, + delay=0.0, ): """Send traffic and measure packet loss and latency. @@ -108,6 +109,7 @@ def simple_burst( :param async_start: Start the traffic and exit. :param traffic_directions: Bidirectional (2) or unidirectional (1) traffic. :param force: Force start regardless of ports state. + :param delay: Sleep overhead [s]. 
:type profile_file: str :type framesize: int or str :type duration: float @@ -118,6 +120,7 @@ def simple_burst( :type async_start: bool :type traffic_directions: int :type force: bool + :type delay: float """ client = None total_rcvd = 0 @@ -201,18 +204,20 @@ def simple_burst( xsnap1 = client.ports[1].get_xstats().reference_stats print(f"Xstats snapshot 1: {xsnap1!r}") else: - # Block until done: time_start = time.monotonic() - client.wait_on_traffic(ports=ports, timeout=duration+30) + # wait_on_traffic fails if duration stretches by 30 seconds or more. + # TRex has some overhead, wait some more. + time.sleep(duration + delay) + client.stop() time_stop = time.monotonic() - approximated_duration = time_stop - time_start - + approximated_duration = time_stop - time_start - delay + # Read the stats after the traffic stopped (or time up). + stats = client.get_stats() if client.get_warnings(): for warning in client.get_warnings(): print(warning) - - # Read the stats after the test - stats = client.get_stats() + # Now finish the complete reset. + client.reset() print(u"##### Statistics #####") print(json.dumps(stats, indent=4, separators=(u",", u": "))) @@ -314,6 +319,10 @@ def main(): u"--force", action=u"store_true", default=False, help=u"Force start regardless of ports state." ) + parser.add_argument( + u"--delay", required=True, type=float, default=0.0, + help=u"Delay assumed for traffic, sleep time is increased by this [s]." + ) args = parser.parse_args() @@ -333,6 +342,7 @@ def main(): async_start=args.async_start, traffic_directions=args.traffic_directions, force=args.force, + delay=args.delay, ) diff --git a/resources/libraries/python/Constants.py b/resources/libraries/python/Constants.py index 8937e6cfc0..14fb7958b8 100644 --- a/resources/libraries/python/Constants.py +++ b/resources/libraries/python/Constants.py @@ -234,7 +234,7 @@ class Constants: # Number of trials to execute in MRR test. 
PERF_TRIAL_MULTIPLICITY = get_int_from_env(u"PERF_TRIAL_MULTIPLICITY", 10) - # Duration of one trial in MRR test. + # Duration [s] of one trial in MRR test. PERF_TRIAL_DURATION = get_float_from_env(u"PERF_TRIAL_DURATION", 1.0) # Whether to use latency streams in main search trials. @@ -244,6 +244,16 @@ class Constants: PERF_TRIAL_LATENCY_DURATION = get_float_from_env( u"PERF_TRIAL_LATENCY_DURATION", 5.0) + # For some testbeds TG takes longer than usual to start sending traffic. + # This constant [s] allows longer wait, without affecting + # the approximate duration. For example, use 0.098 for AWS. + PERF_TRIAL_STL_DELAY = get_float_from_env(u"PERF_TRIAL_STL_DELAY", 0.0) + + # ASTF usually needs a different value for the delay. + PERF_TRIAL_ASTF_DELAY = get_float_from_env( + u"PERF_TRIAL_ASTF_DELAY", 0.112 + ) + # Extended debug (incl. vpp packet trace, linux perf stat, ...). # Full list is available as suite variable (__init__.robot) or is # override by test. @@ -300,7 +310,7 @@ class Constants: # Vxlan traffic will still show stretching at 36 Mpps (>12%), # but we do not care about those tests that much. u"Mellanox-CX556A": 36000000, # 148809523, - u"Amazon-Nitro-50G": 1500000, + u"Amazon-Nitro-50G": 1200000, u"virtual": 14880952, } diff --git a/resources/libraries/python/TrafficGenerator.py b/resources/libraries/python/TrafficGenerator.py index c0d1b4ec63..e42ad48930 100644 --- a/resources/libraries/python/TrafficGenerator.py +++ b/resources/libraries/python/TrafficGenerator.py @@ -13,6 +13,7 @@ """Performance testing traffic generator library.""" +import math import time from robot.api import logger @@ -577,7 +578,6 @@ class TrafficGenerator(AbstractMeasurer): # so we can compare with what telemetry suggests # the real duration was. 
logger.debug(f"Expected duration {computed_duration}") - computed_duration += 0.1115 if not self.duration_limit: return computed_duration, True limited_duration = min(computed_duration, self.duration_limit) @@ -652,6 +652,9 @@ class TrafficGenerator(AbstractMeasurer): command_line.add_if(u"async_start", async_call) command_line.add_if(u"latency", self.use_latency) command_line.add_if(u"force", Constants.TREX_SEND_FORCE) + command_line.add_with_value( + u"delay", Constants.PERF_TRIAL_ASTF_DELAY + ) self._start_time = time.monotonic() self._rate = multiplier @@ -759,6 +762,7 @@ class TrafficGenerator(AbstractMeasurer): command_line.add_if(u"async_start", async_call) command_line.add_if(u"latency", self.use_latency) command_line.add_if(u"force", Constants.TREX_SEND_FORCE) + command_line.add_with_value(u"delay", Constants.PERF_TRIAL_STL_DELAY) # TODO: This is ugly. Handle parsing better. self._start_time = time.monotonic() @@ -1244,16 +1248,27 @@ class TrafficGenerator(AbstractMeasurer): if not target_duration: target_duration = approximated_duration transmit_rate = self._rate + unsent = 0 if self.transaction_type == u"packet": partial_attempt_count = self._sent - expected_attempt_count = self._sent - fail_count = self._loss + packet_rate = transmit_rate * self.ppta + # We have a float. TRex way of rounding it is not obvious. + # The biggest source of mismatch is Inter Stream Gap. + # So the code tolerates 10 usec of missing packets. + expected_attempt_count = (target_duration - 1e-5) * packet_rate + expected_attempt_count = math.ceil(expected_attempt_count) + # TRex can send more. 
+ expected_attempt_count = max(expected_attempt_count, self._sent) + unsent = expected_attempt_count - self._sent + pass_count = self._received + fail_count = expected_attempt_count - pass_count elif self.transaction_type == u"udp_cps": if not self.transaction_scale: raise RuntimeError(u"Add support for no-limit udp_cps.") partial_attempt_count = self._l7_data[u"client"][u"sent"] # We do not care whether TG is slow, it should have attempted all. expected_attempt_count = self.transaction_scale + unsent = expected_attempt_count - partial_attempt_count pass_count = self._l7_data[u"client"][u"received"] fail_count = expected_attempt_count - pass_count elif self.transaction_type == u"tcp_cps": @@ -1263,6 +1278,7 @@ class TrafficGenerator(AbstractMeasurer): partial_attempt_count = ctca # We do not care whether TG is slow, it should have attempted all. expected_attempt_count = self.transaction_scale + unsent = expected_attempt_count - partial_attempt_count # From TCP point of view, server/connects counts full connections, # but we are testing NAT session so client/connects counts that # (half connections from TCP point of view). @@ -1273,7 +1289,8 @@ class TrafficGenerator(AbstractMeasurer): raise RuntimeError(u"Add support for no-limit udp_pps.") partial_attempt_count = self._sent expected_attempt_count = self.transaction_scale * self.ppta - fail_count = self._loss + (expected_attempt_count - self._sent) + unsent = expected_attempt_count - self._sent + fail_count = self._loss + unsent elif self.transaction_type == u"tcp_pps": if not self.transaction_scale: raise RuntimeError(u"Add support for no-limit tcp_pps.") @@ -1286,9 +1303,12 @@ class TrafficGenerator(AbstractMeasurer): # A simple workaround is to add absolute difference. # Probability of retransmissions exactly cancelling # packets unsent due to duration stretching is quite low. 
- fail_count = self._loss + abs(expected_attempt_count - self._sent) + unsent = abs(expected_attempt_count - self._sent) + fail_count = self._loss + unsent else: raise RuntimeError(f"Unknown parsing {self.transaction_type!r}") + if unsent: + logger.debug(f"Unsent packets/transactions: {unsent}") if fail_count < 0 and not self.negative_loss: fail_count = 0 measurement = ReceiveRateMeasurement( diff --git a/resources/libraries/robot/performance/performance_utils.robot b/resources/libraries/robot/performance/performance_utils.robot index 59423304a2..ef268ff663 100644 --- a/resources/libraries/robot/performance/performance_utils.robot +++ b/resources/libraries/robot/performance/performance_utils.robot @@ -441,7 +441,7 @@ | | ${results} = | Create List | | FOR | ${i} | IN RANGE | ${trial_multiplicity} | | | Call Resetter -| | | ${result}= | Send traffic on tg +| | | ${result} = | Send traffic on tg | | | ... | duration=${trial_duration} | | | ... | rate=${rate} | | | ... | frame_size=${frame_size} -- 2.16.6