From: Vratko Polak
Date: Fri, 8 Sep 2023 15:22:20 +0000 (+0200)
Subject: feat(perpatch): parse results from json
X-Git-Url: https://gerrit.fd.io/r/gitweb?p=csit.git;a=commitdiff_plain;h=d8ec3f8673346c0dc93e567159771f24c1bf74fc

feat(perpatch): parse results from json

+ Use test names in output.
- Methodology updated in subsequent change.

Change-Id: I6a62f87249ea79262778f68d00f9bb81134f0b02
Signed-off-by: Vratko Polak
---

diff --git a/resources/libraries/bash/entry/per_patch_perf.sh b/resources/libraries/bash/entry/per_patch_perf.sh
index b1854992e2..aba899689e 100644
--- a/resources/libraries/bash/entry/per_patch_perf.sh
+++ b/resources/libraries/bash/entry/per_patch_perf.sh
@@ -73,13 +73,13 @@ for ((iter=0; iter "${out_file}"
-}
-
-
-function parse_results_ndrpdr () {
-
-    # Parse NDRPDR test message(s) for PDR_LOWER, into JSON-readable output.
-    #
-    # Return non-zero if parsing fails.
-    # Parse for PDR, unless environment variable says NDR.
-    #
-    # Arguments:
-    # - ${1} - Path to (existing) input file. Required.
-    # - ${2} - Path to (overwritten if exists) output file. Required.
-    # Variables read:
-    # - FDIO_CSIT_PERF_PARSE_NDR - If defined and "yes", parse for NDR, not PDR.
-    # Files read:
-    # - output.xml - The input file from argument location.
-    # Files updated:
-    # - results.txt - (Re)created, in argument location.
-    # Functions called:
-    # - die - Print to stderr and exit, defined in common.sh
-
-    set -exuo pipefail
-
-    in_file="${1}" || die "Two arguments needed."
-    out_file="${2}" || die "Two arguments needed."
-    if [[ "${FDIO_CSIT_PERF_PARSE_NDR:-no}" == "yes" ]]; then
-        pattern1="Arguments: [ '\\nNDR_LOWER: " || die
-    else
-        pattern1="Arguments: [ '\\nPDR_LOWER: " || die
-    fi
-    # Adapted from https://superuser.com/a/377084
-    pattern2='(?<=R: ).*(?= pps)' || die
-    if fgrep "${pattern1}" "${in_file}" | grep -Po "${pattern2}" >> "${out_file}"
-    then
-        # Add bracket https://www.shellhacks.com/sed-awk-add-end-beginning-line/
-        sed -i 's/.*/[&]/' "${out_file}"
-        # Returns nonzero if fails.
-        return "$?"
-    fi
-    # Maybe it was CPS instead of pps?
-    pattern2='(?<=R: ).*(?= CPS)' || die
-    if fgrep "${pattern1}" "${in_file}" | grep -Po "${pattern2}" >> "${out_file}"
-    then
-        # Add bracket https://www.shellhacks.com/sed-awk-add-end-beginning-line/
-        sed -i 's/.*/[&]/' "${out_file}"
-        # Returns nonzero if fails.
-        return "$?"
-    else
-        return 1
-    fi
-}
-
-
-function parse_results_passrate () {
-
-    # Create fake values for failed tests.
-    #
-    # This function always passes (or dies).
-    #
-    # A non-zero but small value is chosen for failed run, to distinguish from
-    # real nonzero perf (which are big in general) and real zero values.
-    # A medium sized value is chosen for a passed run.
-    # This way bisect can search for breakages and fixes in device tests.
-    # At least in theory, as device tests are bootstrapped too differently.
-    #
-    # The fake value is repeated according to BMRR multiplicity,
-    # because a single value can be lost in high stdev data.
-    # (And it does not hurt for single value outputs such as NDR.)
-    #
-    # TODO: Count number of tests and generate fake results for every one.
-    # Currently that would interfere with test retry logic.
-    #
-    # Arguments:
-    # - ${1} - Path to (existing) input file. Required.
-    # - ${2} - Path to (overwritten if exists) output file. Required.
-    # Variables read:
-    # - CSIT_PERF_TRIAL_MULTIPLICITY - To create fake results of this length.
-    # Files read:
-    # - output.xml - The input file from argument location.
-    # Files updated:
-    # - results.txt - (Re)created, in argument location.
-    # Functions called:
-    # - die - Print to stderr and exit, defined in common.sh
-
-    set -exuo pipefail
-
-    in_file="${1}" || die "Two arguments needed."
-    out_file="${2}" || die "Two arguments needed."
-    # The last status is the top level (global) robot status.
-    # It only passes if there were no (critical) test failures.
-    if fgrep ' "${out_file}" || die
-}
-
-
-function parse_results_soak () {
-
-    # Parse soak test message(s) for lower bound, into JSON-readable output.
-    #
-    # Return non-zero if parsing fails.
-    #
-    # Arguments:
-    # - ${1} - Path to (existing) input file. Required.
-    # - ${2} - Path to (overwritten if exists) output file. Required.
-    # Files read:
-    # - output.xml - The input file from argument location.
-    # Files updated:
-    # - results.txt - (Re)created, in argument location.
-    # Functions called:
-    # - die - Print to stderr and exit, defined in common.sh
-
-    set -exuo pipefail
-
-    in_file="${1}" || die "Two arguments needed."
-    out_file="${2}" || die "Two arguments needed."
-    pattern1='PLRsearch lower bound: .*, .*<' || die
-    # Adapted from https://superuser.com/a/377084
-    pattern2='(?<=: ).*(?= pps)' || die
-    if grep "${pattern1}" "${in_file}" | grep -Po "${pattern2}" >> "${out_file}"
-    then
-        # Add bracket https://www.shellhacks.com/sed-awk-add-end-beginning-line/
-        sed -i 's/.*/[&]/' "${out_file}"
-        # Returns nonzero if fails.
-    else
-        return 1
-    fi
-}
-
-
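For reference, the parsers removed above all fed the legacy flow: each one grepped output.xml and wrote one bracketed, json-parseable list of floats per test case into results.txt, which the old compare_perpatch.py read back with json.loads. A minimal sketch of how such a line was consumed, with invented numbers:

    import json

    # One hypothetical line of the legacy results.txt (values invented);
    # the sed 's/.*/[&]/' in the removed functions added the brackets.
    legacy_line = "[1.95e6, 2.01e6, 1.98e6, 2.02e6]"
    values = json.loads(legacy_line)  # one list of floats per test case
    print(values)

The json exports parsed by the new compare_perpatch.py below replace this per-line text scraping.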
 function select_build () {
 
     # Arguments:
diff --git a/resources/tools/integrated/compare_perpatch.py b/resources/tools/integrated/compare_perpatch.py
index b4d52dcdfe..0adb6ae73e 100644
--- a/resources/tools/integrated/compare_perpatch.py
+++ b/resources/tools/integrated/compare_perpatch.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at:
@@ -13,72 +13,127 @@
 """Script for determining whether per-patch perf test votes -1.
 
-This script assumes there exist two text files with processed BMRR results,
-located at hardcoded relative paths (subdirs thereof), having several lines
-of json-parseable lists of float values, corresponding to testcase results.
+This script expects a particular tree created on a filesystem by
+the per_patch_perf.sh bootstrap script, including test results
+exported as json files according to the current model schema.
+This script extracts the results (according to result type)
+and joins them into one list of floats for parent and one for current.
+
 This script then uses jumpavg library to determine whether there was
 a regression, progression or no change for each testcase.
-If number of tests does not match, or there was a regression,
+
+If the set of test names does not match, or there was a regression,
 this script votes -1 (by exiting with code 1), otherwise it votes +1 (exit 0).
 """
 
 import json
+import os
 import sys
 
+from typing import Dict, List
+
 from resources.libraries.python import jumpavg
 
 
-def main():
-    """Execute the main logic, return the code to return as return code.
+def parse(dirpath: str, fake_value: float) -> Dict[str, List[float]]:
+    """Look for test jsons, extract scalar results.
+
+    Files other than .json are skipped, jsons without test_id are skipped.
+    If the test failed, four fake values are used as a fake result.
+
+    Units are ignored, as both parent and current are tested
+    with the same CSIT code so the unit should be identical.
+
+    :param dirpath: Path to the directory tree to examine.
+    :param fake_value: Fake value to use for test cases that failed.
+    :type dirpath: str
+    :returns: Mapping from test IDs to list of measured values.
+    :rtype: Dict[str, List[float]]
+    :raises RuntimeError: On duplicate test ID or unknown test type.
+    """
+    results = {}
+    for root, _, files in os.walk(dirpath):
+        for filename in files:
+            if not filename.endswith(".json"):
+                continue
+            filepath = os.path.join(root, filename)
+            with open(filepath, "rt", encoding="utf8") as file_in:
+                data = json.load(file_in)
+            if "test_id" not in data:
+                continue
+            name = data["test_id"]
+            if name in results:
+                raise RuntimeError(f"Duplicate: {name}")
+            if not data["passed"]:
+                results[name] = [fake_value] * 4
+                continue
+            result_object = data["result"]
+            result_type = result_object["type"]
+            if result_type == "mrr":
+                results[name] = result_object["receive_rate"]["rate"]["values"]
+            elif result_type == "ndrpdr":
+                results[name] = [result_object["pdr"]["lower"]["rate"]["value"]]
+            elif result_type == "soak":
+                results[name] = [
+                    result_object["critical_rate"]["lower"]["rate"]["value"]
+                ]
+            elif result_type == "reconf":
+                results[name] = [result_object["loss"]["time"]["value"]]
+            elif result_type == "hoststack":
+                results[name] = [result_object["bandwidth"]["value"]]
+            else:
+                raise RuntimeError(f"Unknown result type: {result_type}")
+    return results
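The exact export schema lives in the CSIT model definition; judging only from the accesses in parse() above, a minimal "mrr" export would have to carry roughly the following fields (the test id and numbers are invented for illustration):

    # Sketch of the fields parse() reads for an "mrr" result; real exports
    # produced by the CSIT model contain many more fields.
    example_export = {
        "test_id": "tests.vpp.perf.ip4.64b-2c-ethip4-ip4base-mrr",
        "passed": True,
        "result": {
            "type": "mrr",
            "receive_rate": {"rate": {"values": [3.1e6, 3.0e6, 3.2e6, 3.1e6]}},
        },
    }
    # parse() would turn such a file into one dictionary entry:
    # {"tests.vpp.perf.ip4.64b-2c-ethip4-ip4base-mrr": [3.1e6, 3.0e6, 3.2e6, 3.1e6]}

For a failed test only "test_id" and "passed" matter, and four copies of fake_value are used instead of a real result.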
+
+
+def main() -> int:
+    """Execute the main logic, return a number to use as the return code.
+
+    Call parse to get parent and current data.
+    Use higher fake value for parent, so changes that keep a test failing
+    are marked as regressions.
+
+    If there are multiple iterations, the value lists are joined.
+    For each test, call jumpavg.classify to detect possible regression.
+
+    If there is at least one regression, return 3.
 
     :returns: Return code, 0 or 3 based on the comparison result.
     :rtype: int
     """
     iteration = -1
-    parent_iterations = list()
-    current_iterations = list()
-    num_tests = None
+    parent_aggregate = {}
+    current_aggregate = {}
+    test_names = None
     while 1:
         iteration += 1
-        parent_lines = list()
-        current_lines = list()
-        filename = f"csit_parent/{iteration}/results.txt"
-        try:
-            with open(filename) as parent_file:
-                parent_lines = parent_file.readlines()
-        except IOError:
+        parent_results = {}
+        current_results = {}
+        parent_results = parse(f"csit_parent/{iteration}", fake_value=2.0)
+        parent_names = set(parent_results.keys())
+        if test_names is None:
+            test_names = parent_names
+        if not parent_names:
+            # No more iterations.
             break
-        num_lines = len(parent_lines)
-        filename = f"csit_current/{iteration}/results.txt"
-        with open(filename) as current_file:
-            current_lines = current_file.readlines()
-        if num_lines != len(current_lines):
-            print(
-                f"Number of tests does not match within iteration {iteration}",
-                file=sys.stderr
-            )
-            return 1
-        if num_tests is None:
-            num_tests = num_lines
-        elif num_tests != num_lines:
-            print(
-                f"Number of tests does not match previous at iteration "
-                f"{iteration}", file=sys.stderr
-            )
-            return 1
-        parent_iterations.append(parent_lines)
-        current_iterations.append(current_lines)
+        assert parent_names == test_names, f"{parent_names} != {test_names}"
+        current_results = parse(f"csit_current/{iteration}", fake_value=1.0)
+        current_names = set(current_results.keys())
+        assert (
+            current_names == parent_names
+        ), f"{current_names} != {parent_names}"
+        for name in test_names:
+            if name not in parent_aggregate:
+                parent_aggregate[name] = []
+            if name not in current_aggregate:
+                current_aggregate[name] = []
+            parent_aggregate[name].extend(parent_results[name])
+            current_aggregate[name].extend(current_results[name])
     exit_code = 0
-    for test_index in range(num_tests):
-        parent_values = list()
-        current_values = list()
-        for iteration_index, _ in enumerate(parent_iterations):
-            parent_values.extend(
-                json.loads(parent_iterations[iteration_index][test_index])
-            )
-            current_values.extend(
-                json.loads(current_iterations[iteration_index][test_index])
-            )
+    for name in test_names:
+        print(f"Test name: {name}")
+        parent_values = parent_aggregate[name]
+        current_values = current_aggregate[name]
         print(f"Time-ordered MRR values for parent build: {parent_values}")
         print(f"Time-ordered MRR values for current build: {current_values}")
         parent_values = sorted(parent_values)
@@ -87,11 +142,14 @@
         parent_stats = jumpavg.AvgStdevStats.for_runs(parent_values)
         current_stats = jumpavg.AvgStdevStats.for_runs(current_values)
         parent_group_list = jumpavg.BitCountingGroupList(
-            max_value=max_value).append_group_of_runs([parent_stats])
-        combined_group_list = parent_group_list.copy(
-        ).extend_runs_to_last_group([current_stats])
+            max_value=max_value
+        ).append_group_of_runs([parent_stats])
+        combined_group_list = (
+            parent_group_list.copy().extend_runs_to_last_group([current_stats])
+        )
         separated_group_list = parent_group_list.append_group_of_runs(
-            [current_stats])
+            [current_stats]
+        )
         print(f"Value-ordered MRR values for parent build: {parent_values}")
         print(f"Value-ordered MRR values for current build: {current_values}")
         avg_diff = (current_stats.avg - parent_stats.avg) / parent_stats.avg
@@ -103,7 +161,7 @@
             f" {combined_group_list[0].stats}"
         )
         bits_diff = separated_group_list.bits - combined_group_list.bits
-        compared = u"longer" if bits_diff >= 0 else u"shorter"
+        compared = "longer" if bits_diff >= 0 else "shorter"
         print(
             f"Separate groups are {compared} than single group"
            f" by {abs(bits_diff)} bits"
        )
@@ -112,16 +170,17 @@
         # That matters if only stats (not list of floats) are given.
         classified_list = jumpavg.classify([parent_values, current_values])
         if len(classified_list) < 2:
-            print(f"Test test_index {test_index}: normal (no anomaly)")
+            print(f"Test {name}: normal (no anomaly)")
             continue
         anomaly = classified_list[1].comment
-        if anomaly == u"regression":
-            print(f"Test test_index {test_index}: anomaly regression")
+        if anomaly == "regression":
+            print(f"Test {name}: anomaly regression")
             exit_code = 3  # 1 or 2 can be caused by other errors
             continue
-        print(f"Test test_index {test_index}: anomaly {anomaly}")
+        print(f"Test {name}: anomaly {anomaly}")
     print(f"Exit code: {exit_code}")
     return exit_code
 
-if __name__ == u"__main__":
+
+if __name__ == "__main__":
     sys.exit(main())
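The voting logic can be exercised in isolation with the same jumpavg call the script uses. A small sketch follows; the numbers are invented, and whether a given pair of lists gets split into two groups is ultimately up to jumpavg:

    from resources.libraries.python import jumpavg

    # A test that fails in both builds: parse() substitutes [2.0] * 4 for the
    # parent run and [1.0] * 4 for the current run (the fake_value arguments
    # passed in main()), so the apparent drop should come out as a regression
    # and keep the vote at -1.
    parent_values = [2.0] * 4
    current_values = [1.0] * 4
    classified = jumpavg.classify([parent_values, current_values])
    anomaly = classified[1].comment if len(classified) > 1 else "normal"
    print(f"Fake-value scenario classified as: {anomaly}")

This is why the parent directories get the higher fake value: a test that keeps failing shows up as a regression rather than as a progression or no change.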