C-Dash: add "download raw data" feature to comparison tables

[csit.git] / csit.infra.dash / app / cdash / comparisons / tables.py
diff --git a/csit.infra.dash/app/cdash/comparisons/tables.py b/csit.infra.dash/app/cdash/comparisons/tables.py

index 2cafc9f..18f9404 100644 (file)
--- a/csit.infra.dash/app/cdash/comparisons/tables.py
+++ b/csit.infra.dash/app/cdash/comparisons/tables.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at:
@@ -16,16 +16,19 @@
  
  import pandas as pd
  
-from numpy import mean, std
+from numpy import mean, std, percentile
  from copy import deepcopy
+
  from ..utils.constants import Constants as C
  from ..utils.utils import relative_change_stdev
  
  
-def select_comparison_data(
+def select_comp_data(
          data: pd.DataFrame,
          selected: dict,
-        normalize: bool=False
+        normalize: bool=False,
+        remove_outliers: bool=False,
+        raw_data: bool=False
      ) -> pd.DataFrame:
      """Select data for a comparison table.
  
@@ -34,9 +37,15 @@ def select_comparison_data(
          the user.
      :param normalize: If True, the data is normalized to CPU frequency
          Constants.NORM_FREQUENCY.
+    :param remove_outliers: If True, the outliers are removed before
+        generating the table.
+    :param raw_data: If True, returns data as it is in parquets without any
+        processing. It is used for "download raw data" feature.
      :type data: pandas.DataFrame
      :type selected: dict
      :type normalize: bool
+    :type remove_outliers: bool
+    :type raw_data: bool
      :returns: A data frame with selected data.
      :rtype: pandas.DataFrame
      """
@@ -45,7 +54,8 @@ def select_comparison_data(
              data_in: pd.DataFrame,
              ttype: str,
              drv: str,
-            norm_factor: float
+            norm_factor: float,
+            remove_outliers: bool=False
          ) -> pd.DataFrame:
          """Calculates mean value and standard deviation for provided data.
  
@@ -53,10 +63,13 @@ def select_comparison_data(
          :param ttype: The test type.
          :param drv: The driver.
          :param norm_factor: The data normalization factor.
+        :param remove_outliers: If True, the outliers are removed before
+            generating the table.
          :type data_in: pandas.DataFrame
          :type ttype: str
          :type drv: str
          :type norm_factor: float
+        :type remove_outliers: bool
          :returns: A pandas dataframe with: test name, mean value, standard
              deviation and unit.
          :rtype: pandas.DataFrame
@@ -81,6 +94,15 @@ def select_comparison_data(
                  for l_itm in l_df:
                      tmp_df.extend(l_itm)
                  l_df = tmp_df
+
+            if remove_outliers:
+                q1 = percentile(l_df, 25, method=C.COMP_PERCENTILE_METHOD)
+                q3 = percentile(l_df, 75, method=C.COMP_PERCENTILE_METHOD)
+                irq = q3 - q1
+                lif = q1 - C.COMP_OUTLIER_TYPE * irq
+                uif = q3 + C.COMP_OUTLIER_TYPE * irq
+                l_df = [i for i in l_df if i >= lif and i <= uif]
+
              try:
                  mean_val = mean(l_df)
                  std_val = std(l_df)
@@ -143,12 +165,14 @@ def select_comparison_data(
                      norm_factor = C.NORM_FREQUENCY / C.FREQUENCY[itm["tbed"]]
              else:
                  norm_factor = 1.0
-            tmp_df = _calculate_statistics(
-                tmp_df,
-                itm["ttype"].lower(),
-                itm["driver"],
-                norm_factor
-            )
+            if not raw_data:
+                tmp_df = _calculate_statistics(
+                    tmp_df,
+                    itm["ttype"].lower(),
+                    itm["driver"],
+                    norm_factor,
+                    remove_outliers=remove_outliers
+                )
  
          lst_df.append(tmp_df)
  
@@ -170,7 +194,9 @@ def comparison_table(
          data: pd.DataFrame,
          selected: dict,
          normalize: bool,
-        format: str="html"
+        format: str="html",
+        remove_outliers: bool=False,
+        raw_data: bool=False
      ) -> tuple:
      """Generate a comparison table.
  
@@ -184,10 +210,16 @@ def comparison_table(
            of the unit.
          - csv: To be downloaded as a CSV file the values are stored in base
            units.
+    :param remove_outliers: If True, the outliers are removed before
+        generating the table.
+    :param raw_data: If True, returns data as it is in parquets without any
+        processing. It is used for "download raw data" feature.
      :type data: pandas.DataFrame
      :type selected: dict
      :type normalize: bool
      :type format: str
+    :type remove_outliers: bool
+    :type raw_data: bool
      :returns: A tuple with the tabe title and the comparison table.
      :rtype: tuple[str, pandas.DataFrame]
      """
@@ -218,9 +250,32 @@ def comparison_table(
                      })
          return selection
  
+    # Select reference data
      r_sel = deepcopy(selected["reference"]["selection"])
-    c_params = selected["compare"]
      r_selection = _create_selection(r_sel)
+    r_data = select_comp_data(
+        data, r_selection, normalize, remove_outliers, raw_data
+    )
+
+    # Select compare data
+    c_sel = deepcopy(selected["reference"]["selection"])
+    c_params = selected["compare"]
+    if c_params["parameter"] in ("core", "frmsize", "ttype"):
+        c_sel[c_params["parameter"]] = [c_params["value"], ]
+    else:
+        c_sel[c_params["parameter"]] = c_params["value"]
+    c_selection = _create_selection(c_sel)
+    c_data = select_comp_data(
+        data, c_selection, normalize, remove_outliers, raw_data
+    )
+
+    if raw_data:
+        r_data["ref/cmp"] = "reference"
+        c_data["ref/cmp"] = "compare"
+        return str(), pd.concat([r_data, c_data], ignore_index=True, copy=False)
+
+    if r_data.empty or c_data.empty:
+        return str(), pd.DataFrame()
  
      if format == "html" and "Latency" not in r_sel["ttype"]:
          unit_factor, s_unit_factor = (1e6, "M")
@@ -243,22 +298,6 @@ def comparison_table(
          r_name = "|".join(r_name)
      c_name = c_params["value"]
  
-    # Select reference data
-    r_data = select_comparison_data(data, r_selection, normalize)
-
-    # Select compare data
-    c_sel = deepcopy(selected["reference"]["selection"])
-    if c_params["parameter"] in ("core", "frmsize", "ttype"):
-        c_sel[c_params["parameter"]] = [c_params["value"], ]
-    else:
-        c_sel[c_params["parameter"]] = c_params["value"]
-
-    c_selection = _create_selection(c_sel)
-    c_data = select_comparison_data(data, c_selection, normalize)
-
-    if r_data.empty or c_data.empty:
-        return str(), pd.DataFrame()
-
      l_name, l_r_mean, l_r_std, l_c_mean, l_c_std, l_rc_mean, l_rc_std, unit = \
          list(), list(), list(), list(), list(), list(), list(), set()
      for _, row in r_data.iterrows():