From 157aa10abdfb9c53f8e687da97ac72e9a724b457 Mon Sep 17 00:00:00 2001
From: Tibor Frank <tifrank@cisco.com>
Date: Thu, 7 Mar 2024 12:00:04 +0000
Subject: [PATCH] C-Dash: Add possibility to remove outliers from comparison
 data

- extreme outliers only for now

Change-Id: I2be7c291dd207d5a557947415a960fa6d0067c64
Signed-off-by: Tibor Frank <tifrank@cisco.com>
---
 csit.infra.dash/app/cdash/comparisons/layout.py | 70 ++++++++++++++++++-------
 csit.infra.dash/app/cdash/comparisons/tables.py | 38 +++++++++++---
 csit.infra.dash/app/cdash/utils/constants.py    | 18 +++++++
 3 files changed, 100 insertions(+), 26 deletions(-)

diff --git a/csit.infra.dash/app/cdash/comparisons/layout.py b/csit.infra.dash/app/cdash/comparisons/layout.py
index 45bc75aca1..82b5b2d544 100644
--- a/csit.infra.dash/app/cdash/comparisons/layout.py
+++ b/csit.infra.dash/app/cdash/comparisons/layout.py
@@ -54,7 +54,8 @@ CP_PARAMS = {
     "cmp-val-opt": list(),
     "cmp-val-dis": True,
     "cmp-val-val": str(),
-    "normalize-val": list()
+    "normalize-val": list(),
+    "outliers-val": list()
 }
 
 # List of comparable parameters.
@@ -420,21 +421,33 @@ class Layout:
             )
         ]
 
-        normalize = [
+        processing = [
             dbc.Row(
                 class_name="g-0 p-1",
                 children=[
                     dbc.InputGroup(
-                        dbc.Checklist(
-                            id="normalize",
-                            options=[{
-                                "value": "normalize",
-                                "label": "Normalize to 2GHz CPU frequency"
-                            }],
-                            value=[],
-                            inline=True,
-                            class_name="ms-2"
-                        ),
+                        children = [
+                            dbc.Checklist(
+                                id="normalize",
+                                options=[{
+                                    "value": "normalize",
+                                    "label": "Normalize to 2GHz CPU frequency"
+                                }],
+                                value=[],
+                                inline=True,
+                                class_name="ms-2"
+                            ),
+                            dbc.Checklist(
+                                id="outliers",
+                                options=[{
+                                    "value": "outliers",
+                                    "label": "Remove Extreme Outliers"
+                                }],
+                                value=[],
+                                inline=True,
+                                class_name="ms-2"
+                            )
+                        ],
                         style={"align-items": "center"},
                         size="sm"
                     )
@@ -479,10 +492,10 @@ class Layout:
                 dbc.Card(
                     [
                         dbc.CardHeader(
-                            html.H5("Normalization")
+                            html.H5("Data Manipulations")
                         ),
                         dbc.CardBody(
-                            children=normalize,
+                            children=processing,
                             class_name="g-0 p-0"
                         )
                     ],
@@ -659,7 +672,8 @@ class Layout:
                 Output({"type": "ctrl-dd", "index": "cmpval"}, "options"),
                 Output({"type": "ctrl-dd", "index": "cmpval"}, "disabled"),
                 Output({"type": "ctrl-dd", "index": "cmpval"}, "value"),
-                Output("normalize", "value")
+                Output("normalize", "value"),
+                Output("outliers", "value")
             ],
             [
                 State("store-control-panel", "data"),
@@ -671,6 +685,7 @@ class Layout:
             [
                 Input("url", "href"),
                 Input("normalize", "value"),
+                Input("outliers", "value"),
                 Input({"type": "table", "index": ALL}, "filter_query"),
                 Input({"type": "ctrl-dd", "index": ALL}, "value"),
                 Input({"type": "ctrl-cl", "index": ALL}, "value"),
@@ -685,6 +700,7 @@ class Layout:
                 table_data: list,
                 href: str,
                 normalize: list,
+                outliers: bool,
                 table_filter: str,
                 *_
             ) -> tuple:
@@ -721,6 +737,10 @@ class Layout:
                     r_sel = selected["reference"]["selection"]
                     c_sel = selected["compare"]
                     normalize = literal_eval(url_params["norm"][0])
+                    try:  # Necessary for backward compatibility
+                        outliers = literal_eval(url_params["outliers"][0])
+                    except (KeyError, IndexError, AttributeError):
+                        outliers = list()
                     process_url = bool(
                         (selected["reference"]["set"] == True) and
                         (c_sel["set"] == True)
@@ -755,7 +775,8 @@ class Layout:
                                 [r_sel["infra"]]["ttype"]
                         ),
                         "ttype-val": r_sel["ttype"],
-                        "normalize-val": normalize
+                        "normalize-val": normalize,
+                        "outliers-val": outliers
                     })
                     opts = list()
                     for itm, label in CMP_PARAMS.items():
@@ -784,6 +805,9 @@ class Layout:
             elif trigger.type == "normalize":
                 ctrl_panel.set({"normalize-val": normalize})
                 on_draw = True
+            elif trigger.type == "outliers":
+                ctrl_panel.set({"outliers-val": outliers})
+                on_draw = True
             elif trigger.type == "ctrl-dd":
                 if trigger.idx == "dut":
                     try:
@@ -942,13 +966,23 @@ class Layout:
 
             if all((on_draw, selected["reference"]["set"],
                     selected["compare"]["set"], )):
-                title, table = comparison_table(self._data, selected, normalize)
+                title, table = comparison_table(
+                    data=self._data,
+                    selected=selected,
+                    normalize=normalize,
+                    format="html",
+                    remove_outliers=outliers
+                )
                 plotting_area = self._get_plotting_area(
                     title=title,
                     table=table,
                     url=gen_new_url(
                         parsed_url,
-                        params={"selected": selected, "norm": normalize}
+                        params={
+                            "selected": selected,
+                            "norm": normalize,
+                            "outliers": outliers
+                        }
                     )
                 )
                 store_table_data = table.to_dict("records")
diff --git a/csit.infra.dash/app/cdash/comparisons/tables.py b/csit.infra.dash/app/cdash/comparisons/tables.py
index ab99f1839d..0c247e87c2 100644
--- a/csit.infra.dash/app/cdash/comparisons/tables.py
+++ b/csit.infra.dash/app/cdash/comparisons/tables.py
@@ -16,17 +16,18 @@
 
 import pandas as pd
 
-from numpy import mean, std
+from numpy import mean, std, percentile
 from copy import deepcopy
 
 from ..utils.constants import Constants as C
 from ..utils.utils import relative_change_stdev
 
 
-def select_comparison_data(
+def select_comp_data(
         data: pd.DataFrame,
         selected: dict,
-        normalize: bool=False
+        normalize: bool=False,
+        remove_outliers: bool=False
     ) -> pd.DataFrame:
     """Select data for a comparison table.
 
@@ -35,9 +36,12 @@ def select_comparison_data(
         the user.
     :param normalize: If True, the data is normalized to CPU frequency
         Constants.NORM_FREQUENCY.
+    :param remove_outliers: If True the outliers are removed before
+        generating the table.
     :type data: pandas.DataFrame
     :type selected: dict
     :type normalize: bool
+    :type remove_outliers: bool
     :returns: A data frame with selected data.
     :rtype: pandas.DataFrame
     """
@@ -46,7 +50,8 @@ def select_comparison_data(
             data_in: pd.DataFrame,
             ttype: str,
             drv: str,
-            norm_factor: float
+            norm_factor: float,
+            remove_outliers: bool=False
         ) -> pd.DataFrame:
         """Calculates mean value and standard deviation for provided data.
 
@@ -54,10 +59,13 @@ def select_comparison_data(
         :param ttype: The test type.
         :param drv: The driver.
         :param norm_factor: The data normalization factor.
+        :param remove_outliers: If True the outliers are removed before
+            generating the table.
         :type data_in: pandas.DataFrame
         :type ttype: str
         :type drv: str
         :type norm_factor: float
+        :type remove_outliers: bool
         :returns: A pandas dataframe with: test name, mean value, standard
             deviation and unit.
         :rtype: pandas.DataFrame
@@ -82,6 +90,15 @@ def select_comparison_data(
                 for l_itm in l_df:
                     tmp_df.extend(l_itm)
                 l_df = tmp_df
+
+            if remove_outliers:
+                q1 = percentile(l_df, 25, method=C.COMP_PERCENTILE_METHOD)
+                q3 = percentile(l_df, 75, method=C.COMP_PERCENTILE_METHOD)
+                irq = q3 - q1
+                lif = q1 - C.COMP_OUTLIER_TYPE * irq
+                uif = q3 + C.COMP_OUTLIER_TYPE * irq
+                l_df = [i for i in l_df if i >= lif and i <= uif]
+
             try:
                 mean_val = mean(l_df)
                 std_val = std(l_df)
@@ -148,7 +165,8 @@ def select_comparison_data(
                 tmp_df,
                 itm["ttype"].lower(),
                 itm["driver"],
-                norm_factor
+                norm_factor,
+                remove_outliers=remove_outliers
             )
 
         lst_df.append(tmp_df)
@@ -171,7 +189,8 @@ def comparison_table(
         data: pd.DataFrame,
         selected: dict,
         normalize: bool,
-        format: str="html"
+        format: str="html",
+        remove_outliers: bool=False
     ) -> tuple:
     """Generate a comparison table.
 
@@ -185,10 +204,13 @@ def comparison_table(
           of the unit.
         - csv: To be downloaded as a CSV file the values are stored in base
           units.
+    :param remove_outliers: If True the outliers are removed before
+        generating the table.
     :type data: pandas.DataFrame
     :type selected: dict
     :type normalize: bool
     :type format: str
+    :type remove_outliers: bool
     :returns: A tuple with the tabe title and the comparison table.
     :rtype: tuple[str, pandas.DataFrame]
     """
@@ -245,7 +267,7 @@ def comparison_table(
     c_name = c_params["value"]
 
     # Select reference data
-    r_data = select_comparison_data(data, r_selection, normalize)
+    r_data = select_comp_data(data, r_selection, normalize, remove_outliers)
 
     # Select compare data
     c_sel = deepcopy(selected["reference"]["selection"])
@@ -255,7 +277,7 @@ def comparison_table(
         c_sel[c_params["parameter"]] = c_params["value"]
 
     c_selection = _create_selection(c_sel)
-    c_data = select_comparison_data(data, c_selection, normalize)
+    c_data = select_comp_data(data, c_selection, normalize, remove_outliers)
 
     if r_data.empty or c_data.empty:
         return str(), pd.DataFrame()
diff --git a/csit.infra.dash/app/cdash/utils/constants.py b/csit.infra.dash/app/cdash/utils/constants.py
index c86f4d5136..5ed5a8cbd7 100644
--- a/csit.infra.dash/app/cdash/utils/constants.py
+++ b/csit.infra.dash/app/cdash/utils/constants.py
@@ -358,6 +358,24 @@ class Constants:
     # Default name of downloaded file with selected data.
     COMP_DOWNLOAD_FILE_NAME = "comparison_data.csv"
 
+    # This parameter specifies the method to use for estimating the percentile.
+    # Possible values:
+    # - inverted_cdf
+    # - averaged_inverted_cdf
+    # - closest_observation
+    # - interpolated_inverted_cdf
+    # - hazen
+    # - weibull
+    # - linear (default)
+    # - median_unbiased
+    # - normal_unbiased
+    COMP_PERCENTILE_METHOD = "linear"
+
+    # Outlier fence multiplier (x IQR): 3 for extreme, 1.5 for mild outliers.
+    OUTLIER_EXTREME = 3
+    OUTLIER_MILD = 1.5
+    COMP_OUTLIER_TYPE = OUTLIER_EXTREME
+
     ############################################################################
     # Statistics.
 
-- 
2.16.6