From 157aa10abdfb9c53f8e687da97ac72e9a724b457 Mon Sep 17 00:00:00 2001 From: Tibor Frank Date: Thu, 7 Mar 2024 12:00:04 +0000 Subject: [PATCH] C-Dash: Add possibility to remove outliers from comparison data - extreme outliers only for now Change-Id: I2be7c291dd207d5a557947415a960fa6d0067c64 Signed-off-by: Tibor Frank --- csit.infra.dash/app/cdash/comparisons/layout.py | 70 ++++++++++++++++++------- csit.infra.dash/app/cdash/comparisons/tables.py | 38 +++++++++++--- csit.infra.dash/app/cdash/utils/constants.py | 18 +++++++ 3 files changed, 100 insertions(+), 26 deletions(-) diff --git a/csit.infra.dash/app/cdash/comparisons/layout.py b/csit.infra.dash/app/cdash/comparisons/layout.py index 45bc75aca1..82b5b2d544 100644 --- a/csit.infra.dash/app/cdash/comparisons/layout.py +++ b/csit.infra.dash/app/cdash/comparisons/layout.py @@ -54,7 +54,8 @@ CP_PARAMS = { "cmp-val-opt": list(), "cmp-val-dis": True, "cmp-val-val": str(), - "normalize-val": list() + "normalize-val": list(), + "outliers-val": list() } # List of comparable parameters. 
@@ -420,21 +421,33 @@ class Layout: ) ] - normalize = [ + processing = [ dbc.Row( class_name="g-0 p-1", children=[ dbc.InputGroup( - dbc.Checklist( - id="normalize", - options=[{ - "value": "normalize", - "label": "Normalize to 2GHz CPU frequency" - }], - value=[], - inline=True, - class_name="ms-2" - ), + children = [ + dbc.Checklist( + id="normalize", + options=[{ + "value": "normalize", + "label": "Normalize to 2GHz CPU frequency" + }], + value=[], + inline=True, + class_name="ms-2" + ), + dbc.Checklist( + id="outliers", + options=[{ + "value": "outliers", + "label": "Remove Extreme Outliers" + }], + value=[], + inline=True, + class_name="ms-2" + ) + ], style={"align-items": "center"}, size="sm" ) @@ -479,10 +492,10 @@ class Layout: dbc.Card( [ dbc.CardHeader( - html.H5("Normalization") + html.H5("Data Manipulations") ), dbc.CardBody( - children=normalize, + children=processing, class_name="g-0 p-0" ) ], @@ -659,7 +672,8 @@ class Layout: Output({"type": "ctrl-dd", "index": "cmpval"}, "options"), Output({"type": "ctrl-dd", "index": "cmpval"}, "disabled"), Output({"type": "ctrl-dd", "index": "cmpval"}, "value"), - Output("normalize", "value") + Output("normalize", "value"), + Output("outliers", "value") ], [ State("store-control-panel", "data"), @@ -671,6 +685,7 @@ class Layout: [ Input("url", "href"), Input("normalize", "value"), + Input("outliers", "value"), Input({"type": "table", "index": ALL}, "filter_query"), Input({"type": "ctrl-dd", "index": ALL}, "value"), Input({"type": "ctrl-cl", "index": ALL}, "value"), @@ -685,6 +700,7 @@ class Layout: table_data: list, href: str, normalize: list, + outliers: bool, table_filter: str, *_ ) -> tuple: @@ -721,6 +737,10 @@ class Layout: r_sel = selected["reference"]["selection"] c_sel = selected["compare"] normalize = literal_eval(url_params["norm"][0]) + try: # Necessary for backward compatibility + outliers = literal_eval(url_params["outliers"][0]) + except (KeyError, IndexError, AttributeError): + outliers = list() 
process_url = bool( (selected["reference"]["set"] == True) and (c_sel["set"] == True) @@ -755,7 +775,8 @@ class Layout: [r_sel["infra"]]["ttype"] ), "ttype-val": r_sel["ttype"], - "normalize-val": normalize + "normalize-val": normalize, + "outliers-val": outliers }) opts = list() for itm, label in CMP_PARAMS.items(): @@ -784,6 +805,9 @@ class Layout: elif trigger.type == "normalize": ctrl_panel.set({"normalize-val": normalize}) on_draw = True + elif trigger.type == "outliers": + ctrl_panel.set({"outliers-val": outliers}) + on_draw = True elif trigger.type == "ctrl-dd": if trigger.idx == "dut": try: @@ -942,13 +966,23 @@ class Layout: if all((on_draw, selected["reference"]["set"], selected["compare"]["set"], )): - title, table = comparison_table(self._data, selected, normalize) + title, table = comparison_table( + data=self._data, + selected=selected, + normalize=normalize, + format="html", + remove_outliers=outliers + ) plotting_area = self._get_plotting_area( title=title, table=table, url=gen_new_url( parsed_url, - params={"selected": selected, "norm": normalize} + params={ + "selected": selected, + "norm": normalize, + "outliers": outliers + } ) ) store_table_data = table.to_dict("records") diff --git a/csit.infra.dash/app/cdash/comparisons/tables.py b/csit.infra.dash/app/cdash/comparisons/tables.py index ab99f1839d..0c247e87c2 100644 --- a/csit.infra.dash/app/cdash/comparisons/tables.py +++ b/csit.infra.dash/app/cdash/comparisons/tables.py @@ -16,17 +16,18 @@ import pandas as pd -from numpy import mean, std +from numpy import mean, std, percentile from copy import deepcopy from ..utils.constants import Constants as C from ..utils.utils import relative_change_stdev -def select_comparison_data( +def select_comp_data( data: pd.DataFrame, selected: dict, - normalize: bool=False + normalize: bool=False, + remove_outliers: bool=False ) -> pd.DataFrame: """Select data for a comparison table. @@ -35,9 +36,12 @@ def select_comparison_data( the user. 
:param normalize: If True, the data is normalized to CPU frequency Constants.NORM_FREQUENCY. + :param remove_outliers: If True the outliers are removed before + generating the table. :type data: pandas.DataFrame :type selected: dict :type normalize: bool + :type remove_outliers: bool :returns: A data frame with selected data. :rtype: pandas.DataFrame """ @@ -46,7 +50,8 @@ def select_comparison_data( data_in: pd.DataFrame, ttype: str, drv: str, - norm_factor: float + norm_factor: float, + remove_outliers: bool=False ) -> pd.DataFrame: """Calculates mean value and standard deviation for provided data. @@ -54,10 +59,13 @@ def select_comparison_data( :param ttype: The test type. :param drv: The driver. :param norm_factor: The data normalization factor. + :param remove_outliers: If True the outliers are removed before + generating the table. :type data_in: pandas.DataFrame :type ttype: str :type drv: str :type norm_factor: float + :type remove_outliers: bool :returns: A pandas dataframe with: test name, mean value, standard deviation and unit. :rtype: pandas.DataFrame @@ -82,6 +90,15 @@ def select_comparison_data( for l_itm in l_df: tmp_df.extend(l_itm) l_df = tmp_df + + if remove_outliers: + q1 = percentile(l_df, 25, method=C.COMP_PERCENTILE_METHOD) + q3 = percentile(l_df, 75, method=C.COMP_PERCENTILE_METHOD) + irq = q3 - q1 + lif = q1 - C.COMP_OUTLIER_TYPE * irq + uif = q3 + C.COMP_OUTLIER_TYPE * irq + l_df = [i for i in l_df if i >= lif and i <= uif] + try: mean_val = mean(l_df) std_val = std(l_df) @@ -148,7 +165,8 @@ def select_comparison_data( tmp_df, itm["ttype"].lower(), itm["driver"], - norm_factor + norm_factor, + remove_outliers=remove_outliers ) lst_df.append(tmp_df) @@ -171,7 +189,8 @@ def comparison_table( data: pd.DataFrame, selected: dict, normalize: bool, - format: str="html" + format: str="html", + remove_outliers: bool=False ) -> tuple: """Generate a comparison table. @@ -185,10 +204,13 @@ def comparison_table( of the unit. 
- csv: To be downloaded as a CSV file the values are stored in base units. + :param remove_outliers: If True the outliers are removed before + generating the table. :type data: pandas.DataFrame :type selected: dict :type normalize: bool :type format: str + :type remove_outliers: bool :returns: A tuple with the tabe title and the comparison table. :rtype: tuple[str, pandas.DataFrame] """ @@ -245,7 +267,7 @@ def comparison_table( c_name = c_params["value"] # Select reference data - r_data = select_comparison_data(data, r_selection, normalize) + r_data = select_comp_data(data, r_selection, normalize, remove_outliers) # Select compare data c_sel = deepcopy(selected["reference"]["selection"]) @@ -255,7 +277,7 @@ def comparison_table( c_sel[c_params["parameter"]] = c_params["value"] c_selection = _create_selection(c_sel) - c_data = select_comparison_data(data, c_selection, normalize) + c_data = select_comp_data(data, c_selection, normalize, remove_outliers) if r_data.empty or c_data.empty: return str(), pd.DataFrame() diff --git a/csit.infra.dash/app/cdash/utils/constants.py b/csit.infra.dash/app/cdash/utils/constants.py index c86f4d5136..5ed5a8cbd7 100644 --- a/csit.infra.dash/app/cdash/utils/constants.py +++ b/csit.infra.dash/app/cdash/utils/constants.py @@ -358,6 +358,24 @@ class Constants: # Default name of downloaded file with selected data. COMP_DOWNLOAD_FILE_NAME = "comparison_data.csv" + # This parameter specifies the method to use for estimating the percentile. + # Possible values: + # - inverted_cdf + # - averaged_inverted_cdf + # - closest_observation + # - interpolated_inverted_cdf + # - hazen + # - weibull + # - linear (default) + # - median_unbiased + # - normal_unbiased + COMP_PERCENTILE_METHOD = "linear" + + # Extreme or mild outlier? + OUTLIER_EXTREME = 3 + OUTLIER_MILD = 1.5 + COMP_OUTLIER_TYPE = OUTLIER_EXTREME + ############################################################################ # Statistics. -- 2.16.6