csit.infra.dash/app/cdash/comparisons/tables.py

   1 # Copyright (c) 2024 Cisco and/or its affiliates.
   2 # Licensed under the Apache License, Version 2.0 (the "License");
   3 # you may not use this file except in compliance with the License.
   4 # You may obtain a copy of the License at:
   5 #
   6 #     http://www.apache.org/licenses/LICENSE-2.0
   7 #
   8 # Unless required by applicable law or agreed to in writing, software
   9 # distributed under the License is distributed on an "AS IS" BASIS,
  10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11 # See the License for the specific language governing permissions and
  12 # limitations under the License.
  13
  14 """The comparison tables.
  15 """
  16
  17 import pandas as pd
  18
  19 from numpy import mean, std, percentile
  20 from copy import deepcopy
  21
  22 from ..utils.constants import Constants as C
  23 from ..utils.utils import relative_change_stdev
  24
  25
  26 def select_comp_data(
  27         data: pd.DataFrame,
  28         selected: dict,
  29         normalize: bool=False,
  30         remove_outliers: bool=False,
  31         raw_data: bool=False
  32     ) -> pd.DataFrame:
  33     """Select data for a comparison table.
  34
  35     :param data: Data to be filtered for the comparison table.
  36     :param selected: A dictionary with parameters and their values selected by
  37         the user.
  38     :param normalize: If True, the data is normalized to CPU frequency
  39         Constants.NORM_FREQUENCY.
  40     :param remove_outliers: If True the outliers are removed before
  41         generating the table.
  42     :param raw_data: If True, returns data as it is in parquets without any
  43         processing. It is used for "download raw data" feature.
  44     :type data: pandas.DataFrame
  45     :type selected: dict
  46     :type normalize: bool
  47     :type remove_outliers: bool
  48     :type raw_data: bool
  49     :returns: A data frame with selected data.
  50     :rtype: pandas.DataFrame
  51     """
  52
  53     def _calculate_statistics(
  54             data_in: pd.DataFrame,
  55             ttype: str,
  56             drv: str,
  57             norm_factor: float,
  58             remove_outliers: bool=False
  59         ) -> pd.DataFrame:
  60         """Calculates mean value and standard deviation for provided data.
  61
  62         :param data_in: Input data for calculations.
  63         :param ttype: The test type.
  64         :param drv: The driver.
  65         :param norm_factor: The data normalization factor.
  66         :param remove_outliers: If True the outliers are removed before
  67             generating the table.
  68         :type data_in: pandas.DataFrame
  69         :type ttype: str
  70         :type drv: str
  71         :type norm_factor: float
  72         :type remove_outliers: bool
  73         :returns: A pandas dataframe with: test name, mean value, standard
  74             deviation and unit.
  75         :rtype: pandas.DataFrame
  76         """
  77         d_data = {
  78             "name": list(),
  79             "mean": list(),
  80             "stdev": list(),
  81             "unit": list()
  82         }
  83         for itm in data_in["test_id"].unique().tolist():
  84             itm_lst = itm.split(".")
  85             test = itm_lst[-1].rsplit("-", 1)[0]
  86             if "hoststack" in itm:
  87                 test_type = f"hoststack-{ttype}"
  88             else:
  89                 test_type = ttype
  90             df = data_in.loc[(data_in["test_id"] == itm)]
  91             l_df = df[C.VALUE_ITER[test_type]].to_list()
  92             if len(l_df) and isinstance(l_df[0], list):
  93                 tmp_df = list()
  94                 for l_itm in l_df:
  95                     tmp_df.extend(l_itm)
  96                 l_df = tmp_df
  97
  98             if remove_outliers:
  99                 q1 = percentile(l_df, 25, method=C.COMP_PERCENTILE_METHOD)
 100                 q3 = percentile(l_df, 75, method=C.COMP_PERCENTILE_METHOD)
 101                 irq = q3 - q1
 102                 lif = q1 - C.COMP_OUTLIER_TYPE * irq
 103                 uif = q3 + C.COMP_OUTLIER_TYPE * irq
 104                 l_df = [i for i in l_df if i >= lif and i <= uif]
 105
 106             try:
 107                 mean_val = mean(l_df)
 108                 std_val = std(l_df)
 109             except (TypeError, ValueError):
 110                 continue
 111             d_data["name"].append(f"{test.replace(f'{drv}-', '')}-{ttype}")
 112             d_data["mean"].append(int(mean_val * norm_factor))
 113             d_data["stdev"].append(int(std_val * norm_factor))
 114             d_data["unit"].append(df[C.UNIT[test_type]].to_list()[0])
 115         return pd.DataFrame(d_data)
 116
 117     lst_df = list()
 118     for itm in selected:
 119         if itm["ttype"] in ("NDR", "PDR", "Latency"):
 120             test_type = "ndrpdr"
 121         elif itm["ttype"] in ("CPS", "RPS", "BPS"):
 122             test_type  = "hoststack"
 123         else:
 124             test_type = itm["ttype"].lower()
 125
 126         dutver = itm["dutver"].split("-", 1)  # 0 -> release, 1 -> dut version
 127         tmp_df = pd.DataFrame(data.loc[(
 128             (data["passed"] == True) &
 129             (data["dut_type"] == itm["dut"]) &
 130             (data["dut_version"] == dutver[1]) &
 131             (data["test_type"] == test_type) &
 132             (data["release"] == dutver[0])
 133         )])
 134
 135         drv = "" if itm["driver"] == "dpdk" else itm["driver"].replace("_", "-")
 136         core = str() if itm["dut"] == "trex" else itm["core"].lower()
 137         ttype = "ndrpdr" if itm["ttype"] in ("NDR", "PDR", "Latency") \
 138             else itm["ttype"].lower()
 139         tmp_df = tmp_df[
 140             (tmp_df.job.str.endswith(itm["tbed"])) &
 141             (tmp_df.test_id.str.contains(
 142                 (
 143                     f"^.*[.|-]{itm['nic']}.*{itm['frmsize'].lower()}-"
 144                     f"{core}-{drv}.*-{ttype}$"
 145                 ),
 146                 regex=True
 147             ))
 148         ]
 149         if itm["driver"] == "dpdk":
 150             for drv in C.DRIVERS:
 151                 tmp_df.drop(
 152                     tmp_df[tmp_df.test_id.str.contains(f"-{drv}-")].index,
 153                     inplace=True
 154                 )
 155
 156         # Change the data type from ndrpdr to one of ("NDR", "PDR", "Latency")
 157         if test_type == "ndrpdr":
 158             tmp_df = tmp_df.assign(test_type=itm["ttype"].lower())
 159
 160         if not tmp_df.empty:
 161             if normalize:
 162                 if itm["ttype"] == "Latency":
 163                     norm_factor = C.FREQUENCY[itm["tbed"]] / C.NORM_FREQUENCY
 164                 else:
 165                     norm_factor = C.NORM_FREQUENCY / C.FREQUENCY[itm["tbed"]]
 166             else:
 167                 norm_factor = 1.0
 168             if not raw_data:
 169                 tmp_df = _calculate_statistics(
 170                     tmp_df,
 171                     itm["ttype"].lower(),
 172                     itm["driver"],
 173                     norm_factor,
 174                     remove_outliers=remove_outliers
 175                 )
 176
 177         lst_df.append(tmp_df)
 178
 179     if len(lst_df) == 1:
 180         df = lst_df[0]
 181     elif len(lst_df) > 1:
 182         df = pd.concat(
 183             lst_df,
 184             ignore_index=True,
 185             copy=False
 186         )
 187     else:
 188         df = pd.DataFrame()
 189
 190     return df
 191
 192
 193 def comparison_table(
 194         data: pd.DataFrame,
 195         selected: dict,
 196         normalize: bool,
 197         format: str="html",
 198         remove_outliers: bool=False,
 199         raw_data: bool=False
 200     ) -> tuple:
 201     """Generate a comparison table.
 202
 203     :param data: Iterative data for the comparison table.
 204     :param selected: A dictionary with parameters and their values selected by
 205         the user.
 206     :param normalize: If True, the data is normalized to CPU frequency
 207         Constants.NORM_FREQUENCY.
 208     :param format: The output format of the table:
 209         - html: To be displayed on html page, the values are shown in millions
 210           of the unit.
 211         - csv: To be downloaded as a CSV file the values are stored in base
 212           units.
 213     :param remove_outliers: If True the outliers are removed before
 214         generating the table.
 215     :param raw_data: If True, returns data as it is in parquets without any
 216         processing. It is used for "download raw data" feature.
 217     :type data: pandas.DataFrame
 218     :type selected: dict
 219     :type normalize: bool
 220     :type format: str
 221     :type remove_outliers: bool
 222     :type raw_data: bool
 223     :returns: A tuple with the tabe title and the comparison table.
 224     :rtype: tuple[str, pandas.DataFrame]
 225     """
 226
 227     def _create_selection(sel: dict) -> list:
 228         """Transform the complex dictionary with user selection to list
 229             of simple items.
 230
 231         :param sel: A complex dictionary with user selection.
 232         :type sel: dict
 233         :returns: A list of simple items.
 234         :rtype: list
 235         """
 236         l_infra = sel["infra"].split("-")
 237         selection = list()
 238         for core in sel["core"]:
 239             for fsize in sel["frmsize"]:
 240                 for ttype in sel["ttype"]:
 241                     selection.append({
 242                         "dut": sel["dut"],
 243                         "dutver": sel["dutver"],
 244                         "tbed": f"{l_infra[0]}-{l_infra[1]}",
 245                         "nic": l_infra[2],
 246                         "driver": l_infra[-1].replace("_", "-"),
 247                         "core": core,
 248                         "frmsize": fsize,
 249                         "ttype": ttype
 250                     })
 251         return selection
 252
 253     # Select reference data
 254     r_sel = deepcopy(selected["reference"]["selection"])
 255     r_selection = _create_selection(r_sel)
 256     r_data = select_comp_data(
 257         data, r_selection, normalize, remove_outliers, raw_data
 258     )
 259
 260     # Select compare data
 261     c_sel = deepcopy(selected["reference"]["selection"])
 262     c_params = selected["compare"]
 263     if c_params["parameter"] in ("core", "frmsize", "ttype"):
 264         c_sel[c_params["parameter"]] = [c_params["value"], ]
 265     else:
 266         c_sel[c_params["parameter"]] = c_params["value"]
 267     c_selection = _create_selection(c_sel)
 268     c_data = select_comp_data(
 269         data, c_selection, normalize, remove_outliers, raw_data
 270     )
 271
 272     if raw_data:
 273         r_data["ref/cmp"] = "reference"
 274         c_data["ref/cmp"] = "compare"
 275         return str(), pd.concat([r_data, c_data], ignore_index=True, copy=False)
 276
 277     if r_data.empty or c_data.empty:
 278         return str(), pd.DataFrame()
 279
 280     if format == "html" and "Latency" not in r_sel["ttype"]:
 281         unit_factor, s_unit_factor = (1e6, "M")
 282     else:
 283         unit_factor, s_unit_factor = (1, str())
 284
 285     # Create Table title and titles of columns with data
 286     params = list(r_sel)
 287     params.remove(c_params["parameter"])
 288     lst_title = list()
 289     for param in params:
 290         value = r_sel[param]
 291         if isinstance(value, list):
 292             lst_title.append("|".join(value))
 293         else:
 294             lst_title.append(value)
 295     title = "Comparison for: " + "-".join(lst_title)
 296     r_name = r_sel[c_params["parameter"]]
 297     if isinstance(r_name, list):
 298         r_name = "|".join(r_name)
 299     c_name = c_params["value"]
 300
 301     l_name, l_r_mean, l_r_std, l_c_mean, l_c_std, l_rc_mean, l_rc_std, unit = \
 302         list(), list(), list(), list(), list(), list(), list(), set()
 303     for _, row in r_data.iterrows():
 304         if c_params["parameter"] in ("core", "frmsize", "ttype"):
 305             l_cmp = row["name"].split("-")
 306             if c_params["parameter"] == "core":
 307                 c_row = c_data[
 308                     (c_data.name.str.contains(l_cmp[0])) &
 309                     (c_data.name.str.contains("-".join(l_cmp[2:])))
 310                 ]
 311             elif c_params["parameter"] == "frmsize":
 312                 c_row = c_data[c_data.name.str.contains("-".join(l_cmp[1:]))]
 313             elif c_params["parameter"] == "ttype":
 314                 regex = r"^" + f"{'-'.join(l_cmp[:-1])}" + r"-.{3}$"
 315                 c_row = c_data[c_data.name.str.contains(regex, regex=True)]
 316         else:
 317             c_row = c_data[c_data["name"] == row["name"]]
 318         if not c_row.empty:
 319             unit.add(f"{s_unit_factor}{row['unit']}")
 320             r_mean = row["mean"]
 321             r_std = row["stdev"]
 322             c_mean = c_row["mean"].values[0]
 323             c_std = c_row["stdev"].values[0]
 324             l_name.append(row["name"])
 325             l_r_mean.append(r_mean / unit_factor)
 326             l_r_std.append(r_std / unit_factor)
 327             l_c_mean.append(c_mean / unit_factor)
 328             l_c_std.append(c_std / unit_factor)
 329             delta, d_stdev = relative_change_stdev(r_mean, c_mean, r_std, c_std)
 330             l_rc_mean.append(delta)
 331             l_rc_std.append(d_stdev)
 332
 333     s_unit = "|".join(unit)
 334     df_cmp = pd.DataFrame.from_dict({
 335         "Test Name": l_name,
 336         f"{r_name} Mean [{s_unit}]": l_r_mean,
 337         f"{r_name} Stdev [{s_unit}]": l_r_std,
 338         f"{c_name} Mean [{s_unit}]": l_c_mean,
 339         f"{c_name} Stdev [{s_unit}]": l_c_std,
 340         "Relative Change Mean [%]": l_rc_mean,
 341         "Relative Change Stdev [%]": l_rc_std
 342     })
 343     df_cmp.sort_values(
 344         by="Relative Change Mean [%]",
 345         ascending=False,
 346         inplace=True
 347     )
 348
 349     return (title, df_cmp)