csit.infra.dash/app/cdash/comparisons/tables.py

   1 # Copyright (c) 2024 Cisco and/or its affiliates.
   2 # Licensed under the Apache License, Version 2.0 (the "License");
   3 # you may not use this file except in compliance with the License.
   4 # You may obtain a copy of the License at:
   5 #
   6 #     http://www.apache.org/licenses/LICENSE-2.0
   7 #
   8 # Unless required by applicable law or agreed to in writing, software
   9 # distributed under the License is distributed on an "AS IS" BASIS,
  10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11 # See the License for the specific language governing permissions and
  12 # limitations under the License.
  13
  14 """The comparison tables.
  15 """
  16
  17 import pandas as pd
  18
  19 from numpy import mean, std, percentile
  20 from copy import deepcopy
  21
  22 from ..utils.constants import Constants as C
  23 from ..utils.utils import relative_change_stdev
  24
  25
  26 def select_comp_data(
  27         data: pd.DataFrame,
  28         selected: dict,
  29         normalize: bool=False,
  30         remove_outliers: bool=False,
  31         raw_data: bool=False
  32     ) -> pd.DataFrame:
  33     """Select data for a comparison table.
  34
  35     :param data: Data to be filtered for the comparison table.
  36     :param selected: A dictionary with parameters and their values selected by
  37         the user.
  38     :param normalize: If True, the data is normalized to CPU frequency
  39         Constants.NORM_FREQUENCY.
  40     :param remove_outliers: If True the outliers are removed before
  41         generating the table.
  42     :param raw_data: If True, returns data as it is in parquets without any
  43         processing. It is used for "download raw data" feature.
  44     :type data: pandas.DataFrame
  45     :type selected: dict
  46     :type normalize: bool
  47     :type remove_outliers: bool
  48     :type raw_data: bool
  49     :returns: A data frame with selected data.
  50     :rtype: pandas.DataFrame
  51     """
  52
  53     def _calculate_statistics(
  54             data_in: pd.DataFrame,
  55             ttype: str,
  56             drv: str,
  57             norm_factor: float,
  58             remove_outliers: bool=False
  59         ) -> pd.DataFrame:
  60         """Calculates mean value and standard deviation for provided data.
  61
  62         :param data_in: Input data for calculations.
  63         :param ttype: The test type.
  64         :param drv: The driver.
  65         :param norm_factor: The data normalization factor.
  66         :param remove_outliers: If True the outliers are removed before
  67             generating the table.
  68         :type data_in: pandas.DataFrame
  69         :type ttype: str
  70         :type drv: str
  71         :type norm_factor: float
  72         :type remove_outliers: bool
  73         :returns: A pandas dataframe with: test name, mean value, standard
  74             deviation and unit.
  75         :rtype: pandas.DataFrame
  76         """
  77         d_data = {
  78             "name": list(),
  79             "mean": list(),
  80             "stdev": list(),
  81             "unit": list()
  82         }
  83         for itm in data_in["test_id"].unique().tolist():
  84             itm_lst = itm.split(".")
  85             test = itm_lst[-1].rsplit("-", 1)[0]
  86             if "hoststack" in itm:
  87                 test_type = f"hoststack-{ttype}"
  88             else:
  89                 test_type = ttype
  90             df = data_in.loc[(data_in["test_id"] == itm)]
  91             l_df = df[C.VALUE_ITER[test_type]].to_list()
  92             if len(l_df) and isinstance(l_df[0], list):
  93                 tmp_df = list()
  94                 for l_itm in l_df:
  95                     tmp_df.extend(l_itm)
  96                 l_df = tmp_df
  97
  98             try:
  99                 if remove_outliers:
 100                     q1 = percentile(l_df, 25, method=C.COMP_PERCENTILE_METHOD)
 101                     q3 = percentile(l_df, 75, method=C.COMP_PERCENTILE_METHOD)
 102                     irq = q3 - q1
 103                     lif = q1 - C.COMP_OUTLIER_TYPE * irq
 104                     uif = q3 + C.COMP_OUTLIER_TYPE * irq
 105                     l_df = [i for i in l_df if i >= lif and i <= uif]
 106                 mean_val = mean(l_df)
 107                 std_val = std(l_df)
 108             except (TypeError, ValueError):
 109                 continue
 110             d_data["name"].append(f"{test.replace(f'{drv}-', '')}-{ttype}")
 111             d_data["mean"].append(int(mean_val * norm_factor))
 112             d_data["stdev"].append(int(std_val * norm_factor))
 113             d_data["unit"].append(df[C.UNIT[test_type]].to_list()[0])
 114         return pd.DataFrame(d_data)
 115
 116     lst_df = list()
 117     for itm in selected:
 118         if itm["ttype"] in ("NDR", "PDR", "Latency"):
 119             test_type = "ndrpdr"
 120         elif itm["ttype"] in ("CPS", "RPS", "BPS"):
 121             test_type  = "hoststack"
 122         else:
 123             test_type = itm["ttype"].lower()
 124
 125         dutver = itm["dutver"].split("-", 1)  # 0 -> release, 1 -> dut version
 126         tmp_df = pd.DataFrame(data.loc[(
 127             (data["passed"] == True) &
 128             (data["dut_type"] == itm["dut"]) &
 129             (data["dut_version"] == dutver[1]) &
 130             (data["test_type"] == test_type) &
 131             (data["release"] == dutver[0])
 132         )])
 133
 134         drv = "" if itm["driver"] == "dpdk" else itm["driver"].replace("_", "-")
 135         core = str() if itm["dut"] == "trex" else itm["core"].lower()
 136         ttype = "ndrpdr" if itm["ttype"] in ("NDR", "PDR", "Latency") \
 137             else itm["ttype"].lower()
 138         tmp_df = tmp_df[
 139             (tmp_df.job.str.endswith(itm["tbed"])) &
 140             (tmp_df.test_id.str.contains(
 141                 (
 142                     f"^.*[.|-]{itm['nic']}.*{itm['frmsize'].lower()}-"
 143                     f"{core}-{drv}.*-{ttype}$"
 144                 ),
 145                 regex=True
 146             ))
 147         ]
 148         if itm["driver"] == "dpdk":
 149             for drv in C.DRIVERS:
 150                 tmp_df.drop(
 151                     tmp_df[tmp_df.test_id.str.contains(f"-{drv}-")].index,
 152                     inplace=True
 153                 )
 154
 155         # Change the data type from ndrpdr to one of ("NDR", "PDR", "Latency")
 156         if test_type == "ndrpdr":
 157             tmp_df = tmp_df.assign(test_type=itm["ttype"].lower())
 158
 159         if not tmp_df.empty:
 160             if normalize:
 161                 if itm["ttype"] == "Latency":
 162                     norm_factor = C.FREQUENCY[itm["tbed"]] / C.NORM_FREQUENCY
 163                 else:
 164                     norm_factor = C.NORM_FREQUENCY / C.FREQUENCY[itm["tbed"]]
 165             else:
 166                 norm_factor = 1.0
 167             if not raw_data:
 168                 tmp_df = _calculate_statistics(
 169                     tmp_df,
 170                     itm["ttype"].lower(),
 171                     itm["driver"],
 172                     norm_factor,
 173                     remove_outliers=remove_outliers
 174                 )
 175
 176         lst_df.append(tmp_df)
 177
 178     if len(lst_df) == 1:
 179         df = lst_df[0]
 180     elif len(lst_df) > 1:
 181         df = pd.concat(
 182             lst_df,
 183             ignore_index=True,
 184             copy=False
 185         )
 186     else:
 187         df = pd.DataFrame()
 188
 189     return df
 190
 191
 192 def comparison_table(
 193         data: pd.DataFrame,
 194         selected: dict,
 195         normalize: bool,
 196         format: str="html",
 197         remove_outliers: bool=False,
 198         raw_data: bool=False
 199     ) -> tuple:
 200     """Generate a comparison table.
 201
 202     :param data: Iterative data for the comparison table.
 203     :param selected: A dictionary with parameters and their values selected by
 204         the user.
 205     :param normalize: If True, the data is normalized to CPU frequency
 206         Constants.NORM_FREQUENCY.
 207     :param format: The output format of the table:
 208         - html: To be displayed on html page, the values are shown in millions
 209           of the unit.
 210         - csv: To be downloaded as a CSV file the values are stored in base
 211           units.
 212     :param remove_outliers: If True the outliers are removed before
 213         generating the table.
 214     :param raw_data: If True, returns data as it is in parquets without any
 215         processing. It is used for "download raw data" feature.
 216     :type data: pandas.DataFrame
 217     :type selected: dict
 218     :type normalize: bool
 219     :type format: str
 220     :type remove_outliers: bool
 221     :type raw_data: bool
 222     :returns: A tuple with the tabe title and the comparison table.
 223     :rtype: tuple[str, pandas.DataFrame]
 224     """
 225
 226     def _create_selection(sel: dict) -> list:
 227         """Transform the complex dictionary with user selection to list
 228             of simple items.
 229
 230         :param sel: A complex dictionary with user selection.
 231         :type sel: dict
 232         :returns: A list of simple items.
 233         :rtype: list
 234         """
 235         l_infra = sel["infra"].split("-")
 236         selection = list()
 237         for core in sel["core"]:
 238             for fsize in sel["frmsize"]:
 239                 for ttype in sel["ttype"]:
 240                     selection.append({
 241                         "dut": sel["dut"],
 242                         "dutver": sel["dutver"],
 243                         "tbed": f"{l_infra[0]}-{l_infra[1]}",
 244                         "nic": l_infra[2],
 245                         "driver": l_infra[-1].replace("_", "-"),
 246                         "core": core,
 247                         "frmsize": fsize,
 248                         "ttype": ttype
 249                     })
 250         return selection
 251
 252     # Select reference data
 253     r_sel = deepcopy(selected["reference"]["selection"])
 254     r_selection = _create_selection(r_sel)
 255     r_data = select_comp_data(
 256         data, r_selection, normalize, remove_outliers, raw_data
 257     )
 258
 259     # Select compare data
 260     c_sel = deepcopy(selected["reference"]["selection"])
 261     c_params = selected["compare"]
 262     if c_params["parameter"] in ("core", "frmsize", "ttype"):
 263         c_sel[c_params["parameter"]] = [c_params["value"], ]
 264     else:
 265         c_sel[c_params["parameter"]] = c_params["value"]
 266     c_selection = _create_selection(c_sel)
 267     c_data = select_comp_data(
 268         data, c_selection, normalize, remove_outliers, raw_data
 269     )
 270
 271     if raw_data:
 272         r_data["ref/cmp"] = "reference"
 273         c_data["ref/cmp"] = "compare"
 274         return str(), pd.concat([r_data, c_data], ignore_index=True, copy=False)
 275
 276     if r_data.empty or c_data.empty:
 277         return str(), pd.DataFrame()
 278
 279     if format == "html" and "Latency" not in r_sel["ttype"]:
 280         unit_factor, s_unit_factor = (1e6, "M")
 281     else:
 282         unit_factor, s_unit_factor = (1, str())
 283
 284     # Create Table title and titles of columns with data
 285     params = list(r_sel)
 286     params.remove(c_params["parameter"])
 287     lst_title = list()
 288     for param in params:
 289         value = r_sel[param]
 290         if isinstance(value, list):
 291             lst_title.append("|".join(value))
 292         else:
 293             lst_title.append(value)
 294     title = "Comparison for: " + "-".join(lst_title)
 295     r_name = r_sel[c_params["parameter"]]
 296     if isinstance(r_name, list):
 297         r_name = "|".join(r_name)
 298     c_name = c_params["value"]
 299
 300     l_name, l_r_mean, l_r_std, l_c_mean, l_c_std, l_rc_mean, l_rc_std, unit = \
 301         list(), list(), list(), list(), list(), list(), list(), set()
 302     for _, row in r_data.iterrows():
 303         if c_params["parameter"] in ("core", "frmsize", "ttype"):
 304             l_cmp = row["name"].split("-")
 305             if c_params["parameter"] == "core":
 306                 c_row = c_data[
 307                     (c_data.name.str.contains(l_cmp[0])) &
 308                     (c_data.name.str.contains("-".join(l_cmp[2:])))
 309                 ]
 310             elif c_params["parameter"] == "frmsize":
 311                 c_row = c_data[c_data.name.str.contains("-".join(l_cmp[1:]))]
 312             elif c_params["parameter"] == "ttype":
 313                 regex = r"^" + f"{'-'.join(l_cmp[:-1])}" + r"-.{3}$"
 314                 c_row = c_data[c_data.name.str.contains(regex, regex=True)]
 315         else:
 316             c_row = c_data[c_data["name"] == row["name"]]
 317         if not c_row.empty:
 318             unit.add(f"{s_unit_factor}{row['unit']}")
 319             r_mean = row["mean"]
 320             r_std = row["stdev"]
 321             c_mean = c_row["mean"].values[0]
 322             c_std = c_row["stdev"].values[0]
 323             l_name.append(row["name"])
 324             l_r_mean.append(r_mean / unit_factor)
 325             l_r_std.append(r_std / unit_factor)
 326             l_c_mean.append(c_mean / unit_factor)
 327             l_c_std.append(c_std / unit_factor)
 328             delta, d_stdev = relative_change_stdev(r_mean, c_mean, r_std, c_std)
 329             l_rc_mean.append(delta)
 330             l_rc_std.append(d_stdev)
 331
 332     s_unit = "|".join(unit)
 333     df_cmp = pd.DataFrame.from_dict({
 334         "Test Name": l_name,
 335         f"{r_name} Mean [{s_unit}]": l_r_mean,
 336         f"{r_name} Stdev [{s_unit}]": l_r_std,
 337         f"{c_name} Mean [{s_unit}]": l_c_mean,
 338         f"{c_name} Stdev [{s_unit}]": l_c_std,
 339         "Relative Change Mean [%]": l_rc_mean,
 340         "Relative Change Stdev [%]": l_rc_std
 341     })
 342     df_cmp.sort_values(
 343         by="Relative Change Mean [%]",
 344         ascending=False,
 345         inplace=True
 346     )
 347
 348     return (title, df_cmp)