CSIT-1504: Soak tests - box plots

[csit.git] / resources / tools / presentation / utils.py
diff --git a/resources/tools/presentation/utils.py b/resources/tools/presentation/utils.py

index ba32932..c350fae 100644 (file)
--- a/resources/tools/presentation/utils.py
+++ b/resources/tools/presentation/utils.py
@@ -17,7 +17,6 @@
  import multiprocessing
  import subprocess
  import numpy as np
  import multiprocessing
  import subprocess
  import numpy as np
-import pandas as pd
  import logging
  import csv
  import prettytable
  import logging
  import csv
  import prettytable
@@ -25,9 +24,11 @@ import prettytable
  from os import walk, makedirs, environ
  from os.path import join, isdir
  from shutil import move, Error
  from os import walk, makedirs, environ
  from os.path import join, isdir
  from shutil import move, Error
-from math import sqrt
+from datetime import datetime
+from pandas import Series
  
  from errors import PresentationError
  
  from errors import PresentationError
+from jumpavg.BitCountingClassifier import BitCountingClassifier
  
  
  def mean(items):
  
  
  def mean(items):
@@ -50,11 +51,7 @@ def stdev(items):
      :returns: Stdev.
      :rtype: float
      """
      :returns: Stdev.
      :rtype: float
      """
-
-    avg = mean(items)
-    variance = [(x - avg) ** 2 for x in items]
-    stddev = sqrt(mean(variance))
-    return stddev
+    return Series.std(Series(items))
  
  
  def relative_change(nr1, nr2):
  
  
  def relative_change(nr1, nr2):
@@ -71,73 +68,6 @@ def relative_change(nr1, nr2):
      return float(((nr2 - nr1) / nr1) * 100)
  
  
      return float(((nr2 - nr1) / nr1) * 100)
  
  
-def remove_outliers(input_list, outlier_const=1.5, window=14):
-    """Return list with outliers removed, using split_outliers.
-
-    :param input_list: Data from which the outliers will be removed.
-    :param outlier_const: Outlier constant.
-    :param window: How many preceding values to take into account.
-    :type input_list: list of floats
-    :type outlier_const: float
-    :type window: int
-    :returns: The input list without outliers.
-    :rtype: list of floats
-    """
-
-    data = np.array(input_list)
-    upper_quartile = np.percentile(data, 75)
-    lower_quartile = np.percentile(data, 25)
-    iqr = (upper_quartile - lower_quartile) * outlier_const
-    quartile_set = (lower_quartile - iqr, upper_quartile + iqr)
-    result_lst = list()
-    for y in input_list:
-        if quartile_set[0] <= y <= quartile_set[1]:
-            result_lst.append(y)
-    return result_lst
-
-
-def split_outliers(input_series, outlier_const=1.5, window=14):
-    """Go through the input data and generate two pandas series:
-    - input data with outliers replaced by NAN
-    - outliers.
-    The function uses IQR to detect outliers.
-
-    :param input_series: Data to be examined for outliers.
-    :param outlier_const: Outlier constant.
-    :param window: How many preceding values to take into account.
-    :type input_series: pandas.Series
-    :type outlier_const: float
-    :type window: int
-    :returns: Input data with NAN outliers and Outliers.
-    :rtype: (pandas.Series, pandas.Series)
-    """
-
-    list_data = list(input_series.items())
-    head_size = min(window, len(list_data))
-    head_list = list_data[:head_size]
-    trimmed_data = pd.Series()
-    outliers = pd.Series()
-    for item_x, item_y in head_list:
-        item_pd = pd.Series([item_y, ], index=[item_x, ])
-        trimmed_data = trimmed_data.append(item_pd)
-    for index, (item_x, item_y) in list(enumerate(list_data))[head_size:]:
-        y_rolling_list = [y for (x, y) in list_data[index - head_size:index]]
-        y_rolling_array = np.array(y_rolling_list)
-        q1 = np.percentile(y_rolling_array, 25)
-        q3 = np.percentile(y_rolling_array, 75)
-        iqr = (q3 - q1) * outlier_const
-        low = q1 - iqr
-        item_pd = pd.Series([item_y, ], index=[item_x, ])
-        if low <= item_y:
-            trimmed_data = trimmed_data.append(item_pd)
-        else:
-            outliers = outliers.append(item_pd)
-            nan_pd = pd.Series([np.nan, ], index=[item_x, ])
-            trimmed_data = trimmed_data.append(nan_pd)
-
-    return trimmed_data, outliers
-
-
  def get_files(path, extension=None, full_path=True):
      """Generates the list of files to process.
  
  def get_files(path, extension=None, full_path=True):
      """Generates the list of files to process.
  
@@ -187,8 +117,8 @@ def execute_command(cmd):
  
      :param cmd: Command to execute.
      :type cmd: str
  
      :param cmd: Command to execute.
      :type cmd: str
-    :returns: Return code of the executed command.
-    :rtype: int
+    :returns: Return code of the executed command, stdout and stderr.
+    :rtype: tuple(int, str, str)
      """
  
      env = environ.copy()
      """
  
      env = environ.copy()
@@ -245,6 +175,29 @@ def get_last_completed_build_number(jenkins_url, job_name):
      return execute_command(cmd)
  
  
      return execute_command(cmd)
  
  
+def get_build_timestamp(jenkins_url, job_name, build_nr):
+    """Get the timestamp of the build of the given job.
+
+    :param jenkins_url: Jenkins URL.
+    :param job_name: Job name.
+    :param build_nr: Build number.
+    :type jenkins_url: str
+    :type job_name: str
+    :type build_nr: int
+    :returns: The timestamp.
+    :rtype: datetime.datetime
+    :raises ValueError: If the command output is not a number.
+    """
+    url = "{jenkins_url}/{job_name}/{build_nr}".format(jenkins_url=jenkins_url,
+                                                       job_name=job_name,
+                                                       build_nr=build_nr)
+    cmd = "wget -qO- {url}".format(url=url)
+    # execute_command returns (return code, stdout, stderr), not a number.
+    _, stdout, _ = execute_command(cmd)
+    # NOTE(review): assumes stdout is epoch milliseconds - confirm the URL.
+    return datetime.fromtimestamp(float(stdout) / 1000)
+
+
  def archive_input_data(spec):
      """Archive the report.
  
  def archive_input_data(spec):
      """Archive the report.
  
@@ -276,46 +229,49 @@ def archive_input_data(spec):
      logging.info("    Done.")
  
  
      logging.info("    Done.")
  
  
-def classify_anomalies(data, window):
-    """Evaluates if the sample value is an outlier, regression, normal or
-    progression compared to the previous data within the window.
-    We use the intervals defined as:
-    - regress: less than trimmed moving median - 3 * stdev
-    - normal: between trimmed moving median - 3 * stdev and median + 3 * stdev
-    - progress: more than trimmed moving median + 3 * stdev
-    where stdev is trimmed moving standard deviation.
-
-    :param data: Full data set with the outliers replaced by nan.
-    :param window: Window size used to calculate moving average and moving
-        stdev.
-    :type data: pandas.Series
-    :type window: int
-    :returns: Evaluated results.
-    :rtype: list
-    """
-
-    if data.size < 3:
-        return None
+def classify_anomalies(data):
+    """Process the data and return anomalies and trending values.
  
  
-    win_size = data.size if data.size < window else window
-    tmm = data.rolling(window=win_size, min_periods=2).median()
-    tmstd = data.rolling(window=win_size, min_periods=2).std()
+    Gather data into groups, with the group average as the trend value.
+    Within a group, values are marked as normal, except the first value,
+    which gets the group classification (e.g. regression or progression).
  
  
-    classification = ["normal", ]
-    first = True
-    for build, value in data.iteritems():
-        if first:
-            first = False
-            continue
-        if np.isnan(value) or np.isnan(tmm[build]) or np.isnan(tmstd[build]):
+    :param data: Full data set with unavailable samples replaced by nan.
+    :type data: OrderedDict
+    :returns: Classification and trend values.
+    :rtype: 2-tuple, list of strings and list of floats
+    """
+    # NaN means something went wrong.
+    # Use 0.0 so that it is reported as a severe regression.
+    bare_data = [0.0 if np.isnan(sample.avg) else sample
+                 for _, sample in data.iteritems()]
+    # TODO: Put analogous iterator into jumpavg library.
+    groups = BitCountingClassifier().classify(bare_data)
+    groups.reverse()  # Just to use .pop() for FIFO.
+    classification = []
+    avgs = []
+    active_group = None
+    values_left = 0
+    avg = 0.0
+    for _, sample in data.iteritems():
+        if np.isnan(sample.avg):
              classification.append("outlier")
              classification.append("outlier")
-        elif value < (tmm[build] - 3 * tmstd[build]):
-            classification.append("regression")
-        elif value > (tmm[build] + 3 * tmstd[build]):
-            classification.append("progression")
-        else:
-            classification.append("normal")
-    return classification
+            avgs.append(sample.avg)
+            continue
+        if values_left < 1 or active_group is None:
+            values_left = 0
+            while values_left < 1:  # Ignore empty groups (should not happen).
+                active_group = groups.pop()
+                values_left = len(active_group.values)
+            avg = active_group.metadata.avg
+            classification.append(active_group.metadata.classification)
+            avgs.append(avg)
+            values_left -= 1
+            continue
+        classification.append("normal")
+        avgs.append(avg)
+        values_left -= 1
+    return classification, avgs
  
  
  def convert_csv_to_pretty_txt(csv_file, txt_file):
  
  
  def convert_csv_to_pretty_txt(csv_file, txt_file):