import multiprocessing
import subprocess
+import math
import numpy as np
-import pandas as pd
import logging
import csv
import prettytable
from os import walk, makedirs, environ
from os.path import join, isdir
from shutil import move, Error
-from math import sqrt
+from datetime import datetime
+from pandas import Series
from errors import PresentationError
+from jumpavg.BitCountingClassifier import BitCountingClassifier
def mean(items):
:returns: Stdev.
:rtype: float
"""
-
- avg = mean(items)
- variance = [(x - avg) ** 2 for x in items]
- stddev = sqrt(mean(variance))
- return stddev
+ return Series.std(Series(items))
def relative_change(nr1, nr2):
    """Compute the relative change of two values, as a percentage.

    The first value is the base the change is measured against.

    :param nr1: The base value.
    :param nr2: The new value.
    :type nr1: float
    :type nr2: float
    :returns: Relative change of nr2 against nr1, in percent.
    :rtype: float
    :raises ZeroDivisionError: If nr1 is zero (built-in numeric types).
    """
    # Convert the difference to float before dividing, so that integer
    # inputs are not floor-divided (and silently truncated) on Python 2.
    return float((float(nr2 - nr1) / nr1) * 100)
def relative_change_stdev(mean1, mean2, std1, std2):
    """Compute relative change of two values and its standard deviation.

    The "1" values are the base for comparison.
    Results are returned as percentage (and percentage points for stdev).
    Linearized theory is used, so results are wrong for relatively large stdev.

    :param mean1: Mean of the first number.
    :param mean2: Mean of the second number.
    :param std1: Standard deviation estimate of the first number.
    :param std2: Standard deviation estimate of the second number.
    :type mean1: float
    :type mean2: float
    :type std1: float
    :type std2: float
    :returns: Relative change and its stdev.
    :rtype: tuple(float, float)
    :raises ZeroDivisionError: If mean1 (the base) is zero.
    """
    mean1, mean2 = float(mean1), float(mean2)
    quotient = mean2 / mean1
    # Linearized error propagation for quotient = mean2 / mean1:
    #   stdev^2 = (std2 / mean1)^2 + (quotient * std1 / mean1)^2
    # This is algebraically the same as
    #   |quotient| * sqrt((std1 / mean1)^2 + (std2 / mean2)^2)
    # but it never divides by mean2, so a zero second mean is handled,
    # and the resulting deviation is never negative.
    std = math.hypot(std2 / mean1, quotient * std1 / mean1)
    return (quotient - 1) * 100, std * 100
def get_files(path, extension=None, full_path=True):
:param cmd: Command to execute.
:type cmd: str
- :returns: Return code of the executed command.
- :rtype: int
+ :returns: Return code of the executed command, stdout and stderr.
+ :rtype: tuple(int, str, str)
"""
env = environ.copy()
return execute_command(cmd)
def get_build_timestamp(jenkins_url, job_name, build_nr):
    """Get the timestamp of the build of the given job.

    :param jenkins_url: Jenkins URL.
    :param job_name: Job name.
    :param build_nr: Build number.
    :type jenkins_url: str
    :type job_name: str
    :type build_nr: int
    :returns: The timestamp.
    :rtype: datetime.datetime
    :raises RuntimeError: If the download command returns a non-zero code.
    :raises ValueError: If the downloaded data is not an integer.
    """

    url = "{jenkins_url}/{job_name}/{build_nr}".format(jenkins_url=jenkins_url,
                                                       job_name=job_name,
                                                       build_nr=build_nr)
    cmd = "wget -qO- {url}".format(url=url)

    # The command helper is documented to return the tuple
    # (return code, stdout, stderr); the timestamp is carried in stdout,
    # so the tuple must be unpacked before any arithmetic.
    ret_code, stdout, stderr = execute_command(cmd)
    if ret_code != 0:
        raise RuntimeError(
            "Not possible to get the build timestamp from {url}: {err}".
            format(url=url, err=stderr))

    # NOTE(review): assumes the URL serves a bare timestamp in milliseconds
    # since the epoch - confirm against the Jenkins endpoint actually used.
    timestamp = int(stdout.strip())

    # Float division keeps the sub-second part on Python 2 as well.
    return datetime.fromtimestamp(timestamp / 1000.0)
+
+
def archive_input_data(spec):
"""Archive the report.
logging.info(" Archiving the input data files ...")
- extension = spec.input["file-format"]
- data_files = get_files(spec.environment["paths"]["DIR[WORKING,DATA]"],
- extension=extension)
+ extension = spec.input["arch-file-format"]
+ data_files = list()
+ for ext in extension:
+ data_files.extend(get_files(
+ spec.environment["paths"]["DIR[WORKING,DATA]"], extension=ext))
dst = spec.environment["paths"]["DIR[STATIC,ARCH]"]
logging.info(" Destination: {0}".format(dst))
logging.info(" Done.")
def classify_anomalies(data):
    """Process the data and return anomalies and trending values.

    Gather data into groups with average as trend value.
    Decorate values within groups to be normal,
    the first value of changed average as a regression, or a progression.
    Unavailable (nan) samples are always reported as "outlier" and their
    trend value is nan.

    :param data: Full data set with unavailable samples replaced by nan.
    :type data: OrderedDict
    :returns: Classification and trend values
    :rtype: 2-tuple, list of strings and list of floats
    """
    samples = [sample for _, sample in data.iteritems()]
    # NaN means something went wrong.
    # Use 0.0 to cause that being reported as a severe regression.
    bare_data = [0.0 if np.isnan(sample.avg) else sample
                 for sample in samples]
    # TODO: Put analogous iterator into jumpavg library.
    groups = BitCountingClassifier().classify(bare_data)
    groups.reverse()  # Just to use .pop() for FIFO.
    classification = []
    avgs = []
    active_group = None
    values_left = 0
    avg = 0.0
    # True until the label of the active group has been emitted.
    label_pending = False
    for sample in samples:
        if values_left < 1 or active_group is None:
            values_left = 0
            while values_left < 1:  # Ignore empty groups (should not happen).
                active_group = groups.pop()
                values_left = len(active_group.values)
            avg = active_group.metadata.avg
            label_pending = True
        # Every sample occupies one slot of its group, including the nan
        # ones (they were passed to the classifier as 0.0). Consuming the
        # slot here keeps samples and groups aligned; skipping it would
        # shift labels and averages of all the following samples.
        values_left -= 1
        if np.isnan(sample.avg):
            classification.append("outlier")
            avgs.append(sample.avg)
            continue
        if label_pending:
            # The first available sample of a group carries its label.
            classification.append(active_group.metadata.classification)
            label_pending = False
        else:
            classification.append("normal")
        avgs.append(avg)
    return classification, avgs
def convert_csv_to_pretty_txt(csv_file, txt_file):