def remove_outliers(input_list, outlier_const=1.5, window=14):
    """Return the input values with IQR-based outliers removed.

    The quartiles are computed once over the whole input. A value is kept
    when it lies within ``outlier_const`` times the interquartile range below
    the lower quartile or above the upper quartile (bounds inclusive).

    :param input_list: Data from which the outliers will be removed.
    :param outlier_const: Outlier constant (multiplier of the IQR).
    :param window: Not used by this function; kept in the signature so
        existing callers that pass it keep working.
    :type input_list: list of floats
    :type outlier_const: float
    :type window: int
    :returns: The input values, in their original order, without outliers.
    :rtype: list of floats
    """

    data = np.array(input_list)
    if data.size == 0:
        # Percentiles of an empty array are undefined; nothing to filter.
        return []

    lower_quartile, upper_quartile = np.percentile(data, [25, 75])
    margin = (upper_quartile - lower_quartile) * outlier_const
    low, high = lower_quartile - margin, upper_quartile + margin

    return [value for value in data.tolist() if low <= value <= high]
-
-
def split_outliers(input_series, outlier_const=1.5, window=14):
    """Split the input data into trimmed data and outliers using rolling IQR.

    The first ``min(window, len(input_series))`` items are always kept.
    Every later item is compared against the quartiles of the raw values
    that immediately precede it (that many items, earlier outliers
    included). An item outside
    ``[q1 - outlier_const * iqr, q3 + outlier_const * iqr]`` is reported as
    an outlier and replaced by NAN in the trimmed data.

    :param input_series: Data to be examined for outliers.
    :param outlier_const: Outlier constant (multiplier of the IQR).
    :param window: How many preceding values to take into account.
    :type input_series: pandas.Series
    :type outlier_const: float
    :type window: int
    :returns: Input data with NAN in place of outliers (same labels and
        order as the input), and a series holding only the outliers.
    :rtype: (pandas.Series, pandas.Series)
    """

    items = list(input_series.items())
    labels = [label for label, _ in items]
    values = [value for _, value in items]
    head_size = min(window, len(values))

    # Results are gathered in plain lists and turned into Series once at the
    # end: Series.append is gone from pandas 2.0 and copied the whole series
    # on every call.
    trimmed_values = values[:head_size]
    outlier_labels = []
    outlier_values = []

    for position in range(head_size, len(values)):
        preceding = values[position - head_size:position]
        q1, q3 = np.percentile(preceding, [25, 75])
        margin = (q3 - q1) * outlier_const
        value = values[position]
        if q1 - margin <= value <= q3 + margin:
            trimmed_values.append(value)
        else:
            outlier_labels.append(labels[position])
            outlier_values.append(value)
            trimmed_values.append(np.nan)

    # An explicit dtype for the empty case avoids pandas falling back to
    # (and, on some versions, warning about) the object dtype.
    trimmed_data = pd.Series(
        trimmed_values,
        index=labels,
        dtype=None if trimmed_values else "float64",
    )
    outliers = pd.Series(
        outlier_values,
        index=outlier_labels,
        dtype=None if outlier_values else "float64",
    )

    return trimmed_data, outliers
-
-