resources/libraries/python/PLRsearch/stat_trackers.py

   1 # Copyright (c) 2019 Cisco and/or its affiliates.
   2 # Licensed under the Apache License, Version 2.0 (the "License");
   3 # you may not use this file except in compliance with the License.
   4 # You may obtain a copy of the License at:
   5 #
   6 #     http://www.apache.org/licenses/LICENSE-2.0
   7 #
   8 # Unless required by applicable law or agreed to in writing, software
   9 # distributed under the License is distributed on an "AS IS" BASIS,
  10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11 # See the License for the specific language governing permissions and
  12 # limitations under the License.
  13
  14 """Module for tracking average and variance for data of weighted samples.
  15
  16 See log_plus for an explanation why None acts as a special case "float" number.
  17
  18 TODO: Current implementation sets zero averages on empty tracker.
  19 Should we set None instead?
  20
  21 TODO: Implement __str__ and __repr__ for easier debugging.
  22 """
  23
  24 import copy
  25 import math
  26
  27 import numpy
  28
  29 # TODO: Teach FD.io CSIT to use multiple dirs in PYTHONPATH,
  30 # then switch to absolute imports within PLRsearch package.
  31 # Current usage of relative imports is just a short term workaround.
  32 from log_plus import log_plus  # pylint: disable=relative-import
  33
  34
  35 class ScalarStatTracker(object):
  36     """Class for tracking one-dimensional samples.
  37
  38     Variance of one-dimensional data cannot be negative,
  39     so this class avoids possible underflows by tracking logarithm
  40     of the variance instead.
  41
  42     Similarly, sum of weights is also tracked as a logarithm.
  43     """
  44
  45     def __init__(self, log_sum_weight=None, average=0.0, log_variance=None):
  46         """Initialize new tracker instance, empty by default.
  47
  48         :param log_sum_weight: Natural logarithm of sum of weights
  49             of samples so far. Default: None (as log of 0.0).
  50         :param average: Weighted average of the samples. Default: 0.0
  51         :param log_variance: Natural logarithm of variance.
  52             Default: None (as log of 0.0).
  53         :type log_sum_weight: float or None
  54         :type average: float
  55         :type log_variance: float or None
  56         """
  57         self.log_sum_weight = log_sum_weight
  58         self.average = average
  59         self.log_variance = log_variance
  60
  61     def __repr__(self):
  62         """Return string, which interpreted constructs state of self."""
  63         return ("ScalarStatTracker(log_sum_weight={lsw!r},average={a!r},"
  64                 "log_variance={lv!r})".format(
  65                     lsw=self.log_sum_weight, a=self.average,
  66                     lv=self.log_variance))
  67
  68     def copy(self):
  69         """Return new ScalarStatTracker instance with the same state as self.
  70
  71         The point of this method is the return type, instances of subclasses
  72         can get rid of their additional data this way.
  73
  74         :returns: New instance with the same core state.
  75         :rtype: ScalarStatTracker
  76         """
  77         return ScalarStatTracker(
  78             self.log_sum_weight, self.average, self.log_variance)
  79
  80     def add(self, scalar_value, log_weight=0.0):
  81         """Return updated stats corresponding to addition of another sample.
  82
  83         :param scalar_value: The scalar value of the sample.
  84         :param log_weight: Natural logarithm of weight of the sample.
  85             Default: 0.0 (as log of 1.0).
  86         :type scalar_value: float
  87         :type log_weight: float
  88         :returns: Updated self.
  89         :rtype: ScalarStatTracker
  90         """
  91         old_log_sum_weight = self.log_sum_weight
  92         if old_log_sum_weight is None:
  93             # First sample.
  94             self.log_sum_weight = log_weight
  95             self.average = scalar_value
  96             return self
  97         old_average = self.average
  98         log_variance = self.log_variance
  99         new_log_sum_weight = log_plus(old_log_sum_weight, log_weight)
 100         log_sample_ratio = log_weight - new_log_sum_weight
 101         sample_ratio = math.exp(log_sample_ratio)
 102         shift = scalar_value - old_average
 103         new_average = old_average + shift * sample_ratio
 104         # The log_variance is updated in-place (instead of new_ vs old_)
 105         # because of if-s detecting None-s where the value does not change.
 106         absolute_shift = abs(shift)
 107         if absolute_shift > 0.0:
 108             log_square_shift = 2 * math.log(absolute_shift)
 109             log_variance = log_plus(
 110                 log_variance, log_square_shift + log_sample_ratio)
 111         if log_variance is not None:
 112             log_variance += old_log_sum_weight - new_log_sum_weight
 113         self.log_sum_weight = new_log_sum_weight
 114         self.average = new_average
 115         self.log_variance = log_variance
 116         return self
 117
 118
 119 class ScalarDualStatTracker(ScalarStatTracker):
 120     """Class for tracking one-dimensional samples, offering dual stats.
 121
 122     It means that instead of just primary stats (identical to what
 123     ScalarStatTracker offers, that is why this is its subclass),
 124     this tracker allso offers secondary stats.
 125     Secondary stats are scalar stats that track the same data,
 126     except that the most weighty (or later if tied) sample is not considered.
 127
 128     Users can analyze how do secondary stats differ from the primary ones,
 129     and based on that decide whether they processed "enough" data.
 130     One typical use is for Monte Carlo integrator to decide whether
 131     the partial sums so far are reliable enough.
 132     """
 133
 134     def __init__(
 135             self, log_sum_weight=None, average=0.0, log_variance=None,
 136             log_sum_secondary_weight=None, secondary_average=0.0,
 137             log_secondary_variance=None, max_log_weight=None):
 138         """Initialize new tracker instance, empty by default.
 139
 140         :param log_sum_weight: Natural logarithm of sum of weights
 141             of samples so far. Default: None (as log of 0.0).
 142         :param average: Weighted average of the samples. Default: 0.0
 143         :param log_variance: Natural logarithm of variance.
 144             Default: None (as log of 0.0).
 145         :param log_sum_secondary_weight: Natural logarithm of sum of weights
 146             of samples so far except weightest one.
 147             Default: None (as log of 0.0).
 148         :param secondary_average: Weighted average of the samples
 149             except the weightest one. Default: 0.0
 150         :param log_variance: Natural logarithm of variance od samples
 151             except the weightest one. Default: None (as log of 0.0).
 152         :param max_log_weight: Natural logarithm of weight of sample
 153             counted in primary but not secondary stats.
 154             Default: None (as log of 0.0).
 155         :type log_sum_weight: float or None
 156         :type average: float
 157         :type log_variance: float or None
 158         :type log_sum_secondary_weight: float or None
 159         :type secondary_average: float
 160         :type log_secondary_variance: float or None
 161         :type max_log_weight: float or None
 162         """
 163         # Not using super() as the constructor signature is different,
 164         # so in case of diamond inheritance mismatch would be probable.
 165         ScalarStatTracker.__init__(self, log_sum_weight, average, log_variance)
 166         self.secondary = ScalarStatTracker(
 167             log_sum_secondary_weight, secondary_average, log_secondary_variance)
 168         self.max_log_weight = max_log_weight
 169
 170     def __repr__(self):
 171         """Return string, which interpreted constructs state of self."""
 172         sec = self.secondary
 173         return (
 174             "ScalarDualStatTracker(log_sum_weight={lsw!r},average={a!r},"
 175             "log_variance={lv!r},log_sum_secondary_weight={lssw!r},"
 176             "secondary_average={sa!r},log_secondary_variance={lsv!r},"
 177             "max_log_weight={mlw!r})".format(
 178                 lsw=self.log_sum_weight, a=self.average, lv=self.log_variance,
 179                 lssw=sec.log_sum_weight, sa=sec.average, lsv=sec.log_variance,
 180                 mlw=self.max_log_weight))
 181
 182     def add(self, scalar_value, log_weight=0.0):
 183         """Return updated both stats after addition of another sample.
 184
 185         :param scalar_value: The scalar value of the sample.
 186         :param log_weight: Natural logarithm of weight of the sample.
 187             Default: 0.0 (as log of 1.0).
 188         :type scalar_value: float
 189         :type log_weight: float
 190         :returns: Updated self.
 191         :rtype: ScalarDualStatTracker
 192         """
 193         # Using super() as copy() and add() are not expected to change
 194         # signature, so this way diamond inheritance will be supported.
 195         primary = super(ScalarDualStatTracker, self)
 196         if self.max_log_weight is None or log_weight >= self.max_log_weight:
 197             self.max_log_weight = log_weight
 198             self.secondary = primary.copy()
 199         else:
 200             self.secondary.add(scalar_value, log_weight)
 201         primary.add(scalar_value, log_weight)
 202         return self
 203
 204
 205 class VectorStatTracker(object):
 206     """Class for tracking multi-dimensional samples.
 207
 208     Contrary to one-dimensional data, multi-dimensional covariance matrix
 209     contains off-diagonal elements which can be negative.
 210
 211     But, sum of weights is never negative, so it is tracked as a logarithm.
 212
 213     The code assumes every method gets arguments with correct dimensionality
 214     as set in constructor. No checks are performed (for performance reasons).
 215
 216     TODO: Should we provide a subclass with the dimensionality checks?
 217     """
 218
 219     def __init__(
 220             self, dimension=2, log_sum_weight=None, averages=None,
 221             covariance_matrix=None):
 222         """Initialize new tracker instance, two-dimenstional empty by default.
 223
 224         If any of latter two arguments is None, it means
 225         the tracker state is invalid. Use reset method
 226         to create empty tracker of constructed dimentionality.
 227
 228         :param dimension: Number of scalar components of samples.
 229         :param log_sum_weight: Natural logarithm of sum of weights
 230             of samples so far. Default: None (as log of 0.0).
 231         :param averages: Weighted average of the samples.
 232             Default: None
 233         :param covariance_matrix: Variance matrix elements.
 234             Default: None
 235         :type log_sum_weight: float or None
 236         :type averages: None or tuple of float
 237         :type covariance_matrix: None or tuple of tuple of float
 238         """
 239         self.dimension = dimension
 240         self.log_sum_weight = log_sum_weight
 241         self.averages = averages
 242         self.covariance_matrix = covariance_matrix
 243
 244     def __repr__(self):
 245         """Return string, which interpreted constructs state of self.
 246
 247         :returns: Expression contructing an equivalent instance.
 248         :rtype: str"""
 249         return (
 250             "VectorStatTracker(dimension={d!r},log_sum_weight={lsw!r},"
 251             "averages={a!r},covariance_matrix={cm!r})".format(
 252                 d=self.dimension, lsw=self.log_sum_weight, a=self.averages,
 253                 cm=self.covariance_matrix))
 254
 255     def copy(self):
 256         """Return new instance with the same state as self.
 257
 258         The main usage is to simplify debugging. This method allows
 259         to store the old state, while self continues to track towards new state.
 260
 261         :returns: Created tracker instance.
 262         :rtype: VectorStatTracker
 263         """
 264         return VectorStatTracker(
 265             self.dimension, self.log_sum_weight, self.averages,
 266             self.covariance_matrix)
 267
 268     def reset(self):
 269         """Return state set to empty data of proper dimensionality.
 270
 271         :returns: Updated self.
 272         :rtype: VectorStatTracker
 273         """
 274         self.averages = [0.0 for _ in range(self.dimension)]
 275         # TODO: Examine whether we can gain speed by tracking triangle only.
 276         self.covariance_matrix = [[0.0 for _ in range(self.dimension)]
 277                                   for _ in range(self.dimension)]
 278         # TODO: In Python3, list comprehensions are generators,
 279         # so they are not indexable. Put list() when converting.
 280         return self
 281
 282     def unit_reset(self):
 283         """Reset state but use unit matric as covariance.
 284
 285         :returns: Updated self.
 286         :rtype: VectorStatTracker
 287         """
 288         self.reset()
 289         for index in range(self.dimension):
 290             self.covariance_matrix[index][index] = 1.0
 291
 292     def add_get_shift(self, vector_value, log_weight=0.0):
 293         """Return shift and update state to addition of another sample.
 294
 295         Shift is the vector from old average to new sample.
 296         For most callers, returning shift is more useful than returning self.
 297
 298         :param vector_value: The value of the sample.
 299         :param log_weight: Natural logarithm of weight of the sample.
 300             Default: 0.0 (as log of 1.0).
 301         :type vector_value: iterable of float
 302         :type log_weight: float
 303         :returns: Updated self.
 304         :rtype: VectorStatTracker
 305         """
 306         dimension = self.dimension
 307         old_log_sum_weight = self.log_sum_weight
 308         old_averages = self.averages
 309         if not old_averages:
 310             shift = [0.0 for index in range(dimension)]
 311         else:
 312             shift = [vector_value[index] - old_averages[index]
 313                      for index in range(dimension)]
 314         if old_log_sum_weight is None:
 315             # First sample.
 316             self.log_sum_weight = log_weight
 317             self.averages = [vector_value[index] for index in range(dimension)]
 318             # Not touching covariance matrix.
 319             return shift
 320         covariance_matrix = self.covariance_matrix
 321         new_log_sum_weight = log_plus(old_log_sum_weight, log_weight)
 322         data_ratio = math.exp(old_log_sum_weight - new_log_sum_weight)
 323         sample_ratio = math.exp(log_weight - new_log_sum_weight)
 324         new_averages = [old_averages[index] + shift[index] * sample_ratio
 325                         for index in range(dimension)]
 326         # It is easier to update covariance matrix in-place.
 327         for second in range(dimension):
 328             for first in range(dimension):
 329                 element = covariance_matrix[first][second]
 330                 element += shift[first] * shift[second] * sample_ratio
 331                 element *= data_ratio
 332                 covariance_matrix[first][second] = element
 333         self.log_sum_weight = new_log_sum_weight
 334         self.averages = new_averages
 335         # self.covariance_matrix still points to the object we updated in-place.
 336         return shift
 337
 338     # TODO: There are some uses for such a vector tracker,
 339     # that does not track average, but weightest (latest if tied) value,
 340     # and computes covariance matrix centered around that.
 341     # But perhaps the following method would work similarly well?
 342     def add_without_dominance_get_distance(self, vector_value, log_weight=0.0):
 343         """Update stats, avoid having the sample dominate, return old distance.
 344
 345         If the weight of the incoming sample is far bigger
 346         than the weight of all the previous data together,
 347         convariance matrix would suffer from underflows.
 348         To avoid that, this method manipulates both weights
 349         before calling add().
 350
 351         The old covariance matrix (before update) can define a metric.
 352         Shift is a vector, so the metric can be used to compute a distance
 353         between old average and new sample.
 354
 355         :param vector_value: The value of the sample.
 356         :param log_weight: Natural logarithm of weight of the sample.
 357             Default: 0.0 (as log of 1.0).
 358         :type vector_value: iterable of float
 359         :type log_weight: float
 360         :returns: Updated self.
 361         :rtype: VectorStatTracker
 362         """
 363         lsw = self.log_sum_weight
 364         if lsw is not None and lsw < log_weight - 1.0:
 365             lsw = (lsw + log_weight) / 2.0
 366             log_weight = lsw
 367             self.log_sum_weight = lsw
 368         old_metric = copy.deepcopy(self.covariance_matrix)
 369         shift = self.add_get_shift(vector_value, log_weight)
 370         gradient = numpy.linalg.solve(old_metric, shift)
 371         distance = numpy.vdot(shift, gradient)
 372         return distance