1 # Copyright (c) 2019 Cisco and/or its affiliates.
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at:
6 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
"""Module for tracking average and variance for data of weighted samples.

See log_plus for an explanation why None acts as a special case "float" number.

TODO: Current implementation sets zero averages on empty tracker.
Should we set None instead?

TODO: Implement __str__ and __repr__ for easier debugging.
"""
import copy
import math

import numpy

# TODO: Teach FD.io CSIT to use multiple dirs in PYTHONPATH,
# then switch to absolute imports within PLRsearch package.
# Current usage of relative imports is just a short term workaround.
from log_plus import log_plus  # pylint: disable=relative-import
class ScalarStatTracker(object):
    """Class for tracking one-dimensional samples.

    Variance of one-dimensional data cannot be negative,
    so this class avoids possible underflows by tracking logarithm
    of the variance instead.

    Similarly, sum of weights is also tracked as a logarithm.
    """

    def __init__(self, log_sum_weight=None, average=0.0, log_variance=None):
        """Initialize new tracker instance, empty by default.

        :param log_sum_weight: Natural logarithm of sum of weights
            of samples so far. Default: None (as log of 0.0).
        :param average: Weighted average of the samples. Default: 0.0
        :param log_variance: Natural logarithm of variance.
            Default: None (as log of 0.0).
        :type log_sum_weight: float or None
        :type average: float
        :type log_variance: float or None
        """
        self.log_sum_weight = log_sum_weight
        self.average = average
        self.log_variance = log_variance

    def __repr__(self):
        """Return string, which interpreted constructs state of self."""
        return ("ScalarStatTracker(log_sum_weight={lsw!r},average={a!r},"
                "log_variance={lv!r})".format(
                    lsw=self.log_sum_weight, a=self.average,
                    lv=self.log_variance))

    def copy(self):
        """Return new ScalarStatTracker instance with the same state as self.

        The point of this method is the return type, instances of subclasses
        can get rid of their additional data this way.

        :returns: New instance with the same core state.
        :rtype: ScalarStatTracker
        """
        return ScalarStatTracker(
            self.log_sum_weight, self.average, self.log_variance)

    def add(self, scalar_value, log_weight=0.0):
        """Return updated stats corresponding to addition of another sample.

        :param scalar_value: The scalar value of the sample.
        :param log_weight: Natural logarithm of weight of the sample.
            Default: 0.0 (as log of 1.0).
        :type scalar_value: float
        :type log_weight: float
        :returns: Updated self.
        :rtype: ScalarStatTracker
        """
        old_log_sum_weight = self.log_sum_weight
        if old_log_sum_weight is None:
            # First sample; the tracker state becomes the sample itself.
            self.log_sum_weight = log_weight
            self.average = scalar_value
            return self
        old_average = self.average
        log_variance = self.log_variance
        new_log_sum_weight = log_plus(old_log_sum_weight, log_weight)
        log_sample_ratio = log_weight - new_log_sum_weight
        sample_ratio = math.exp(log_sample_ratio)
        shift = scalar_value - old_average
        new_average = old_average + shift * sample_ratio
        # The log_variance is updated in-place (instead of new_ vs old_)
        # because of if-s detecting None-s where the value does not change.
        absolute_shift = abs(shift)
        if absolute_shift > 0.0:
            # math.log would raise on zero shift, hence the guard above.
            log_square_shift = 2 * math.log(absolute_shift)
            log_variance = log_plus(
                log_variance, log_square_shift + log_sample_ratio)
        if log_variance is not None:
            log_variance += old_log_sum_weight - new_log_sum_weight
        self.log_sum_weight = new_log_sum_weight
        self.average = new_average
        self.log_variance = log_variance
        return self
class ScalarDualStatTracker(ScalarStatTracker):
    """Class for tracking one-dimensional samples, offering dual stats.

    It means that instead of just primary stats (identical to what
    ScalarStatTracker offers, that is why this is its subclass),
    this tracker also offers secondary stats.
    Secondary stats are scalar stats that track the same data,
    except that the most weighty (or later if tied) sample is not considered.

    Users can analyze how do secondary stats differ from the primary ones,
    and based on that decide whether they processed "enough" data.
    One typical use is for Monte Carlo integrator to decide whether
    the partial sums so far are reliable enough.
    """

    def __init__(
            self, log_sum_weight=None, average=0.0, log_variance=None,
            log_sum_secondary_weight=None, secondary_average=0.0,
            log_secondary_variance=None, max_log_weight=None):
        """Initialize new tracker instance, empty by default.

        :param log_sum_weight: Natural logarithm of sum of weights
            of samples so far. Default: None (as log of 0.0).
        :param average: Weighted average of the samples. Default: 0.0
        :param log_variance: Natural logarithm of variance.
            Default: None (as log of 0.0).
        :param log_sum_secondary_weight: Natural logarithm of sum of weights
            of samples so far except weightest one.
            Default: None (as log of 0.0).
        :param secondary_average: Weighted average of the samples
            except the weightest one. Default: 0.0
        :param log_secondary_variance: Natural logarithm of variance of samples
            except the weightest one. Default: None (as log of 0.0).
        :param max_log_weight: Natural logarithm of weight of sample
            counted in primary but not secondary stats.
            Default: None (as log of 0.0).
        :type log_sum_weight: float or None
        :type average: float
        :type log_variance: float or None
        :type log_sum_secondary_weight: float or None
        :type secondary_average: float
        :type log_secondary_variance: float or None
        :type max_log_weight: float or None
        """
        # Not using super() as the constructor signature is different,
        # so in case of diamond inheritance mismatch would be probable.
        ScalarStatTracker.__init__(self, log_sum_weight, average, log_variance)
        self.secondary = ScalarStatTracker(
            log_sum_secondary_weight, secondary_average, log_secondary_variance)
        self.max_log_weight = max_log_weight

    def __repr__(self):
        """Return string, which interpreted constructs state of self."""
        sec = self.secondary
        return (
            "ScalarDualStatTracker(log_sum_weight={lsw!r},average={a!r},"
            "log_variance={lv!r},log_sum_secondary_weight={lssw!r},"
            "secondary_average={sa!r},log_secondary_variance={lsv!r},"
            "max_log_weight={mlw!r})".format(
                lsw=self.log_sum_weight, a=self.average, lv=self.log_variance,
                lssw=sec.log_sum_weight, sa=sec.average, lsv=sec.log_variance,
                mlw=self.max_log_weight))

    def add(self, scalar_value, log_weight=0.0):
        """Return updated both stats after addition of another sample.

        :param scalar_value: The scalar value of the sample.
        :param log_weight: Natural logarithm of weight of the sample.
            Default: 0.0 (as log of 1.0).
        :type scalar_value: float
        :type log_weight: float
        :returns: Updated self.
        :rtype: ScalarDualStatTracker
        """
        # Using super() as copy() and add() are not expected to change
        # signature, so this way diamond inheritance will be supported.
        primary = super(ScalarDualStatTracker, self)
        if self.max_log_weight is None or log_weight >= self.max_log_weight:
            # New weightest (or tied, thus later) sample: secondary stats
            # restart from the primary state, which excludes the new sample.
            self.max_log_weight = log_weight
            self.secondary = primary.copy()
        else:
            self.secondary.add(scalar_value, log_weight)
        primary.add(scalar_value, log_weight)
        return self
class VectorStatTracker(object):
    """Class for tracking multi-dimensional samples.

    Contrary to one-dimensional data, multi-dimensional covariance matrix
    contains off-diagonal elements which can be negative.

    But, sum of weights is never negative, so it is tracked as a logarithm.

    The code assumes every method gets arguments with correct dimensionality
    as set in constructor. No checks are performed (for performance reasons).

    TODO: Should we provide a subclass with the dimensionality checks?
    """

    def __init__(
            self, dimension=2, log_sum_weight=None, averages=None,
            covariance_matrix=None):
        """Initialize new tracker instance, two-dimensional empty by default.

        If any of latter two arguments is None, it means
        the tracker state is invalid. Use reset method
        to create empty tracker of constructed dimensionality.

        :param dimension: Number of scalar components of samples.
        :param log_sum_weight: Natural logarithm of sum of weights
            of samples so far. Default: None (as log of 0.0).
        :param averages: Weighted average of the samples.
            Default: None
        :param covariance_matrix: Variance matrix elements.
            Default: None
        :type dimension: int
        :type log_sum_weight: float or None
        :type averages: None or tuple of float
        :type covariance_matrix: None or tuple of tuple of float
        """
        self.dimension = dimension
        self.log_sum_weight = log_sum_weight
        self.averages = averages
        self.covariance_matrix = covariance_matrix

    def __repr__(self):
        """Return string, which interpreted constructs state of self.

        :returns: Expression constructing an equivalent instance.
        :rtype: str
        """
        return (
            "VectorStatTracker(dimension={d!r},log_sum_weight={lsw!r},"
            "averages={a!r},covariance_matrix={cm!r})".format(
                d=self.dimension, lsw=self.log_sum_weight, a=self.averages,
                cm=self.covariance_matrix))

    def copy(self):
        """Return new instance with the same state as self.

        The main usage is to simplify debugging. This method allows
        to store the old state, while self continues to track towards new state.

        :returns: Created tracker instance.
        :rtype: VectorStatTracker
        """
        return VectorStatTracker(
            self.dimension, self.log_sum_weight, self.averages,
            self.covariance_matrix)

    def reset(self):
        """Return state set to empty data of proper dimensionality.

        :returns: Updated self.
        :rtype: VectorStatTracker
        """
        self.averages = [0.0 for _ in range(self.dimension)]
        # TODO: Examine whether we can gain speed by tracking triangle only.
        self.covariance_matrix = [[0.0 for _ in range(self.dimension)]
                                  for _ in range(self.dimension)]
        # TODO: In Python3, list comprehensions are generators,
        # so they are not indexable. Put list() when converting.
        return self

    def unit_reset(self):
        """Reset state but use unit matrix as covariance.

        :returns: Updated self.
        :rtype: VectorStatTracker
        """
        self.reset()
        for index in range(self.dimension):
            self.covariance_matrix[index][index] = 1.0
        return self

    def add_get_shift(self, vector_value, log_weight=0.0):
        """Return shift and update state to addition of another sample.

        Shift is the vector from old average to new sample.
        For most callers, returning shift is more useful than returning self.

        :param vector_value: The value of the sample.
        :param log_weight: Natural logarithm of weight of the sample.
            Default: 0.0 (as log of 1.0).
        :type vector_value: iterable of float
        :type log_weight: float
        :returns: Shift from old average to the new sample.
        :rtype: list of float
        """
        dimension = self.dimension
        old_log_sum_weight = self.log_sum_weight
        old_averages = self.averages
        if old_averages is None:
            # Invalid state (no averages yet); act as if average was zero.
            shift = [0.0 for index in range(dimension)]
        else:
            shift = [vector_value[index] - old_averages[index]
                     for index in range(dimension)]
        if old_log_sum_weight is None:
            # First sample; the averages become the sample itself.
            self.log_sum_weight = log_weight
            self.averages = [vector_value[index] for index in range(dimension)]
            # Not touching covariance matrix.
            return shift
        covariance_matrix = self.covariance_matrix
        new_log_sum_weight = log_plus(old_log_sum_weight, log_weight)
        data_ratio = math.exp(old_log_sum_weight - new_log_sum_weight)
        sample_ratio = math.exp(log_weight - new_log_sum_weight)
        new_averages = [old_averages[index] + shift[index] * sample_ratio
                        for index in range(dimension)]
        # It is easier to update covariance matrix in-place.
        for second in range(dimension):
            for first in range(dimension):
                element = covariance_matrix[first][second]
                element += shift[first] * shift[second] * sample_ratio
                element *= data_ratio
                covariance_matrix[first][second] = element
        self.log_sum_weight = new_log_sum_weight
        self.averages = new_averages
        # self.covariance_matrix still points to the object we updated in-place.
        return shift

    # TODO: There are some uses for such a vector tracker,
    # that does not track average, but weightest (latest if tied) value,
    # and computes covariance matrix centered around that.
    # But perhaps the following method would work similarly well?
    def add_without_dominance_get_distance(self, vector_value, log_weight=0.0):
        """Update stats, avoid having the sample dominate, return old distance.

        If the weight of the incoming sample is far bigger
        than the weight of all the previous data together,
        covariance matrix would suffer from underflows.
        To avoid that, this method manipulates both weights
        before calling add().

        The old covariance matrix (before update) can define a metric.
        Shift is a vector, so the metric can be used to compute a distance
        between old average and new sample.

        :param vector_value: The value of the sample.
        :param log_weight: Natural logarithm of weight of the sample.
            Default: 0.0 (as log of 1.0).
        :type vector_value: iterable of float
        :type log_weight: float
        :returns: Squared distance between old average and the new sample
            in the metric defined by the old covariance matrix.
        :rtype: float
        """
        lsw = self.log_sum_weight
        if lsw is not None and lsw < log_weight - 1.0:
            # Pull the old weight sum and the sample weight together
            # (logarithmic average) so the new sample cannot dominate.
            lsw = (lsw + log_weight) / 2.0
            log_weight = lsw
        self.log_sum_weight = lsw
        old_metric = copy.deepcopy(self.covariance_matrix)
        shift = self.add_get_shift(vector_value, log_weight)
        gradient = numpy.linalg.solve(old_metric, shift)
        distance = numpy.vdot(shift, gradient)
        return distance