X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=resources%2Flibraries%2Fpython%2FPLRsearch%2Fstat_trackers.py;h=2a7a05cae63412b87075b9fc9c6b8e3577cebbf3;hb=9780b57a9640e9ab40e40ec122ac80e09cd74c79;hp=168b09a14e416629ac3736cb1457fc2cf30646ff;hpb=752a4c3304581fa375f520fdb15a9f87604e11be;p=csit.git diff --git a/resources/libraries/python/PLRsearch/stat_trackers.py b/resources/libraries/python/PLRsearch/stat_trackers.py index 168b09a14e..2a7a05cae6 100644 --- a/resources/libraries/python/PLRsearch/stat_trackers.py +++ b/resources/libraries/python/PLRsearch/stat_trackers.py @@ -29,10 +29,10 @@ import numpy # TODO: Teach FD.io CSIT to use multiple dirs in PYTHONPATH, # then switch to absolute imports within PLRsearch package. # Current usage of relative imports is just a short term workaround. -from log_plus import log_plus # pylint: disable=relative-import +from .log_plus import log_plus, safe_exp -class ScalarStatTracker(object): +class ScalarStatTracker: """Class for tracking one-dimensional samples. Variance of one-dimensional data cannot be negative, @@ -59,11 +59,13 @@ class ScalarStatTracker(object): self.log_variance = log_variance def __repr__(self): - """Return string, which interpreted constructs state of self.""" - return ("ScalarStatTracker(log_sum_weight={lsw!r},average={a!r}," - "log_variance={lv!r})".format( - lsw=self.log_sum_weight, a=self.average, - lv=self.log_variance)) + """Return string, which interpreted constructs state of self. + + :returns: Expression constructing an equivalent instance. + :rtype: str + """ + return f"ScalarStatTracker(log_sum_weight={self.log_sum_weight!r}," \ + f"average={self.average!r},log_variance={self.log_variance!r})" def copy(self): """Return new ScalarStatTracker instance with the same state as self. 
@@ -75,7 +77,8 @@ class ScalarStatTracker(object): :rtype: ScalarStatTracker """ return ScalarStatTracker( - self.log_sum_weight, self.average, self.log_variance) + self.log_sum_weight, self.average, self.log_variance + ) def add(self, scalar_value, log_weight=0.0): """Return updated stats corresponding to addition of another sample. @@ -130,7 +133,6 @@ class ScalarDualStatTracker(ScalarStatTracker): One typical use is for Monte Carlo integrator to decide whether the partial sums so far are reliable enough. """ - def __init__( self, log_sum_weight=None, average=0.0, log_variance=None, log_sum_secondary_weight=None, secondary_average=0.0, @@ -164,20 +166,23 @@ class ScalarDualStatTracker(ScalarStatTracker): # so in case of diamond inheritance mismatch would be probable. ScalarStatTracker.__init__(self, log_sum_weight, average, log_variance) self.secondary = ScalarStatTracker( - log_sum_secondary_weight, secondary_average, log_secondary_variance) + log_sum_secondary_weight, secondary_average, log_secondary_variance + ) self.max_log_weight = max_log_weight def __repr__(self): - """Return string, which interpreted constructs state of self.""" + """Return string, which interpreted constructs state of self. + + :returns: Expression constructing an equivalent instance. 
+ :rtype: str + """ sec = self.secondary - return ( - "ScalarDualStatTracker(log_sum_weight={lsw!r},average={a!r}," - "log_variance={lv!r},log_sum_secondary_weight={lssw!r}," - "secondary_average={sa!r},log_secondary_variance={lsv!r}," - "max_log_weight={mlw!r})".format( - lsw=self.log_sum_weight, a=self.average, lv=self.log_variance, - lssw=sec.log_sum_weight, sa=sec.average, lsv=sec.log_variance, - mlw=self.max_log_weight)) + return f"ScalarDualStatTracker(log_sum_weight={self.log_sum_weight!r},"\ + f"average={self.average!r},log_variance={self.log_variance!r}," \ + f"log_sum_secondary_weight={sec.log_sum_weight!r}," \ + f"secondary_average={sec.average!r}," \ + f"log_secondary_variance={sec.log_variance!r}," \ + f"max_log_weight={self.max_log_weight!r})" def add(self, scalar_value, log_weight=0.0): """Return updated both stats after addition of another sample. @@ -201,8 +206,28 @@ class ScalarDualStatTracker(ScalarStatTracker): primary.add(scalar_value, log_weight) return self + def get_pessimistic_variance(self): + """Return estimate of variance reflecting weight effects. + + Typical scenario is the primary tracker dominated by a single sample. + In the worst case, secondary tracker is also dominated by + a single (but different) sample. -class VectorStatTracker(object): + Current implementation simply returns variance of average + of the two trackers, as if they were independent. + + :returns: Pessimistic estimate of variance (not stdev, no log). + :rtype: float + """ + var_primary = safe_exp(self.log_variance) + var_secondary = safe_exp(self.secondary.log_variance) + var_combined = (var_primary + var_secondary) / 2 + avg_half_diff = (self.average - self.secondary.average) / 2 + var_combined += avg_half_diff * avg_half_diff + return var_combined + + +class VectorStatTracker: """Class for tracking multi-dimensional samples. 
Contrary to one-dimensional data, multi-dimensional covariance matrix @@ -219,11 +244,11 @@ class VectorStatTracker(object): def __init__( self, dimension=2, log_sum_weight=None, averages=None, covariance_matrix=None): - """Initialize new tracker instance, two-dimenstional empty by default. + """Initialize new tracker instance, two-dimensional empty by default. If any of latter two arguments is None, it means the tracker state is invalid. Use reset method - to create empty tracker of constructed dimentionality. + to create empty tracker of constructed dimensionality. :param dimension: Number of scalar components of samples. :param log_sum_weight: Natural logarithm of sum of weights @@ -244,13 +269,13 @@ class VectorStatTracker(object): def __repr__(self): """Return string, which interpreted constructs state of self. - :returns: Expression contructing an equivalent instance. - :rtype: str""" - return ( - "VectorStatTracker(dimension={d!r},log_sum_weight={lsw!r}," - "averages={a!r},covariance_matrix={cm!r})".format( - d=self.dimension, lsw=self.log_sum_weight, a=self.averages, - cm=self.covariance_matrix)) + :returns: Expression constructing an equivalent instance. + :rtype: str + """ + return f"VectorStatTracker(dimension={self.dimension!r}," \ + f"log_sum_weight={self.log_sum_weight!r}," \ + f"averages={self.averages!r}," \ + f"covariance_matrix={self.covariance_matrix!r})" def copy(self): """Return new instance with the same state as self. @@ -262,8 +287,9 @@ class VectorStatTracker(object): :rtype: VectorStatTracker """ return VectorStatTracker( - self.dimension, self.log_sum_weight, self.averages, - self.covariance_matrix) + self.dimension, self.log_sum_weight, self.averages[:], + copy.deepcopy(self.covariance_matrix) + ) def reset(self): """Return state set to empty data of proper dimensionality. 
@@ -273,8 +299,9 @@ class VectorStatTracker(object): """ self.averages = [0.0 for _ in range(self.dimension)] # TODO: Examine whether we can gain speed by tracking triangle only. - self.covariance_matrix = [[0.0 for _ in range(self.dimension)] - for _ in range(self.dimension)] + self.covariance_matrix = [ + [0.0 for _ in range(self.dimension)] for _ in range(self.dimension) + ] # TODO: In Python3, list comprehensions are generators, # so they are not indexable. Put list() when converting. return self @@ -288,6 +315,7 @@ class VectorStatTracker(object): self.reset() for index in range(self.dimension): self.covariance_matrix[index][index] = 1.0 + return self def add_get_shift(self, vector_value, log_weight=0.0): """Return shift and update state to addition of another sample. @@ -300,17 +328,19 @@ class VectorStatTracker(object): Default: 0.0 (as log of 1.0). :type vector_value: iterable of float :type log_weight: float - :returns: Updated self. - :rtype: VectorStatTracker + :returns: Shift vector + :rtype: list of float """ dimension = self.dimension old_log_sum_weight = self.log_sum_weight old_averages = self.averages if not old_averages: - shift = [0.0 for index in range(dimension)] + shift = [0.0 for _ in range(dimension)] else: - shift = [vector_value[index] - old_averages[index] - for index in range(dimension)] + shift = [ + vector_value[index] - old_averages[index] + for index in range(dimension) + ] if old_log_sum_weight is None: # First sample. 
self.log_sum_weight = log_weight @@ -321,8 +351,10 @@ class VectorStatTracker(object): new_log_sum_weight = log_plus(old_log_sum_weight, log_weight) data_ratio = math.exp(old_log_sum_weight - new_log_sum_weight) sample_ratio = math.exp(log_weight - new_log_sum_weight) - new_averages = [old_averages[index] + shift[index] * sample_ratio - for index in range(dimension)] + new_averages = [ + old_averages[index] + shift[index] * sample_ratio + for index in range(dimension) + ] # It is easier to update covariance matrix in-place. for second in range(dimension): for first in range(dimension): @@ -344,7 +376,7 @@ class VectorStatTracker(object): If the weight of the incoming sample is far bigger than the weight of all the previous data together, - convariance matrix would suffer from underflows. + covariance matrix would suffer from underflow. To avoid that, this method manipulates both weights before calling add().