X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=resources%2Flibraries%2Fpython%2FPLRsearch%2Fstat_trackers.py;h=2a7a05cae63412b87075b9fc9c6b8e3577cebbf3;hb=9780b57a9640e9ab40e40ec122ac80e09cd74c79;hp=168b09a14e416629ac3736cb1457fc2cf30646ff;hpb=752a4c3304581fa375f520fdb15a9f87604e11be;p=csit.git diff --git a/resources/libraries/python/PLRsearch/stat_trackers.py b/resources/libraries/python/PLRsearch/stat_trackers.py index 168b09a14e..2a7a05cae6 100644 --- a/resources/libraries/python/PLRsearch/stat_trackers.py +++ b/resources/libraries/python/PLRsearch/stat_trackers.py @@ -29,10 +29,10 @@ import numpy # TODO: Teach FD.io CSIT to use multiple dirs in PYTHONPATH, # then switch to absolute imports within PLRsearch package. # Current usage of relative imports is just a short term workaround. -from log_plus import log_plus # pylint: disable=relative-import +from .log_plus import log_plus, safe_exp -class ScalarStatTracker(object): +class ScalarStatTracker: """Class for tracking one-dimensional samples. Variance of one-dimensional data cannot be negative, @@ -59,11 +59,13 @@ class ScalarStatTracker(object): self.log_variance = log_variance def __repr__(self): - """Return string, which interpreted constructs state of self.""" - return ("ScalarStatTracker(log_sum_weight={lsw!r},average={a!r}," - "log_variance={lv!r})".format( - lsw=self.log_sum_weight, a=self.average, - lv=self.log_variance)) + """Return string, which interpreted constructs state of self. + + :returns: Expression constructing an equivalent instance. + :rtype: str + """ + return f"ScalarStatTracker(log_sum_weight={self.log_sum_weight!r}," \ + f"average={self.average!r},log_variance={self.log_variance!r})" def copy(self): """Return new ScalarStatTracker instance with the same state as self. 
@@ -75,7 +77,8 @@ class ScalarStatTracker(object): :rtype: ScalarStatTracker """ return ScalarStatTracker( - self.log_sum_weight, self.average, self.log_variance) + self.log_sum_weight, self.average, self.log_variance + ) def add(self, scalar_value, log_weight=0.0): """Return updated stats corresponding to addition of another sample. @@ -130,7 +133,6 @@ class ScalarDualStatTracker(ScalarStatTracker): One typical use is for Monte Carlo integrator to decide whether the partial sums so far are reliable enough. """ - def __init__( self, log_sum_weight=None, average=0.0, log_variance=None, log_sum_secondary_weight=None, secondary_average=0.0, @@ -164,20 +166,23 @@ class ScalarDualStatTracker(ScalarStatTracker): # so in case of diamond inheritance mismatch would be probable. ScalarStatTracker.__init__(self, log_sum_weight, average, log_variance) self.secondary = ScalarStatTracker( - log_sum_secondary_weight, secondary_average, log_secondary_variance) + log_sum_secondary_weight, secondary_average, log_secondary_variance + ) self.max_log_weight = max_log_weight def __repr__(self): - """Return string, which interpreted constructs state of self.""" + """Return string, which interpreted constructs state of self. + + :returns: Expression constructing an equivalent instance. 
+ :rtype: str + """ sec = self.secondary - return ( - "ScalarDualStatTracker(log_sum_weight={lsw!r},average={a!r}," - "log_variance={lv!r},log_sum_secondary_weight={lssw!r}," - "secondary_average={sa!r},log_secondary_variance={lsv!r}," - "max_log_weight={mlw!r})".format( - lsw=self.log_sum_weight, a=self.average, lv=self.log_variance, - lssw=sec.log_sum_weight, sa=sec.average, lsv=sec.log_variance, - mlw=self.max_log_weight)) + return f"ScalarDualStatTracker(log_sum_weight={self.log_sum_weight!r},"\ + f"average={self.average!r},log_variance={self.log_variance!r}," \ + f"log_sum_secondary_weight={sec.log_sum_weight!r}," \ + f"secondary_average={sec.average!r}," \ + f"log_secondary_variance={sec.log_variance!r}," \ + f"max_log_weight={self.max_log_weight!r})" def add(self, scalar_value, log_weight=0.0): """Return updated both stats after addition of another sample. @@ -201,8 +206,28 @@ class ScalarDualStatTracker(ScalarStatTracker): primary.add(scalar_value, log_weight) return self + def get_pessimistic_variance(self): + """Return estimate of variance reflecting weight effects. + + Typical scenario is the primary tracker dominated by a single sample. + In the worst case, secondary tracker is also dominated by + a single (but different) sample. -class VectorStatTracker(object): + Current implementation simply returns variance of average + of the two trackers, as if they were independent. + + :returns: Pessimistic estimate of variance (not stdev, no log). + :rtype: float + """ + var_primary = safe_exp(self.log_variance) + var_secondary = safe_exp(self.secondary.log_variance) + var_combined = (var_primary + var_secondary) / 2 + avg_half_diff = (self.average - self.secondary.average) / 2 + var_combined += avg_half_diff * avg_half_diff + return var_combined + + +class VectorStatTracker: """Class for tracking multi-dimensional samples. 
Contrary to one-dimensional data, multi-dimensional covariance matrix @@ -219,11 +244,11 @@ class VectorStatTracker(object): def __init__( self, dimension=2, log_sum_weight=None, averages=None, covariance_matrix=None): - """Initialize new tracker instance, two-dimenstional empty by default. + """Initialize new tracker instance, two-dimensional empty by default. If any of latter two arguments is None, it means the tracker state is invalid. Use reset method - to create empty tracker of constructed dimentionality. + to create empty tracker of constructed dimensionality. :param dimension: Number of scalar components of samples. :param log_sum_weight: Natural logarithm of sum of weights @@ -244,13 +269,13 @@ class VectorStatTracker(object): def __repr__(self): """Return string, which interpreted constructs state of self. - :returns: Expression contructing an equivalent instance. - :rtype: str""" - return ( - "VectorStatTracker(dimension={d!r},log_sum_weight={lsw!r}," - "averages={a!r},covariance_matrix={cm!r})".format( - d=self.dimension, lsw=self.log_sum_weight, a=self.averages, - cm=self.covariance_matrix)) + :returns: Expression constructing an equivalent instance. + :rtype: str + """ + return f"VectorStatTracker(dimension={self.dimension!r}," \ + f"log_sum_weight={self.log_sum_weight!r}," \ + f"averages={self.averages!r}," \ + f"covariance_matrix={self.covariance_matrix!r})" def copy(self): """Return new instance with the same state as self. @@ -262,8 +287,9 @@ class VectorStatTracker(object): :rtype: VectorStatTracker """ return VectorStatTracker( - self.dimension, self.log_sum_weight, self.averages, - self.covariance_matrix) + self.dimension, self.log_sum_weight, self.averages[:], + copy.deepcopy(self.covariance_matrix) + ) def reset(self): """Return state set to empty data of proper dimensionality. 
@@ -273,8 +299,9 @@ class VectorStatTracker(object): """ self.averages = [0.0 for _ in range(self.dimension)] # TODO: Examine whether we can gain speed by tracking triangle only. - self.covariance_matrix = [[0.0 for _ in range(self.dimension)] - for _ in range(self.dimension)] + self.covariance_matrix = [ + [0.0 for _ in range(self.dimension)] for _ in range(self.dimension) + ] # TODO: In Python3, list comprehensions are generators, # so they are not indexable. Put list() when converting. return self @@ -288,6 +315,7 @@ class VectorStatTracker(object): self.reset() for index in range(self.dimension): self.covariance_matrix[index][index] = 1.0 + return self def add_get_shift(self, vector_value, log_weight=0.0): """Return shift and update state to addition of another sample. @@ -300,17 +328,19 @@ class VectorStatTracker(object): Default: 0.0 (as log of 1.0). :type vector_value: iterable of float :type log_weight: float - :returns: Updated self. - :rtype: VectorStatTracker + :returns: Shift vector + :rtype: list of float """ dimension = self.dimension old_log_sum_weight = self.log_sum_weight old_averages = self.averages if not old_averages: - shift = [0.0 for index in range(dimension)] + shift = [0.0 for _ in range(dimension)] else: - shift = [vector_value[index] - old_averages[index] - for index in range(dimension)] + shift = [ + vector_value[index] - old_averages[index] + for index in range(dimension) + ] if old_log_sum_weight is None: # First sample. 
self.log_sum_weight = log_weight @@ -321,8 +351,10 @@ class VectorStatTracker(object): new_log_sum_weight = log_plus(old_log_sum_weight, log_weight) data_ratio = math.exp(old_log_sum_weight - new_log_sum_weight) sample_ratio = math.exp(log_weight - new_log_sum_weight) - new_averages = [old_averages[index] + shift[index] * sample_ratio - for index in range(dimension)] + new_averages = [ + old_averages[index] + shift[index] * sample_ratio + for index in range(dimension) + ] # It is easier to update covariance matrix in-place. for second in range(dimension): for first in range(dimension): @@ -344,7 +376,7 @@ class VectorStatTracker(object): If the weight of the incoming sample is far bigger than the weight of all the previous data together, - convariance matrix would suffer from underflows. + covariance matrix would suffer from underflow. To avoid that, this method manipulates both weights before calling add().