-# Copyright (c) 2019 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
# TODO: Teach FD.io CSIT to use multiple dirs in PYTHONPATH,
# then switch to absolute imports within PLRsearch package.
# Current usage of relative imports is just a short term workaround.
-from log_plus import log_plus # pylint: disable=relative-import
+from .log_plus import log_plus, safe_exp
-class ScalarStatTracker(object):
+class ScalarStatTracker:
"""Class for tracking one-dimensional samples.
Variance of one-dimensional data cannot be negative,
self.log_variance = log_variance
def __repr__(self):
- """Return string, which interpreted constructs state of self."""
- return ("ScalarStatTracker(log_sum_weight={lsw!r},average={a!r},"
- "log_variance={lv!r})".format(
- lsw=self.log_sum_weight, a=self.average,
- lv=self.log_variance))
+ """Return string, which interpreted constructs state of self.
+
+ :returns: Expression constructing an equivalent instance.
+ :rtype: str
+ """
+ return f"ScalarStatTracker(log_sum_weight={self.log_sum_weight!r}," \
+ f"average={self.average!r},log_variance={self.log_variance!r})"
def copy(self):
"""Return new ScalarStatTracker instance with the same state as self.
:rtype: ScalarStatTracker
"""
return ScalarStatTracker(
- self.log_sum_weight, self.average, self.log_variance)
+ self.log_sum_weight, self.average, self.log_variance
+ )
def add(self, scalar_value, log_weight=0.0):
"""Return updated stats corresponding to addition of another sample.
One typical use is for Monte Carlo integrator to decide whether
the partial sums so far are reliable enough.
"""
-
def __init__(
self, log_sum_weight=None, average=0.0, log_variance=None,
log_sum_secondary_weight=None, secondary_average=0.0,
# so in case of diamond inheritance mismatch would be probable.
ScalarStatTracker.__init__(self, log_sum_weight, average, log_variance)
self.secondary = ScalarStatTracker(
- log_sum_secondary_weight, secondary_average, log_secondary_variance)
+ log_sum_secondary_weight, secondary_average, log_secondary_variance
+ )
self.max_log_weight = max_log_weight
def __repr__(self):
- """Return string, which interpreted constructs state of self."""
+ """Return string, which interpreted constructs state of self.
+
+ :returns: Expression constructing an equivalent instance.
+ :rtype: str
+ """
sec = self.secondary
- return (
- "ScalarDualStatTracker(log_sum_weight={lsw!r},average={a!r},"
- "log_variance={lv!r},log_sum_secondary_weight={lssw!r},"
- "secondary_average={sa!r},log_secondary_variance={lsv!r},"
- "max_log_weight={mlw!r})".format(
- lsw=self.log_sum_weight, a=self.average, lv=self.log_variance,
- lssw=sec.log_sum_weight, sa=sec.average, lsv=sec.log_variance,
- mlw=self.max_log_weight))
+ return f"ScalarDualStatTracker(log_sum_weight={self.log_sum_weight!r},"\
+ f"average={self.average!r},log_variance={self.log_variance!r}," \
+ f"log_sum_secondary_weight={sec.log_sum_weight!r}," \
+ f"secondary_average={sec.average!r}," \
+ f"log_secondary_variance={sec.log_variance!r}," \
+ f"max_log_weight={self.max_log_weight!r})"
def add(self, scalar_value, log_weight=0.0):
"""Return updated both stats after addition of another sample.
primary.add(scalar_value, log_weight)
return self
+ def get_pessimistic_variance(self):
+ """Return estimate of variance reflecting weight effects.
+
+ Typical scenario is the primary tracker dominated by a single sample.
+ In the worst case, the secondary tracker is also dominated by
+ a single (but different) sample.
-class VectorStatTracker(object):
+ Current implementation simply returns variance of average
+ of the two trackers, as if they were independent.
+
+ :returns: Pessimistic estimate of variance (not stdev, no log).
+ :rtype: float
+ """
+ var_primary = safe_exp(self.log_variance)
+ var_secondary = safe_exp(self.secondary.log_variance)
+ var_combined = (var_primary + var_secondary) / 2
+ avg_half_diff = (self.average - self.secondary.average) / 2
+ var_combined += avg_half_diff * avg_half_diff
+ return var_combined
+
+
+class VectorStatTracker:
"""Class for tracking multi-dimensional samples.
Contrary to one-dimensional data, multi-dimensional covariance matrix
def __init__(
self, dimension=2, log_sum_weight=None, averages=None,
covariance_matrix=None):
- """Initialize new tracker instance, two-dimenstional empty by default.
+ """Initialize new tracker instance, two-dimensional empty by default.
If any of latter two arguments is None, it means
the tracker state is invalid. Use reset method
- to create empty tracker of constructed dimentionality.
+ to create empty tracker of constructed dimensionality.
:param dimension: Number of scalar components of samples.
:param log_sum_weight: Natural logarithm of sum of weights
def __repr__(self):
"""Return string, which interpreted constructs state of self.
- :returns: Expression contructing an equivalent instance.
- :rtype: str"""
- return (
- "VectorStatTracker(dimension={d!r},log_sum_weight={lsw!r},"
- "averages={a!r},covariance_matrix={cm!r})".format(
- d=self.dimension, lsw=self.log_sum_weight, a=self.averages,
- cm=self.covariance_matrix))
+ :returns: Expression constructing an equivalent instance.
+ :rtype: str
+ """
+ return f"VectorStatTracker(dimension={self.dimension!r}," \
+ f"log_sum_weight={self.log_sum_weight!r}," \
+ f"averages={self.averages!r}," \
+ f"covariance_matrix={self.covariance_matrix!r})"
def copy(self):
"""Return new instance with the same state as self.
:rtype: VectorStatTracker
"""
return VectorStatTracker(
- self.dimension, self.log_sum_weight, self.averages,
- self.covariance_matrix)
+ self.dimension, self.log_sum_weight, self.averages[:],
+ copy.deepcopy(self.covariance_matrix)
+ )
def reset(self):
"""Return state set to empty data of proper dimensionality.
"""
self.averages = [0.0 for _ in range(self.dimension)]
# TODO: Examine whether we can gain speed by tracking triangle only.
- self.covariance_matrix = [[0.0 for _ in range(self.dimension)]
- for _ in range(self.dimension)]
+ self.covariance_matrix = [
+ [0.0 for _ in range(self.dimension)] for _ in range(self.dimension)
+ ]
# TODO: In Python3, list comprehensions are generators,
# so they are not indexable. Put list() when converting.
return self
self.reset()
for index in range(self.dimension):
self.covariance_matrix[index][index] = 1.0
+ return self
def add_get_shift(self, vector_value, log_weight=0.0):
"""Return shift and update state to addition of another sample.
Default: 0.0 (as log of 1.0).
:type vector_value: iterable of float
:type log_weight: float
- :returns: Updated self.
- :rtype: VectorStatTracker
+ :returns: Shift vector
+ :rtype: list of float
"""
dimension = self.dimension
old_log_sum_weight = self.log_sum_weight
old_averages = self.averages
if not old_averages:
- shift = [0.0 for index in range(dimension)]
+ shift = [0.0 for _ in range(dimension)]
else:
- shift = [vector_value[index] - old_averages[index]
- for index in range(dimension)]
+ shift = [
+ vector_value[index] - old_averages[index]
+ for index in range(dimension)
+ ]
if old_log_sum_weight is None:
# First sample.
self.log_sum_weight = log_weight
new_log_sum_weight = log_plus(old_log_sum_weight, log_weight)
data_ratio = math.exp(old_log_sum_weight - new_log_sum_weight)
sample_ratio = math.exp(log_weight - new_log_sum_weight)
- new_averages = [old_averages[index] + shift[index] * sample_ratio
- for index in range(dimension)]
+ new_averages = [
+ old_averages[index] + shift[index] * sample_ratio
+ for index in range(dimension)
+ ]
# It is easier to update covariance matrix in-place.
for second in range(dimension):
for first in range(dimension):
If the weight of the incoming sample is far bigger
than the weight of all the previous data together,
- convariance matrix would suffer from underflows.
+ covariance matrix would suffer from underflow.
To avoid that, this method manipulates both weights
before calling add().