fix(jumpavg): penalize stdev also for size=2

author Vratko Polak <vrpolak@cisco.com>

Thu, 13 Jul 2023 12:19:37 +0000 (14:19 +0200)

committer Tibor Frank <tifrank@cisco.com>

Fri, 14 Jul 2023 08:16:52 +0000 (08:16 +0000)
author Vratko Polak <vrpolak@cisco.com>
Thu, 13 Jul 2023 12:19:37 +0000 (14:19 +0200)
committer Tibor Frank <tifrank@cisco.com>
Fri, 14 Jul 2023 08:16:52 +0000 (08:16 +0000)
diff --git a/PyPI/jumpavg/README.md b/PyPI/jumpavg/README.md

index e3cae0d..e93e4dc 100644 (file)
--- a/PyPI/jumpavg/README.md
+++ b/PyPI/jumpavg/README.md
@@ -23,6 +23,8 @@ TODO.
  
  TODO: Move into a separate file?
  
++ 0.4.1: Fixed a bug where large stdev was not penalized enough (for size 2 stats, it was not penalized at all).
+
  + 0.4.0: Added "unit" and "sbps" parameters so information content
    is reasonable even if sample values are below one.
  
diff --git a/PyPI/jumpavg/pyproject.toml b/PyPI/jumpavg/pyproject.toml

index 275482e..ee6b4ca 100644 (file)
--- a/PyPI/jumpavg/pyproject.toml
+++ b/PyPI/jumpavg/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "jumpavg"
-version = "0.4.0"
+version = "0.4.1"
  description = "Library for locating changes in time series by grouping results."
  authors = [
      { name = "Cisco Systems Inc. and/or its affiliates", email = "csit-dev@lists.fd.io" },
diff --git a/resources/libraries/python/jumpavg/bit_counting_stats.py b/resources/libraries/python/jumpavg/bit_counting_stats.py

index caece2c..3d1cb8a 100644 (file)
--- a/resources/libraries/python/jumpavg/bit_counting_stats.py
+++ b/resources/libraries/python/jumpavg/bit_counting_stats.py
@@ -97,10 +97,13 @@ class BitCountingStats(AvgStdevStats):
          if self.size < 2:
              return
          stdev = self.stdev / self.unit
-        # Stdev is considered to be uniformly distributed
-        # from zero to max_value. That is quite a bad expectation,
-        # but resilient to negative samples etc.
-        self.bits += math.log(max_value + 1, 2)
+        # Stdev can be anything between zero and max value.
+        # For size==2, the sphere surface is 2 points regardless of radius,
+        # so we need to penalize large stdev already when encoding the stdev.
+        # The simplest way is to use the same distribution as with size...
+        self.bits += math.log((stdev + 1) * (stdev + 2), 2)
+        # ... just with added normalization from the max value cut-off.
+        self.bits += math.log(1 - 1 / (max_value + 2), 2)
          # Now we know the samples lie on sphere in size-1 dimensions.
          # So it is (size-2)-sphere, with radius^2 == stdev^2 * size.
          # https://en.wikipedia.org/wiki/N-sphere
author	Vratko Polak <vrpolak@cisco.com>
	Thu, 13 Jul 2023 12:19:37 +0000 (14:19 +0200)
committer	Tibor Frank <tifrank@cisco.com>
	Fri, 14 Jul 2023 08:16:52 +0000 (08:16 +0000)
PyPI/jumpavg/README.md		patch \| blob \| history
PyPI/jumpavg/pyproject.toml		patch \| blob \| history
resources/libraries/python/jumpavg/bit_counting_stats.py		patch \| blob \| history