Review feedback tuneups

* Consolidate common code for entropy limit back into a single method, and rework properties related to it so they are cleaner. * Invert sensitivity scale; adjust math and doc to match. It's still weird but aligns more closely to the underlying entropy metric.
godaddy · Nov 12, 2021 · 61497ac · 61497ac
1 parent 88f5c3e
commit 61497ac
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 64 deletions.
diff --git a/README.md b/README.md
@@ -136,11 +136,11 @@ Options:
   --entropy-sensitivity INTEGER RANGE
                                   Modify entropy detection sensitivity. This
                                   is expressed as on a scale of 0 to 100,
-                                  where 0 means "totally random" and 100 means
-                                  "totally nonrandom". Increasing the
+                                  where 0 means "totally nonrandom" and 100
+                                  means "totally random". Decreasing the
                                   scanner's sensitivity increases the
                                   likelihood that a given string will be
-                                  identified as suspicious.  [default: 25]
+                                  identified as suspicious.  [default: 75]
 
   -b64, --b64-entropy-score TEXT  [DEPRECATED] Use `--entropy-sensitivity`.
                                   Modify the base64 entropy score. If a value

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,6 +49,7 @@ sphinx-click = {version = "^2.5.0", optional = true}
 sphinx-rtd-theme = {version = "^0.5.0", optional = true}
 sphinxcontrib-spelling = {version = "^5.4.0", optional = true}
 tomlkit = "^0.7.2"
+cached-property = "^1.5.2"
 
 [tool.poetry.dev-dependencies]
 black = {version = "21.5b2", allow-prereleases = true, markers = "platform_python_implementation == 'CPython'"}

diff --git a/tartufo/cli.py b/tartufo/cli.py
@@ -203,11 +203,11 @@ def get_command(self, ctx: click.Context, cmd_name: str) -> Optional[click.Comma
 @click.option(
     "--entropy-sensitivity",
     type=click.IntRange(0, 100),
-    default=25,
+    default=75,
     show_default=True,
     help="""Modify entropy detection sensitivity. This is expressed as on a scale
-    of 0 to 100, where 0 means "totally random" and 100 means "totally nonrandom".
-    Increasing the scanner's sensitivity increases the likelihood that a given
+    of 0 to 100, where 0 means "totally nonrandom" and 100 means "totally random".
+    Decreasing the scanner's sensitivity increases the likelihood that a given
     string will be identified as suspicious.""",
 )
 @click.option(

diff --git a/tartufo/scanner.py b/tartufo/scanner.py
@@ -21,9 +21,9 @@
 )
 import warnings
 
+from cached_property import cached_property
 import click
 import git
-
 import pygit2
 
 from tartufo import config, types, util
@@ -147,25 +147,22 @@ def __init__(self, options: types.GlobalOptions) -> None:
         self.global_options = options
         self.logger = logging.getLogger(__name__)
 
-    @lru_cache(maxsize=None)
-    def compute_hex_entropy_limit(self) -> float:
-        """Determine low limit for suspicious hexadecimal encodings"""
-
-        # entropy_score is a probability (between 0.0 and 1.0) that a given string
-        # is random. Strings that are at least this likely to be random will result
-        # in findings. We convert this from "sensitivity" (0-100) which is inverted
-        # so that intuitively "more sensitive" means "more likely to flag a given
-        # string as suspicious."
+    def compute_scaled_entropy_limit(self, maximum_bitrate: float) -> float:
+        """Determine low entropy cutoff for specified bitrate
+
+        :param maximum_bitrate: How many bits does each character represent?
+        :returns: Entropy detection threshold scaled to the input bitrate
+        """
+
         if self.global_options.entropy_sensitivity is None:
-            sensitivity = 25
+            sensitivity = 75
         else:
             sensitivity = self.global_options.entropy_sensitivity
-        entropy_score = float(100 - sensitivity) / 100.0
+        return float(sensitivity) / 100.0 * maximum_bitrate
 
-        # Each hexadecimal digit represents a 4-bit number, so we want to scale
-        # the base score by this amount to account for the efficiency of the
-        # string representation we're examining.
-        hex_entropy_score = entropy_score * 4.0
+    @cached_property
+    def hex_entropy_limit(self) -> float:
+        """Returns low entropy limit for suspicious hexadecimal encodings"""
 
         # For backwards compatibility, allow the caller to manipulate this score
         # directly (but complain about it).
@@ -174,36 +171,16 @@ def compute_hex_entropy_limit(self) -> float:
                 "--hex-entropy-score is deprecated. Use --entropy-sensitivity instead.",
                 DeprecationWarning,
             )
-            hex_entropy_score = self.global_options.hex_entropy_score
+            return self.global_options.hex_entropy_score
 
-        return hex_entropy_score
-
-    @property
-    def hex_entropy_limit(self) -> float:
-        """Returns low limit for suspicious hexadecimal encodings"""
-
-        return self.compute_hex_entropy_limit()
-
-    @lru_cache(maxsize=None)
-    def compute_b64_entropy_limit(self) -> float:
-        """Returns low limit for suspicious base64 encodings"""
-
-        # entropy_score is a probability (between 0.0 and 1.0) that a given string
-        # is random. Strings that are at least this likely to be random will result
-        # in findings. We convert this from "sensitivity" (0-100) which is inverted
-        # so that intuitively "more sensitive" means "more likely to flag a given
-        # string as suspicious."
-        if self.global_options.entropy_sensitivity is None:
-            sensitivity = 25
-        else:
-            sensitivity = self.global_options.entropy_sensitivity
-        entropy_score = float(100 - sensitivity) / 100.0
+        # Each hexadecimal digit represents a 4-bit number, so we want to scale
+        # the base score by this amount to account for the efficiency of the
+        # string representation we're examining.
+        return self.compute_scaled_entropy_limit(4.0)
 
-        # Each 4-character base64 group represents 3 8-bit bytes, i.e. an effective
-        # bit rate of 24/4 = 6 bits per character. We want to scale the base score
-        # by this amount to account for the efficiency of the string representation
-        # we're examining.
-        b64_entropy_score = entropy_score * 6.0
+    @cached_property
+    def b64_entropy_limit(self) -> float:
+        """Returns low entropy limit for suspicious base64 encodings"""
 
         # For backwards compatibility, allow the caller to manipulate this score
         # directly (but complain about it).
@@ -212,15 +189,13 @@ def compute_b64_entropy_limit(self) -> float:
                 "--b64-entropy-score is deprecated. Use --entropy-sensitivity instead.",
                 DeprecationWarning,
             )
-            b64_entropy_score = self.global_options.b64_entropy_score
+            return self.global_options.b64_entropy_score
 
-        return b64_entropy_score
-
-    @property
-    def b64_entropy_limit(self) -> float:
-        """Returns low limit for suspicious base64 encodings"""
-
-        return self.compute_b64_entropy_limit()
+        # Each 4-character base64 group represents 3 8-bit bytes, i.e. an effective
+        # bit rate of 24/4 = 6 bits per character. We want to scale the base score
+        # by this amount to account for the efficiency of the string representation
+        # we're examining.
+        return self.compute_scaled_entropy_limit(6.0)
 
     @property
     def completed(self) -> bool:

diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py
@@ -578,16 +578,24 @@ def test_sensitivity_low_end_calculation(self):
         test_scanner = TestScanner(self.options)
 
         # 0% sensitivity means required entropy rate will be zero
-        self.assertEqual(test_scanner.b64_entropy_limit, 6.0)
-        self.assertEqual(test_scanner.hex_entropy_limit, 4.0)
+        self.assertEqual(test_scanner.b64_entropy_limit, 0.0)
+        self.assertEqual(test_scanner.hex_entropy_limit, 0.0)
 
     def test_sensitivity_high_end_calculation(self):
         self.options.entropy_sensitivity = 100
         test_scanner = TestScanner(self.options)
 
         # 100% sensitivity means entropy rate must equal bit rate
-        self.assertEqual(test_scanner.b64_entropy_limit, 0.0)
-        self.assertEqual(test_scanner.hex_entropy_limit, 0.0)
+        self.assertEqual(test_scanner.b64_entropy_limit, 6.0)
+        self.assertEqual(test_scanner.hex_entropy_limit, 4.0)
+
+    def test_sensitivity_deprecated_overrides(self):
+        self.options.b64_entropy_score = 11.1
+        self.options.hex_entropy_score = 22.2
+        test_scanner = TestScanner(self.options)
+
+        self.assertEqual(test_scanner.b64_entropy_limit, 11.1)
+        self.assertEqual(test_scanner.hex_entropy_limit, 22.2)
 
 
 if __name__ == "__main__":