Add --sensitivity option

Fixes #265. Provides a more comprehensible alternative for tuning entropy checking. The new `--sensitivity` option is applied consistently across all target character sets, and is stated in a way that is slightly easier to understand ("higher means more likely to flag a given string"). The older `--b64-entropy-score` and `--hex-entropy-score` options are marked as deprecated but retained for backwards compatibility (and they override `--sensitivity` if used together with it).
godaddy · Nov 12, 2021 · 42ad8ac · 42ad8ac
1 parent ca46e5c
commit 42ad8ac
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 67 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,10 @@ Features:
   `--scan-filenames/--no-scan-filenames` flag which allows users to enable or disable file name scanning.
 * [#254](https://github.com/godaddy/tartufo/pull/260) - Changes the default value of
   `--regex/--no-regex` to True.
+* [#265](https://github.com/godaddy/tartufo/issues/265) - Adds new `--sensitivity`
+  option which provides a friendlier way to adjust entropy detection sensitivity.
+  This replaces `--b64-entropy-score` and `--hex-entropy-score`, which are now
+  marked as deprecated.
 
 Misc:
 

diff --git a/tartufo/cli.py b/tartufo/cli.py
@@ -200,25 +200,33 @@ def get_command(self, ctx: click.Context, cmd_name: str) -> Optional[click.Comma
     show_default=True,
     help="Enable or disable timestamps in logging messages.",
 )
+@click.option(
+    "--sensitivity",
+    type=click.IntRange(0, 100),
+    default=25,
+    show_default=True,
+    help="""Modify entropy detection sensitivity. This is expressed as the probability
+    (between 0 and 100) a given string is *not* random and should be ignored.
+    A higher value increases the likelihood a string will be considered suspicious
+    and result in a finding.""",
+)
 @click.option(
     "-b64",
     "--b64-entropy-score",
-    default=4.5,
-    show_default=True,
-    help="Modify the base64 entropy score. If a value greater than the default is "
-    "specified, tartufo lists higher entropy base64 strings (longer or more randomized "
-    "strings). A lower value lists lower entropy base64 strings (shorter or less "
-    "randomized strings).",
+    help="""[DEPRECATED] Use `--sensitivity`. Modify the base64 entropy score. If
+    a value greater than the default (4.5 in a range of 0.0-6.0) is specified,
+    tartufo lists higher entropy base64 strings (longer or more randomized strings).
+    A lower value lists lower entropy base64 strings (shorter or less randomized
+    strings).""",
 )
 @click.option(
     "-hex",
     "--hex-entropy-score",
-    default=3.0,
-    show_default=True,
-    help="Modify the hexadecimal entropy score. If a value greater than the default is "
-    "specified, tartufo lists higher entropy hexadecimal strings (longer or more randomized "
-    "strings). A lower value lists lower entropy hexadecimal strings (shorter or less "
-    "randomized strings).",
+    help="""[DEPRECATED] Use `--sensitivity`. Modify the hexadecimal entropy score.
+    If a value greater than the default (3.0 in a range of 0.0-4.0) is specified,
+    tartufo lists higher entropy hexadecimal strings (longer or more randomized
+    strings). A lower value lists lower entropy hexadecimal strings (shorter or less
+    randomized strings).""",
 )
 # The first positional argument here would be a hard-coded version, hence the `None`
 @click.version_option(None, "-V", "--version")

diff --git a/tartufo/scanner.py b/tartufo/scanner.py
@@ -141,11 +141,46 @@ class ScannerBase(abc.ABC):  # pylint: disable=too-many-instance-attributes
     global_options: types.GlobalOptions
     logger: logging.Logger
     _scan_lock: threading.Lock = threading.Lock()
+    _hex_entropy_score: float = 3.0
+    _b64_entropy_score: float = 4.5
 
     def __init__(self, options: types.GlobalOptions) -> None:
         self.global_options = options
         self.logger = logging.getLogger(__name__)
 
+        # entropy_score is a probability (between 0.0 and 1.0) that a given string
+        # is random. Strings that are at least this likely to be random will result
+        # in findings. We convert this from "sensitivity" (0-100) which is inverted
+        # so that intuitively "more sensitive" means "more likely to flag a given
+        # string as suspicious."
+        if self.global_options.sensitivity is None:
+            sensitivity = 25
+        else:
+            sensitivity = self.global_options.sensitivity
+        entropy_score = float(100 - sensitivity) / 100.0
+
+        # We now compute an effective score for each type of entropy string by
+        # multiplying by the number of bits expressed in each character of the
+        # string's character set:
+        # hex character 0-f is 16 digits = 2^4 digits = 4 bits/character
+        # base64 represents 24 bits using 4 characters = 6 bits/character
+        self._hex_entropy_score = entropy_score * 4.0
+        self._b64_entropy_score = entropy_score * 6.0
+
+        # For backwards compatibility, allow the caller to manipulate each of
+        # these representation-specific scores directly (but complain about it).
+        if self.global_options.hex_entropy_score:
+            warnings.warn(
+                "--hex-entropy-score is deprecated. Use --sensitivity instead."
+            )
+            self._hex_entropy_score = self.global_options.hex_entropy_score
+
+        if self.global_options.b64_entropy_score:
+            warnings.warn(
+                "--b64-entropy-score is deprecated. Use --sensitivity instead."
+            )
+            self._b64_entropy_score = self.global_options.b64_entropy_score
+
     @property
     def completed(self) -> bool:
         """Return True if scan has completed
@@ -398,22 +433,19 @@ def scan(self) -> Generator[Issue, None, None]:
                 if self.global_options.entropy:
                     for issue in self.scan_entropy(
                         chunk,
-                        self.global_options.b64_entropy_score,
-                        self.global_options.hex_entropy_score,
                     ):
                         self._issues.append(issue)
                         yield issue
             self._completed = True
             self.logger.info("Found %d issues.", len(self._issues))
 
     def scan_entropy(
-        self, chunk: types.Chunk, b64_entropy_score: float, hex_entropy_score: float
+        self,
+        chunk: types.Chunk,
     ) -> Generator[Issue, None, None]:
         """Scan a chunk of data for apparent high entropy.
 
         :param chunk: The chunk of data to be scanned
-        :param b64_entropy_score: Base64 entropy score
-        :param hex_entropy_score: Hexadecimal entropy score
         """
 
         for line in chunk.contents.split("\n"):
@@ -423,12 +455,12 @@ def scan_entropy(
 
                 for string in b64_strings:
                     yield from self.evaluate_entropy_string(
-                        chunk, line, string, BASE64_CHARS, b64_entropy_score
+                        chunk, line, string, BASE64_CHARS, self._b64_entropy_score
                     )
 
                 for string in hex_strings:
                     yield from self.evaluate_entropy_string(
-                        chunk, line, string, HEX_CHARS, hex_entropy_score
+                        chunk, line, string, HEX_CHARS, self._hex_entropy_score
                     )
 
     def evaluate_entropy_string(

diff --git a/tartufo/types.py b/tartufo/types.py
@@ -26,6 +26,7 @@ class GlobalOptions:
         "output_format",
         "b64_entropy_score",
         "hex_entropy_score",
+        "sensitivity",
     )
     rules: Tuple[TextIO, ...]
     default_regexes: bool
@@ -46,6 +47,7 @@ class GlobalOptions:
     output_format: Optional[str]
     b64_entropy_score: float
     hex_entropy_score: float
+    sensitivity: int
 
 
 @dataclass

diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py
@@ -66,9 +66,9 @@ def test_scan_iterates_through_all_chunks(self, mock_entropy: mock.MagicMock):
         list(test_scanner.scan())
         mock_entropy.assert_has_calls(
             (
-                mock.call("foo", 4.5, 3),
-                mock.call("bar", 4.5, 3),
-                mock.call("baz", 4.5, 3),
+                mock.call("foo"),
+                mock.call("bar"),
+                mock.call("baz"),
             ),
             any_order=True,
         )
@@ -431,11 +431,7 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff(
         self, mock_strings: mock.MagicMock
     ):
         mock_strings.return_value = []
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        list(self.scanner.scan_entropy(self.chunk))
         mock_strings.assert_has_calls(
             (
                 mock.call("foo", scanner.BASE64_CHARS),
@@ -458,11 +454,7 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures(
     ):
         mock_strings.side_effect = (["foo"], [], [], [], [], [])
         mock_signature.return_value = True
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         mock_calculate.assert_not_called()
         self.assertEqual(issues, [])
 
@@ -477,11 +469,7 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures(
     ):
         mock_strings.side_effect = ([], ["foo"], [], [], [], [])
         mock_signature.return_value = True
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         mock_calculate.assert_not_called()
         self.assertEqual(issues, [])
 
@@ -497,11 +485,7 @@ def test_issues_are_created_for_high_entropy_b64_strings(
         mock_strings.side_effect = (["foo"], [], [], [], [], [])
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         self.assertEqual(len(issues), 1)
         self.assertEqual(issues[0].issue_type, types.IssueType.Entropy)
         self.assertEqual(issues[0].matched_string, "foo")
@@ -518,11 +502,7 @@ def test_issues_are_created_for_high_entropy_hex_strings(
         mock_strings.side_effect = ([], ["foo"], [], [], [], [])
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         self.assertEqual(len(issues), 1)
         self.assertEqual(issues[0].issue_type, types.IssueType.Entropy)
         self.assertEqual(issues[0].matched_string, "foo")
@@ -542,11 +522,7 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex
         mock_entropy.return_value = True
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         self.assertEqual(len(issues), 0)
 
     @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy")
@@ -564,11 +540,7 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc
         mock_entropy.return_value = True
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         self.assertEqual(len(issues), 0)
 
     @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy")
@@ -583,11 +555,7 @@ def test_issues_are_not_created_for_low_entropy_b64_strings(
         mock_strings.side_effect = (["foo"], [], [], [], [], [])
         mock_signature.return_value = False
         mock_calculate.return_value = 1.0
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         self.assertEqual(len(issues), 0)
 
     @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy")
@@ -602,13 +570,25 @@ def test_issues_are_not_created_for_low_entropy_hex_strings(
         mock_strings.side_effect = ([], ["foo"], [], [], [], [])
         mock_signature.return_value = False
         mock_calculate.return_value = 1.0
-        b64_entropy_score = 4.5
-        hex_entropy_score = 3
-        issues = list(
-            self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)
-        )
+        issues = list(self.scanner.scan_entropy(self.chunk))
         self.assertEqual(len(issues), 0)
 
+    def test_sensitivity_low_end_calculation(self):
+        self.options.sensitivity = 0
+        scanner = TestScanner(self.options)
+
+        # 0% sensitivity means entropy rate must equal bit rate
+        self.assertEqual(scanner._b64_entropy_score, 6.0)
+        self.assertEqual(scanner._hex_entropy_score, 4.0)
+
+    def test_sensitivity_high_end_calculation(self):
+        self.options.sensitivity = 100
+        scanner = TestScanner(self.options)
+
+        # 100% sensitivity means required entropy rate will be zero
+        self.assertEqual(scanner._b64_entropy_score, 0.0)
+        self.assertEqual(scanner._hex_entropy_score, 0.0)
+
 
 if __name__ == "__main__":
     unittest.main()