godaddy · rbailey-godaddy · Jan 11, 2022 · Jan 12, 2022
@@ -1,3 +1,13 @@
+v3.0.x - TBD
+------------
+
+Bug fixes:
+
+* [315](https://github.com/godaddy/tartufo/issues/315) - Rework entropy scanning
+  to reduce number of new issues generated for text that passed 2.x scans without
+  issues. Some corner cases may remain that will require exclusions to silence.
+  See [319](https://github.com/godaddy/tartufo/pull/319) for details.
+
 v3.0.0 - 5 January 2022
 -----------------------
 

@@ -36,8 +36,9 @@
     Scope,
 )
 
-BASE64_REGEX = re.compile(r"[A-Z0-9=+/_-]+", re.IGNORECASE)
-HEX_REGEX = re.compile(r"[0-9A-F]+", re.IGNORECASE)
+BASE64_REGEX = re.compile(r"[A-Z0-9=+/]{20,}", re.IGNORECASE)
+BASE64URL_REGEX = re.compile(r"[A-Z0-9=_-]{20,}", re.IGNORECASE)
+HEX_REGEX = re.compile(r"[0-9A-F]{20,}", re.IGNORECASE)
 
 
 class Issue:
@@ -561,38 +562,53 @@ def scan_entropy(
         """
 
         for line in chunk.contents.split("\n"):
-            for word in line.split():
-                for string in util.find_strings_by_regex(word, BASE64_REGEX):
-                    yield from self.evaluate_entropy_string(
+            # Using a set eliminates exact duplicates, which are common
+            base64_strings = set(util.find_strings_by_regex(line, BASE64_REGEX))
+            base64_strings.update(util.find_strings_by_regex(line, BASE64URL_REGEX))
+            # Don't report a string which is a substring of a string we already reported
+            reported_strings: List[str] = []
+            for string in sorted(base64_strings, key=len, reverse=True):
+                if all((string not in already for already in reported_strings)):
+                    report = self.evaluate_entropy_string(
                         chunk, line, string, self.b64_entropy_limit
                     )
-                for string in util.find_strings_by_regex(word, HEX_REGEX):
-                    yield from self.evaluate_entropy_string(
+                    if report is not None:
+                        reported_strings.append(string)
+                        yield report
+            # Hex alphabet is subset of base64, so check duplicates here too;
+            # but it's easier because hex string will never be substring of another
+            # hex string.
+            for string in util.find_strings_by_regex(line, HEX_REGEX):
+                if all((string not in already for already in reported_strings)):
+                    report = self.evaluate_entropy_string(
                         chunk, line, string, self.hex_entropy_limit
                     )
+                    if report is not None:
+                        yield report
 
     def evaluate_entropy_string(
         self,
         chunk: types.Chunk,
         line: str,
         string: str,
         min_entropy_score: float,
-    ) -> Generator[Issue, None, None]:
+    ) -> Optional[Issue]:
         """Check entropy string using entropy characters and score.
 
         :param chunk: The chunk of data to check
         :param line: Source line containing string of interest
         :param string: String to check
         :param min_entropy_score: Minimum entropy score to flag
-        :return: Generator of issues flagged
+        :return: Issue, if one is detected; otherwise None
         """
         if not self.signature_is_excluded(string, chunk.file_path):
             entropy_score = self.calculate_entropy(string)
             if entropy_score > min_entropy_score:
                 if self.entropy_string_is_excluded(string, line, chunk.file_path):
                     self.logger.debug("line containing entropy was excluded: %s", line)
-                else:
-                    yield Issue(types.IssueType.Entropy, string, chunk)
+                    return None
+                return Issue(types.IssueType.Entropy, string, chunk)
+        return None
 
     def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]:
         """Scan a chunk of data for matches against the configured regexes.

@@ -209,6 +209,12 @@ def find_strings_by_regex(
     :param text: The text string to be analyzed
     :param regex: A pattern which matches all character sequences of interest
     :param threshold: The minimum acceptable length of a matching string
+
+    WARNING: If you are passing one of the scanner.py regex variables, note
+    they are compiled with a minimum length of 20, and using a lower threshold
+    here will not have the intended effect. Nothing in this codebase passes a
+    threshold parameter, but external users might need to adjust or use their
+    own expressions.
     """
 
     for match in regex.finditer(text):

@@ -444,7 +444,20 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures(
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = (["foo"], [], [], [], [], [])
+        mock_strings.side_effect = (
+            ["foo"],
+            ["foo"],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+        )
         mock_signature.return_value = True
         issues = list(self.scanner.scan_entropy(self.chunk))
         mock_calculate.assert_not_called()
@@ -459,7 +472,7 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures(
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = ([], ["foo"], [], [], [], [])
+        mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
         mock_signature.return_value = True
         issues = list(self.scanner.scan_entropy(self.chunk))
         mock_calculate.assert_not_called()
@@ -474,7 +487,20 @@ def test_issues_are_created_for_high_entropy_b64_strings(
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = (["foo"], [], [], [], [], [])
+        mock_strings.side_effect = (
+            ["foo"],
+            ["foo"],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+        )
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
         issues = list(self.scanner.scan_entropy(self.chunk))
@@ -491,7 +517,7 @@ def test_issues_are_created_for_high_entropy_hex_strings(
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = ([], ["foo"], [], [], [], [])
+        mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
         issues = list(self.scanner.scan_entropy(self.chunk))
@@ -510,7 +536,7 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = ([], ["foo"], [], [], [], [])
+        mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
         mock_entropy.return_value = True
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
@@ -528,7 +554,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = (["foo"], [], [], [], [], [])
+        mock_strings.side_effect = (
+            ["foo"],
+            ["foo"],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+        )
         mock_entropy.return_value = True
         mock_signature.return_value = False
         mock_calculate.return_value = 9.0
@@ -544,7 +583,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings(
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = (["foo"], [], [], [], [], [])
+        mock_strings.side_effect = (
+            ["foo"],
+            ["foo"],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+        )
         mock_signature.return_value = False
         mock_calculate.return_value = 1.0
         issues = list(self.scanner.scan_entropy(self.chunk))
@@ -559,7 +611,7 @@ def test_issues_are_not_created_for_low_entropy_hex_strings(
         mock_signature: mock.MagicMock,
         mock_calculate: mock.MagicMock,
     ):
-        mock_strings.side_effect = ([], ["foo"], [], [], [], [])
+        mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
         mock_signature.return_value = False
         mock_calculate.return_value = 1.0
         issues = list(self.scanner.scan_entropy(self.chunk))
@@ -604,12 +656,15 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff(
         list(self.scanner.scan_entropy(self.chunk))
         mock_strings.assert_has_calls(
             (
-                mock.call("foo", scanner.BASE64_REGEX),
-                mock.call("foo", scanner.HEX_REGEX),
-                mock.call("bar", scanner.BASE64_REGEX),
-                mock.call("bar", scanner.HEX_REGEX),
-                mock.call("asdfqwer", scanner.BASE64_REGEX),
-                mock.call("asdfqwer", scanner.HEX_REGEX),
+                mock.call("        foo bar", scanner.BASE64_REGEX),
+                mock.call("        foo bar", scanner.BASE64URL_REGEX),
+                mock.call("        foo bar", scanner.HEX_REGEX),
+                mock.call("        asdfqwer", scanner.BASE64_REGEX),
+                mock.call("        asdfqwer", scanner.BASE64URL_REGEX),
+                mock.call("        asdfqwer", scanner.HEX_REGEX),
+                mock.call("        ", scanner.BASE64_REGEX),
+                mock.call("        ", scanner.BASE64URL_REGEX),
+                mock.call("        ", scanner.HEX_REGEX),
             )
         )
 

@@ -427,14 +427,14 @@ def test_find_strings_by_regex_recognizes_base64url(self):
         """
 
         strings = list(
-            util.find_strings_by_regex(sample_input, scanner.BASE64_REGEX, 20)
+            util.find_strings_by_regex(sample_input, scanner.BASE64URL_REGEX, 20)
         )
         self.assertEqual(strings, ["111111111-ffffCCCC=="])
 
     def test_find_strings_by_regex_recognizes_mutant_base64(self):
 
         sample_input = """
-        +111111111-ffffCCCC= Can't mix + and - but both are in regex
+        +111111111-ffffCCCC= Can't mix + and - and substrings are too short
         111111111111111111111== Not a valid length but we don't care
         ==111111111111111111 = Is supposed to be end only but we don't care
         """
@@ -444,5 +444,5 @@ def test_find_strings_by_regex_recognizes_mutant_base64(self):
         )
         self.assertEqual(
             strings,
-            ["+111111111-ffffCCCC=", "111111111111111111111==", "==111111111111111111"],
+            ["111111111111111111111==", "==111111111111111111"],
         )