From a40efab857b7019c732ef3fcc2d689fc6260a30a Mon Sep 17 00:00:00 2001
From: Ned Batchelder <ned@nedbatchelder.com>
Date: Wed, 14 Jul 2021 06:59:30 -0400
Subject: [PATCH] fix: generate flat file names differently

Fixes a few unusual issues with reports:

- #580: HTML report generation fails on too long path
- #584: File collisions in coverage report html
- #1167: Remove leading underscore in coverage html
---
 CHANGES.rst                                   | 13 ++++++++--
 coverage/files.py                             | 17 +++++++------
 ...r => d_80084bf2fba02475___init__.py,cover} |  0
 ...py,cover => d_80084bf2fba02475_a.py,cover} |  0
 ...r => d_b039179a8a4ce2c2___init__.py,cover} |  0
 ...py,cover => d_b039179a8a4ce2c2_b.py,cover} |  0
 tests/goldtest.py                             | 16 +++++++++++-
 tests/test_files.py                           | 25 +++++++++++--------
 tests/test_html.py                            | 13 +++++++---
 tests/test_process.py                         |  5 ++--
 10 files changed, 63 insertions(+), 26 deletions(-)
 rename tests/gold/annotate/anno_dir/{a___init__.py,cover => d_80084bf2fba02475___init__.py,cover} (100%)
 rename tests/gold/annotate/anno_dir/{a_a.py,cover => d_80084bf2fba02475_a.py,cover} (100%)
 rename tests/gold/annotate/anno_dir/{b___init__.py,cover => d_b039179a8a4ce2c2___init__.py,cover} (100%)
 rename tests/gold/annotate/anno_dir/{b_b.py,cover => d_b039179a8a4ce2c2_b.py,cover} (100%)

diff --git a/CHANGES.rst b/CHANGES.rst
index 850c30ecf..d2c4ae06d 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -46,11 +46,20 @@ Unreleased
 
 - TOML parsing now uses the `tomli`_ library.
 
-- Use a modern hash algorithm when fingerprinting to speed HTML reports
-  (`issue 1189`_).
+- Some minor changes to usually invisible details of the HTML report:
+
+  - Use a modern hash algorithm when fingerprinting, for high-security
+    environments (`issue 1189`_).
+
+  - Change how report file names are generated, to avoid leading underscores
+    (`issue 1167`_), to avoid rare file name collisions (`issue 584`_), and to
+    avoid file names becoming too long (`issue 580`_).
 
 .. _Django coverage plugin: https://pypi.org/project/django-coverage-plugin/
+.. _issue 580: https://github.com/nedbat/coveragepy/issues/580
+.. _issue 584: https://github.com/nedbat/coveragepy/issues/584
 .. _issue 1150: https://github.com/nedbat/coveragepy/issues/1150
+.. _issue 1167: https://github.com/nedbat/coveragepy/issues/1167
 .. _issue 1168: https://github.com/nedbat/coveragepy/issues/1168
 .. _issue 1189: https://github.com/nedbat/coveragepy/issues/1189
 .. _tomli: https://pypi.org/project/tomli/
diff --git a/coverage/files.py b/coverage/files.py
index 8de2ec676..252e42ec5 100644
--- a/coverage/files.py
+++ b/coverage/files.py
@@ -77,7 +77,7 @@ def canonical_filename(filename):
     return CANONICAL_FILENAME_CACHE[filename]
 
 
-MAX_FLAT = 200
+MAX_FLAT = 100
 
 @contract(filename='unicode', returns='unicode')
 def flat_rootname(filename):
@@ -87,15 +87,16 @@ def flat_rootname(filename):
     the same directory, but need to differentiate same-named files from
     different directories.
 
-    For example, the file a/b/c.py will return 'a_b_c_py'
+    For example, the file a/b/c.py will return 'd_86bbcbe134d28fd2_c_py'
 
     """
-    name = ntpath.splitdrive(filename)[1]
-    name = re.sub(r"[\\/.:]", "_", name)
-    if len(name) > MAX_FLAT:
-        h = hashlib.sha1(name.encode('UTF-8')).hexdigest()
-        name = name[-(MAX_FLAT-len(h)-1):] + '_' + h
-    return name
+    dirname, basename = ntpath.split(filename)
+    if dirname:
+        fp = hashlib.new("sha3_256", dirname.encode("UTF-8")).hexdigest()[:16]
+        prefix = f"d_{fp}_"
+    else:
+        prefix = ""
+    return prefix + basename.replace(".", "_")
 
 
 if env.WINDOWS:
diff --git a/tests/gold/annotate/anno_dir/a___init__.py,cover b/tests/gold/annotate/anno_dir/d_80084bf2fba02475___init__.py,cover
similarity index 100%
rename from tests/gold/annotate/anno_dir/a___init__.py,cover
rename to tests/gold/annotate/anno_dir/d_80084bf2fba02475___init__.py,cover
diff --git a/tests/gold/annotate/anno_dir/a_a.py,cover b/tests/gold/annotate/anno_dir/d_80084bf2fba02475_a.py,cover
similarity index 100%
rename from tests/gold/annotate/anno_dir/a_a.py,cover
rename to tests/gold/annotate/anno_dir/d_80084bf2fba02475_a.py,cover
diff --git a/tests/gold/annotate/anno_dir/b___init__.py,cover b/tests/gold/annotate/anno_dir/d_b039179a8a4ce2c2___init__.py,cover
similarity index 100%
rename from tests/gold/annotate/anno_dir/b___init__.py,cover
rename to tests/gold/annotate/anno_dir/d_b039179a8a4ce2c2___init__.py,cover
diff --git a/tests/gold/annotate/anno_dir/b_b.py,cover b/tests/gold/annotate/anno_dir/d_b039179a8a4ce2c2_b.py,cover
similarity index 100%
rename from tests/gold/annotate/anno_dir/b_b.py,cover
rename to tests/gold/annotate/anno_dir/d_b039179a8a4ce2c2_b.py,cover
diff --git a/tests/goldtest.py b/tests/goldtest.py
index b9d59217c..bafb6f552 100644
--- a/tests/goldtest.py
+++ b/tests/goldtest.py
@@ -81,7 +81,7 @@ def compare(
 
         actual_file = os.path.join(actual_dir, f)
         with open(actual_file) as fobj:
-            actual = fobj.read()
+            actual = actual_original = fobj.read()
         if actual_file.endswith(".xml"):
             actual = canonicalize_xml(actual)
 
@@ -95,6 +95,20 @@ def compare(
             print(f":::: diff {expected_file!r} and {actual_file!r}")
             print("\n".join(difflib.Differ().compare(expected, actual)))
             print(f":::: end diff {expected_file!r} and {actual_file!r}")
+
+            save_path = expected_dir.replace("/gold/", "/actual/")
+            os.makedirs(save_path, exist_ok=True)
+            with open(os.path.join(save_path, f), "w") as savef:
+                savef.write(actual_original)
+
+    if not actual_extra:
+        for f in actual_only:
+            save_path = expected_dir.replace("/gold/", "/actual/")
+            os.makedirs(save_path, exist_ok=True)
+            with open(os.path.join(save_path, f), "w") as savef:
+                with open(os.path.join(actual_dir, f)) as readf:
+                    savef.write(readf.read())
+
     assert not text_diff, "Files differ: %s" % '\n'.join(text_diff)
 
     assert not expected_only, f"Files in {expected_dir} only: {expected_only}"
diff --git a/tests/test_files.py b/tests/test_files.py
index 98ece632f..39a51d8c2 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -69,18 +69,23 @@ def test_canonical_filename_ensure_cache_hit(self):
 
 
 @pytest.mark.parametrize("original, flat", [
-    ("a/b/c.py", "a_b_c_py"),
-    (r"c:\foo\bar.html", "_foo_bar_html"),
-    ("Montréal/☺/conf.py", "Montréal_☺_conf_py"),
+    ("abc.py", "abc_py"),
+    ("hellothere", "hellothere"),
+    ("a/b/c.py", "d_86bbcbe134d28fd2_c_py"),
+    ("a/b/defghi.py", "d_86bbcbe134d28fd2_defghi_py"),
+    ("/a/b/c.py", "d_bb25e0ada04227c6_c_py"),
+    ("/a/b/defghi.py", "d_bb25e0ada04227c6_defghi_py"),
+    (r"c:\foo\bar.html", "d_e7c107482373f299_bar_html"),
+    (r"d:\foo\bar.html", "d_584a05dcebc67b46_bar_html"),
+    ("Montréal/☺/conf.py", "d_c840497a2c647ce0_conf_py"),
     ( # original:
-        r"c:\lorem\ipsum\quia\dolor\sit\amet\consectetur\adipisci\velit\sed\quia\non"
-        r"\numquam\eius\modi\tempora\incidunt\ut\labore\et\dolore\magnam\aliquam"
-        r"\quaerat\voluptatem\ut\enim\ad\minima\veniam\quis\nostrum\exercitationem"
-        r"\ullam\corporis\suscipit\laboriosam\Montréal\☺\my_program.py",
+        r"c:\lorem\ipsum\quia\dolor\sit\amet\consectetur\adipisci\velit\sed" +
+        r"\quia\non\numquam\eius\modi\tempora\incidunt\ut\labore\et\dolore" +
+        r"\magnam\aliquam\quaerat\voluptatem\ut\enim\ad\minima\veniam\quis" +
+        r"\nostrum\exercitationem\ullam\corporis\suscipit\laboriosam" +
+        r"\Montréal\☺\my_program.py",
         # flat:
-        "re_et_dolore_magnam_aliquam_quaerat_voluptatem_ut_enim_ad_minima_veniam_quis_"
-        "nostrum_exercitationem_ullam_corporis_suscipit_laboriosam_Montréal_☺_my_program_py_"
-        "97eaca41b860faaa1a21349b1f3009bb061cf0a8"
+        "d_e597dfacb73a23d5_my_program_py"
      ),
 ])
 def test_flat_rootname(original, flat):
diff --git a/tests/test_html.py b/tests/test_html.py
index 56519a641..a0ab2d4a0 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -56,7 +56,7 @@ def run_coverage(self, covargs=None, htmlargs=None):
 
     def get_html_report_content(self, module):
         """Return the content of the HTML report for `module`."""
-        filename = module.replace(".", "_").replace("/", "_") + ".html"
+        filename = flat_rootname(module) + ".html"
         filename = os.path.join("htmlcov", filename)
         with open(filename) as f:
             return f.read()
@@ -617,7 +617,7 @@ def filepath_to_regex(path):
     return regex
 
 
-def compare_html(expected, actual):
+def compare_html(expected, actual, extra_scrubs=None):
     """Specialized compare function for our HTML files."""
     scrubs = [
         (r'/coverage.readthedocs.io/?[-.\w/]*', '/coverage.readthedocs.io/VER'),
@@ -640,6 +640,8 @@ def compare_html(expected, actual):
     if env.WINDOWS:
         # For file paths...
         scrubs += [(r"\\", "/")]
+    if extra_scrubs:
+        scrubs += extra_scrubs
     compare(expected, actual, file_pattern="*.html", scrubs=scrubs)
 
 
@@ -897,7 +899,12 @@ def test_other(self):
         for p in glob.glob("out/other/*_other_py.html"):
             os.rename(p, "out/other/blah_blah_other_py.html")
 
-        compare_html(gold_path("html/other"), "out/other")
+        compare_html(
+            gold_path("html/other"), "out/other",
+            extra_scrubs=[
+                (r'href="d_[0-9a-z]{16}_', 'href="_TEST_TMPDIR_othersrc_'),
+                ],
+            )
         contains(
             "out/other/index.html",
             '<a href="here_py.html">here.py</a>',
diff --git a/tests/test_process.py b/tests/test_process.py
index 04a8a2d53..58915b876 100644
--- a/tests/test_process.py
+++ b/tests/test_process.py
@@ -1332,10 +1332,11 @@ def test_accented_directory(self):
         # The HTML report uses ascii-encoded HTML entities.
         out = self.run_command("coverage html")
         assert out == ""
-        self.assert_exists("htmlcov/\xe2_accented_py.html")
+        self.assert_exists("htmlcov/d_5786906b6f0ffeb4_accented_py.html")
         with open("htmlcov/index.html") as indexf:
             index = indexf.read()
-        assert '<a href="&#226;_accented_py.html">&#226;%saccented.py</a>' % os.sep in index
+        expected = '<a href="d_5786906b6f0ffeb4_accented_py.html">&#226;%saccented.py</a>'
+        assert expected % os.sep in index
 
         # The XML report is always UTF8-encoded.
         out = self.run_command("coverage xml")