perf: hash data files during combining to avoid unneeded work. #1483
When generating many parallel data files, some of the data files will often be
exact copies of each other.  By checking their content hashes, we can avoid
combining the duplicates, speeding up the process.
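
The core idea can be sketched in a few lines of Python. This is a minimal illustration only: the helper name unique_data_files is hypothetical and not part of the coverage.py API, which implements the check inline in combine_parallel_data as the diff below shows.

    import hashlib

    def unique_data_files(paths):
        """Yield each distinct data file once, skipping exact byte-for-byte copies."""
        seen_hashes = set()
        for path in paths:
            with open(path, "rb") as fobj:
                digest = hashlib.sha3_256(fobj.read()).digest()
            if digest in seen_hashes:
                continue  # an exact copy of a file already seen; nothing new to combine
            seen_hashes.add(digest)
            yield path
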
nedbat committed Nov 7, 2022
1 parent 9c26c95 commit 3381918
Showing 4 changed files with 28 additions and 11 deletions.
coverage/data.py: 30 changes (22 additions, 8 deletions)
@@ -11,6 +11,7 @@
 """

 import glob
+import hashlib
 import os.path

 from coverage.exceptions import CoverageException, NoDataError
@@ -110,6 +111,7 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")

+    file_hashes = set()
     files_combined = 0
     for f in files_to_combine:
         if f == data.data_filename():
@@ -118,6 +120,25 @@
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
+
+        try:
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+        if sha in file_hashes:
+            if message:
+                message(f"Skipping duplicate data {rel_file_name}")
+            continue
+        file_hashes.add(sha)
+
         if data._debug.should('dataio'):
             data._debug.write(f"Combining data file {f!r}")
         try:
@@ -132,14 +153,7 @@
             data.update(new_data, aliases=aliases)
             files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
+                message(f"Combined data file {rel_file_name}")
             if not keep:
                 if data._debug.should('dataio'):
                     data._debug.write(f"Deleting combined data file {f!r}")
coverage/sqldata.py: 2 changes (1 addition, 1 deletion)
@@ -305,7 +305,7 @@ def _init_db(self, db):
             [
                 ("sys_argv", str(getattr(sys, "argv", None))),
                 ("version", __version__),
-                ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
+                #("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
             ]
         )

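Commenting out the "when" row matters for the hashing above: a timestamp recorded at write time would, presumably, make otherwise-identical data files differ byte-for-byte, so no duplicates would ever be detected. A small illustration of that effect, using made-up byte strings rather than real data files:

    import hashlib

    # Hypothetical contents of two data files that record identical coverage
    # but embed different "when" timestamps.
    file_a = b"coverage-data ... when=2022-11-07 08:00:01"
    file_b = b"coverage-data ... when=2022-11-07 08:00:02"

    # The differing timestamps alone are enough to defeat whole-file hashing.
    assert hashlib.sha3_256(file_a).digest() != hashlib.sha3_256(file_b).digest()
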
tests/test_api.py: 2 changes (1 addition, 1 deletion)
@@ -1362,7 +1362,7 @@ def test_combine_no_usable_files(self):

         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")

         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
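Making the two bogus files differ keeps them out of the new duplicate check: identical contents would hash the same, and the second file would presumably be skipped before its unreadability is reported. A quick check of that premise (illustrative only, not part of the test suite):

    import hashlib

    bad1 = hashlib.sha3_256(b"This isn't a coverage data file.").digest()
    bad2 = hashlib.sha3_256(b"This isn't a coverage data file either.").digest()
    assert bad1 != bad2  # distinct contents, so both files reach the combine step
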
tests/test_concurrency.py: 5 changes (4 additions, 1 deletion)
@@ -484,7 +484,10 @@ def try_multiprocessing_code(
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
         out = self.run_command("coverage report -m")
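The updated pattern accepts either message that combining can now emit. Both forms match, as a quick check shows (the file names below are invented examples of the .coverage.<machine>.<pid>.<random> suffix style, not real output):

    import re

    pattern = r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+"
    assert re.fullmatch(pattern, "Combined data file .coverage.myhost.12345.987654")
    assert re.fullmatch(pattern, "Skipping duplicate data .coverage.myhost.12346.987655")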
