Commit

Merge pull request #7764 from DIRACGridBot/cherry-pick-2-ba021e86f-integration

[sweep:integration] feat (consistency): add checksum comparison
chrisburr authored Aug 27, 2024
2 parents f18be6f + 9f14d35 commit b802887
Showing 1 changed file with 48 additions and 2 deletions.
50 changes: 48 additions & 2 deletions consistency_check/consistency.py
@@ -17,14 +17,18 @@ def load_se_definition(se_def_path):
 
 
 def load_dfc_dump(dfc_dump_path, version):
-    fc_dump = pd.read_csv(dfc_dump_path, names=["seName", "lfn", "cks", "size"], delimiter="|")
+    fc_dump = pd.read_csv(dfc_dump_path, names=["seName", "lfn", "fc_cks", "size"], delimiter="|")
+    fc_dump["fc_cks"] = fc_dump["fc_cks"].str.lower().str.pad(8, fillchar="0")
     fc_dump["version"] = version
     return fc_dump
 
 
 def load_se_dump(se_dump_path):
-    se_dump = pd.read_csv(se_dump_path, names=["pfn"], delimiter=";", index_col="pfn")
+    se_dump = pd.read_csv(se_dump_path, names=["pfn", "se_cks"], delimiter="|", index_col="pfn")
+    se_dump["se_cks"] = se_dump["se_cks"].str.lower().str.pad(8, fillchar="0")
+    se_dump["version"] = "se_dump"
+    assert not se_dump.index.duplicated().any(), f"Duplicated entries in SE dump {se_dump[se_dump.index.duplicated()]}"
 
     return se_dump
 
 
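Both loaders now lower-case the checksum and left-pad it to 8 characters with zeros, since the DFC and the SE do not always pad checksums the same way. A minimal sketch of the normalisation and of the new pipe-delimited SE dump format, with made-up values (not part of the commit):

import pandas as pd
from io import StringIO

# Made-up SE dump in the new "pfn|checksum" pipe-delimited format
fake_se_dump = StringIO("/store/file_a|1A2B3C\n/store/file_b|deadbeef\n")
se_dump = pd.read_csv(fake_se_dump, names=["pfn", "se_cks"], delimiter="|", index_col="pfn")
# Same normalisation as load_se_dump: lower-case, then left-pad to 8 chars with zeros
se_dump["se_cks"] = se_dump["se_cks"].str.lower().str.pad(8, fillchar="0")
print(se_dump["se_cks"].tolist())  # ['001a2b3c', 'deadbeef']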
@@ -98,6 +102,48 @@ def possibly_dark_data(
         typer.secho("No dark data found", fg=GREEN)
 
 
+@app.command()
+def compare_checksum(
+    fc_dump_file: Annotated[Path, typer.Option(help="DFC dump")],
+    se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
+    se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
+    checksum_output: Annotated[
+        Path, typer.Option(help="Output file in which to dump checksum differences")
+    ] = "cks_diff.csv",
+):
+    """
+    Compare the checksums of a DFC dump and an SE dump.
+    Careful: the checksums are sometimes not padded the same way.
+    """
+    se_dump = load_se_dump(se_dump_file)
+    se_def = load_se_definition(se_def_file)
+
+    # Compute the PFN for each LFN in the DFC dump
+
+    fc_dump = load_dfc_dump(fc_dump_file, "fc")
+    fc_dump = pd.merge(fc_dump, se_def, on="seName")
+    fc_dump["pfn"] = fc_dump["basePath"] + fc_dump["lfn"]
+    fc_dump.set_index("pfn", inplace=True)
+
+    typer.echo("Computing checksum mismatches")
+    # Find data present in both the SE and the FC
+    in_both = se_dump.index.intersection(fc_dump.index)
+    # Make a single DataFrame with both sets of info, keeping only PFNs present in both
+    joined = pd.concat([fc_dump, se_dump], axis=1)
+    joined = joined[joined.index.isin(in_both)]
+
+    # Filter on non-matching checksums
+    non_matching = joined.loc[joined["fc_cks"] != joined["se_cks"]][["seName", "lfn", "fc_cks", "se_cks"]]
+
+    if len(non_matching):
+        typer.secho(
+            f"Found {len(non_matching)} non-matching checksums, dumping them in {checksum_output}", err=True, fg=RED
+        )
+        non_matching.to_csv(checksum_output, index=False)
+    else:
+        typer.secho("No checksum mismatch found", fg=GREEN)
+
+
 @app.command()
 def threeway(
     old_fc_dump_file: Annotated[Path, typer.Option(help="DFC dump BEFORE the SE dump")],
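A hedged sketch of how the new command might be invoked: Typer derives the command name from the function name and the option names from the parameters (underscores become hyphens), so, assuming the script is run directly (the entry point and input file names below are placeholders):

python consistency.py compare-checksum \
    --fc-dump-file dfc_dump.txt \
    --se-def-file se_definition.csv \
    --se-dump-file se_dump.txt \
    --checksum-output cks_diff.csv

Any mismatching entries are written to the file given by --checksum-output (cks_diff.csv by default) with the columns seName, lfn, fc_cks and se_cks; otherwise the command reports that no mismatch was found.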
