Skip to content

Commit

Permalink
Fix backfill deleting cache when split names not ready (#3119)
Browse files Browse the repository at this point in the history
* fix backfill deleting cache when split names not ready

* Update libs/libcommon/src/libcommon/state.py

Co-authored-by: Sylvain Lesage <sylvain.lesage@huggingface.co>

---------

Co-authored-by: Sylvain Lesage <sylvain.lesage@huggingface.co>
  • Loading branch information
lhoestq and severo authored Dec 13, 2024
1 parent 7540b32 commit 096abc2
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions libs/libcommon/src/libcommon/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,13 +232,14 @@ def __post_init__(self, pending_jobs_df: pd.DataFrame, cache_entries_df: pd.Data
name_field="split",
) # Note that we use the cached content even if the revision is different (i.e. maybe obsolete)

unexpected_split_names = set(cache_entries_df["split"].unique()).difference(
set(self.split_names).union({None})
)
if unexpected_split_names:
raise UnexceptedSplitNamesError(
f"Unexpected split names for dataset={self.dataset} config={self.config} ({len(unexpected_split_names)}): {list(islice(unexpected_split_names, 10))}{'' if len(unexpected_split_names) <= 10 else '...'}"
if self.split_names: # empty if the config-split-names cache is missing
unexpected_split_names = set(cache_entries_df["split"].unique()).difference(
set(self.split_names).union({None})
)
if unexpected_split_names:
raise UnexceptedSplitNamesError(
f"Unexpected split names for dataset={self.dataset} config={self.config} ({len(unexpected_split_names)}): {list(islice(unexpected_split_names, 10))}{'' if len(unexpected_split_names) <= 10 else '...'}"
)

with StepProfiler(
method="ConfigState.__post_init__",
Expand Down

0 comments on commit 096abc2

Please sign in to comment.