Skip to content

Commit

Permalink
Verify it works on exp-01
Browse files Browse the repository at this point in the history
Signed-off-by: Vibhu Jawa <vibhujawa@gmail.com>
  • Loading branch information
VibhuJawa committed Oct 17, 2024
1 parent 5c76836 commit 8396237
Showing 1 changed file with 3 additions and 5 deletions.
8 changes: 3 additions & 5 deletions nemo_curator/modules/fuzzy_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1448,7 +1448,7 @@ def _run_connected_components(
self.profile_dir, "connected-components-run"
):

- Comms.initialize(p2p=True)
+ Comms.initialize(p2p=False)
df = dask_cudf.read_parquet(
deduped_encoded_jaccard_path, blocksize="1GB", aggregate_files=True
)
Expand Down Expand Up @@ -1476,9 +1476,7 @@ def _run_connected_components(
labels_df = labels_df.merge(
result, left_on=["uid"], right_on=["vertex"], how="inner"
)
- id_columns = (
- ["dataset_id", "doc_id"] if self.convert_str_ids else [self.id_column]
- )
+ id_columns = [self.id_column]
labels_df = labels_df[id_columns + ["labels"]]
labels_df = labels_df.rename(columns={"labels": "group"})
labels_df = labels_df.persist()
Expand Down Expand Up @@ -1578,7 +1576,7 @@ def _write_dedup_parsed_id(self):
ddf = dask_cudf.read_parquet(
self.jaccard_pairs_path,
columns=[self.left_id, self.right_id],
- blocksize="1GB",
+ blocksize="512MB",
aggregate_files=True,
)
id_columns = [self.id_column]
Expand Down

0 comments on commit 8396237

Please sign in to comment.