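Verify it works on exp-01

In _run_connected_components, initialize Comms with p2p=False and always
select [self.id_column] as the id columns (the convert_str_ids branch that
chose ["dataset_id", "doc_id"] is removed). In _write_dedup_parsed_id,
lower the read_parquet blocksize from "1GB" to "512MB".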

Signed-off-by: Vibhu Jawa <vibhujawa@gmail.com>
NVIDIA · Oct 17, 2024 · 8396237 · 8396237
1 parent 5c76836
commit 8396237
Showing 1 changed file with 3 additions and 5 deletions.
diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py
@@ -1448,7 +1448,7 @@ def _run_connected_components(
             self.profile_dir, "connected-components-run"
         ):
 
-            Comms.initialize(p2p=True)
+            Comms.initialize(p2p=False)
             df = dask_cudf.read_parquet(
                 deduped_encoded_jaccard_path, blocksize="1GB", aggregate_files=True
             )
@@ -1476,9 +1476,7 @@ def _run_connected_components(
             labels_df = labels_df.merge(
                 result, left_on=["uid"], right_on=["vertex"], how="inner"
             )
-            id_columns = (
-                ["dataset_id", "doc_id"] if self.convert_str_ids else [self.id_column]
-            )
+            id_columns = [self.id_column]
             labels_df = labels_df[id_columns + ["labels"]]
             labels_df = labels_df.rename(columns={"labels": "group"})
             labels_df = labels_df.persist()
@@ -1578,7 +1576,7 @@ def _write_dedup_parsed_id(self):
             ddf = dask_cudf.read_parquet(
                 self.jaccard_pairs_path,
                 columns=[self.left_id, self.right_id],
-                blocksize="1GB",
+                blocksize="512MB",
                 aggregate_files=True,
             )
             id_columns = [self.id_column]