Skip to content

Commit

Permalink
Style fixes
Browse files Browse the repository at this point in the history
Signed-off-by: Vibhu Jawa <vjawa@nvidia.com>
  • Loading branch information
VibhuJawa committed Oct 17, 2024
1 parent cc68c4d commit df62a1f
Showing 1 changed file with 4 additions and 5 deletions.
9 changes: 4 additions & 5 deletions nemo_curator/modules/fuzzy_dedup.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1436,9 +1436,7 @@ def cc_workflow(self, output_path):
cc_path = self._run_connected_components(
deduped_encoded_jaccard_path, deduped_parsed_id_path, output_path
)
- self._logger.info(
- f"End to End time in cc_workflow = {time.time() - st}s"
- )
+ self._logger.info(f"End to End time in cc_workflow = {time.time() - st}s")
return cc_path

def _run_connected_components(
Expand All @@ -1458,7 +1456,9 @@ def _run_connected_components(
)
df = df[df["jaccard"] == 1].reset_index(drop=True)

- labels_df = dask_cudf.read_parquet(deduped_parsed_id_path, blocksize="1GB", aggregate_files=True)
+ labels_df = dask_cudf.read_parquet(
+ deduped_parsed_id_path, blocksize="1GB", aggregate_files=True
+ )
num_nodes = len(labels_df)
self_edge_df = labels_df[["uid"]].rename(columns={"uid": self.left_id})
self_edge_df[self.right_id] = self_edge_df[self.left_id]
Expand Down Expand Up @@ -1649,7 +1649,6 @@ def _merge_and_write(
ddf = ddf[[self.left_id, self.right_id, "jaccard"]]
ddf.to_parquet(output_path, write_index=False)


@staticmethod
def _get_unique_ids_per_partition(df, id_columns):
unique_df_ls = []
Expand Down

0 comments on commit df62a1f

Please sign in to comment.