diff --git a/kfp/transform_workflows/universal/ededup/src/ededup_compute_execution_params.py b/kfp/transform_workflows/universal/ededup/src/ededup_compute_execution_params.py
index 99926e2c5..529a6ace3 100644
--- a/kfp/transform_workflows/universal/ededup/src/ededup_compute_execution_params.py
+++ b/kfp/transform_workflows/universal/ededup/src/ededup_compute_execution_params.py
@@ -63,6 +63,9 @@ def ededup_compute_execution_params(
     sampling = data_access.sample_input_data(n_samples=n_samples)
     avg_doc_size = sampling.get("average doc size KB")
     number_of_docs = sampling.get("estimated number of docs")
+    if number_of_docs == 0:
+        print("Estimated number of documents and document size are zero. Please verify the input path.")
+        sys.exit(1)
     avg_table_size = sampling.get("average table size MB") / KB
     # compute number of hashes
     n_hashes = math.ceil(number_of_docs * 32 / GB)
diff --git a/kfp/transform_workflows/universal/fdedup/src/fdedup_compute_execution_params.py b/kfp/transform_workflows/universal/fdedup/src/fdedup_compute_execution_params.py
index 6aed9124e..a9f8b8d66 100644
--- a/kfp/transform_workflows/universal/fdedup/src/fdedup_compute_execution_params.py
+++ b/kfp/transform_workflows/universal/fdedup/src/fdedup_compute_execution_params.py
@@ -135,6 +135,9 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float:
     avg_doc_size = sampling.get("average doc size KB")
     number_of_docs = sampling.get("estimated number of docs")
     avg_table_size = sampling.get("average table size MB") / KB
+    if number_of_docs == 0:
+        print("Estimated number of documents and document size are zero. Please verify the input path.")
+        sys.exit(1)
     # we are creating more buckets actors, so that we get better parallelization for bucket processing
     b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB)
     d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB)
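
Both hunks add the same guard, so for reference here is a minimal standalone sketch of the check in isolation. It assumes `sys` is already imported at the top of both modules and that `data_access.sample_input_data()` returns a dict with the keys shown in the context lines; the `validate_sampling` helper name and the literal dicts in the usage lines are illustrative only and are not part of this patch.

import sys


def validate_sampling(sampling: dict) -> None:
    # Abort early when the sample of the input path reports no documents,
    # since the downstream sizing math (hashes, bucket/doc actors) scales
    # with the estimated document count and would produce meaningless values.
    number_of_docs = sampling.get("estimated number of docs", 0)
    if number_of_docs == 0:
        print("Estimated number of documents and document size are zero. Please verify the input path.")
        sys.exit(1)


# Illustrative usage: a non-empty sample passes through silently,
# an empty one prints the message and terminates with exit code 1.
validate_sampling({"estimated number of docs": 1500, "average doc size KB": 64})
validate_sampling({"estimated number of docs": 0, "average doc size KB": 0})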