diff --git a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py index 20ef49dc3..b115a300a 100644 --- a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py +++ b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py @@ -238,7 +238,7 @@ def _convert_pdf2parquet( num_pages = len(doc.pages) num_tables = len(doc.tables) num_doc_elements = len(doc.texts) - document_hash = np.uint64(doc.origin.binary_hash) + document_hash = str(doc.origin.binary_hash) # we turn the uint64 hash into str, because it is easier to handle for pyarrow self._update_metrics(num_pages=num_pages, elapse_time=elapse_time) diff --git a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet index f68ff66e1..27b97529d 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json index 330ee3a5c..f5961f843 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-11-11 21:04:30", - "end_time": "2024-11-11 21:04:38", + "start_time": "2024-11-13 08:35:51", + "end_time": "2024-11-13 08:36:23", "status": "success" }, "code": { @@ -36,18 +36,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 21.1, + "cpus": 147.5, "gpus": 0, - "memory": 32.09, + "memory": 33.72, "object_store": 0, - "execution time, min": 0.139 + "execution time, min": 0.522 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 32939, - "processing_time": 5.596, + "result_size": 33078, + "processing_time": 4.221, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet index 17a7cf950..3e08723a0 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json index 32023e56a..8756a013e 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-11-11 21:06:08", - "end_time": "2024-11-11 21:06:14", + "start_time": "2024-11-13 08:37:05", + "end_time": "2024-11-13 08:37:11", "status": "success" }, "code": { @@ -36,9 +36,9 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 21.5, + "cpus": 143.9, "gpus": 0, - "memory": 32.19, + "memory": 34.21, "object_store": 0, "execution time, min": 0.1 }, @@ -46,12 +46,12 @@ "source_files": 2, "source_size": 605137, "result_files": 1, - "processing_time": 3.353, + "processing_time": 3.364, "nrows": 3, "nsuccess": 3, "nfail": 0, "nskip": 0, - "result_size": 27147 + "result_size": 27226 }, "source": { "name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input", diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet index c29b5db0e..9e3302c8c 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet index 42b0a245d..584cbea22 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json index ed05c6b34..35a8bd874 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-11-11 21:05:31", - "end_time": "2024-11-11 21:05:36", + "start_time": "2024-11-13 08:37:56", + "end_time": "2024-11-13 08:38:02", "status": "success" }, "code": { @@ -36,18 +36,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 21.4, + "cpus": 142.2, "gpus": 0, - "memory": 32.33, + "memory": 33.63, "object_store": 0, - "execution time, min": 0.096 + "execution time, min": 0.1 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 22850, - "processing_time": 3.229, + "result_size": 22993, + "processing_time": 3.422, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet index 0f4bda73e..915c07189 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet index 32bfa6d00..f1bbf6c77 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json index e8a3894bf..ad1709b3d 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-11-11 21:05:04", - "end_time": "2024-11-11 21:05:06", + "start_time": "2024-11-13 08:37:31", + "end_time": "2024-11-13 08:37:34", "status": "success" }, "code": { @@ -36,18 +36,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 21.6, + "cpus": 143.4, "gpus": 0, - "memory": 29.57, + "memory": 31.51, "object_store": 0, - "execution time, min": 0.041 + "execution time, min": 0.042 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 29555, - "processing_time": 1.997, + "result_size": 29694, + "processing_time": 2.077, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet index db8b58790..004f70d2d 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet index f68ff66e1..27b97529d 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json index 330ee3a5c..f5961f843 100644 --- a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-11-11 21:04:30", - "end_time": "2024-11-11 21:04:38", + "start_time": "2024-11-13 08:35:51", + "end_time": "2024-11-13 08:36:23", "status": "success" }, "code": { @@ -36,18 +36,18 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 21.1, + "cpus": 147.5, "gpus": 0, - "memory": 32.09, + "memory": 33.72, "object_store": 0, - "execution time, min": 0.139 + "execution time, min": 0.522 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 32939, - "processing_time": 5.596, + "result_size": 33078, + "processing_time": 4.221, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet index 17a7cf950..3e08723a0 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet differ