diff --git a/transforms/language/pdf2parquet/pdf2parquet.ipynb b/transforms/language/pdf2parquet/pdf2parquet.ipynb index 1200e7a7f..e5548eb4c 100644 --- a/transforms/language/pdf2parquet/pdf2parquet.ipynb +++ b/transforms/language/pdf2parquet/pdf2parquet.ipynb @@ -9,8 +9,8 @@ "\n", "##### **** example: \n", "```\n", - "python -m venv && source venv/bin/activate\n", - "pip install -r requirements.txt\n", + "make venv \n", + "source venv/bin/activate \n", "pip install jupyterlab\n", "```" ] @@ -122,22 +122,22 @@ "name": "stderr", "output_type": "stream", "text": [ - "13:23:55 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 0}\n", - "13:23:55 INFO - pipeline id pipeline_id\n", - "13:23:55 INFO - code location None\n", - "13:23:55 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", - "13:23:55 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:23:55 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf', '.docx', '.pptx', '.zip'], files to checkpoint ['.parquet']\n", - "13:23:55 INFO - orchestrator pdf2parquet started at 2024-11-20 13:23:55\n", - "13:23:55 INFO - Number of files is 2, source profile {'max_file_size': 0.3013172149658203, 'min_file_size': 0.2757863998413086, 'total_file_size': 0.5771036148071289}\n", - "13:23:55 INFO - Initializing models\n", - "13:23:58 INFO - Processing archive_doc_filename='2305.03393v1-pg9.pdf' \n", - "13:23:59 INFO - Processing archive_doc_filename='2408.09869v1-pg1.pdf' \n", - "13:24:00 INFO - Completed 1 files (50.0%) in 0.029 min\n", - "13:24:03 INFO - Completed 2 files (100.0%) in 0.08 min\n", - "13:24:03 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:24:03 INFO - done flushing in 0.0 sec\n", - "13:24:03 INFO - Completed execution in 0.132 min, execution result 0\n" + "15:13:18 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 0}\n", + "15:13:18 INFO - pipeline id pipeline_id\n", + "15:13:18 INFO - code location None\n", + "15:13:18 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "15:13:18 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:13:18 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf', '.docx', '.pptx', '.zip'], files to checkpoint ['.parquet']\n", + "15:13:18 INFO - orchestrator pdf2parquet started at 2024-11-20 15:13:18\n", + "15:13:18 INFO - Number of files is 2, source profile {'max_file_size': 0.3013172149658203, 'min_file_size': 0.2757863998413086, 'total_file_size': 0.5771036148071289}\n", + "15:13:18 INFO - Initializing models\n", + "15:14:08 INFO - Processing archive_doc_filename='2305.03393v1-pg9.pdf' \n", + "15:14:09 INFO - Processing archive_doc_filename='2408.09869v1-pg1.pdf' \n", + "15:14:10 INFO - Completed 1 files (50.0%) in 0.04 min\n", + "15:14:18 INFO - Completed 2 files (100.0%) in 0.179 min\n", + "15:14:18 INFO - Done processing 2 files, waiting for flush() completion.\n", + "15:14:18 INFO - done flushing in 0.0 sec\n", + "15:14:18 INFO - Completed execution in 1.007 min, execution result 0\n" ] } ], @@ -205,7 +205,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.10.8" } }, "nbformat": 4,