Merge pull request #756 from IBM/docling-v2
Update pdf2parquet to Docling v2
touma-I authored Oct 31, 2024
2 parents 0c4a0be + 269d732 commit 5877e4d
Showing 39 changed files with 441 additions and 141 deletions.
1 change: 0 additions & 1 deletion transforms/language/doc_chunk/python/README.md
@@ -32,7 +32,6 @@ The transform can be tuned with the following parameters.
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
-| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
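
For reference, a minimal sketch of passing these options to the transform runtime after this change (the `doc_chunk_` key prefix follows the CLI convention used elsewhere in this PR; the values shown are illustrative, not new defaults):

```python
# Illustrative doc_chunk configuration: dl_min_chunk_len is gone, so only the
# remaining options from the table above are set.
params = {
    "doc_chunk_chunking_type": "dl_json",
    "doc_chunk_content_column_name": "contents",
    "doc_chunk_doc_id_column_name": "document_id",
    "doc_chunk_output_chunk_column_name": "contents",
    "doc_chunk_chunk_size_tokens": 128,
    "doc_chunk_chunk_overlap_tokens": 30,
}
```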
2 changes: 1 addition & 1 deletion transforms/language/doc_chunk/python/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "dpk_doc_chunk_transform_python"
version = "0.2.2.dev1"
version = "0.3.0"
requires-python = ">=3.10,<3.13"
description = "chunk documents Python Transform"
license = {text = "Apache-2.0"}
2 changes: 1 addition & 1 deletion transforms/language/doc_chunk/python/requirements.txt
@@ -1,3 +1,3 @@
data-prep-toolkit==0.2.2.dev1
-docling-core==1.7.2
+docling-core==2.3.0
llama-index-core>=0.11.0,<0.12.0
21 changes: 10 additions & 11 deletions transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -13,11 +13,11 @@
from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional, Dict, List

-from docling_core.types import Document as DLDocument
+from docling_core.types.doc import DoclingDocument
from llama_index.core.node_parser.text.token import TokenTextSplitter
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
-from docling_core.transforms.chunker import HierarchicalChunker
+from docling_core.transforms.chunker import HierarchicalChunker, DocMeta


class ChunkingExecutor(metaclass=ABCMeta):
@@ -29,7 +29,6 @@ def chunk(self, content: str) -> Iterator[dict]:
class DLJsonChunker(ChunkingExecutor):
    def __init__(
        self,
-        min_chunk_len: Optional[int],
        output_chunk_column_name: str,
        output_jsonpath_column_name: str,
        output_pageno_column_name_key: str,
@@ -40,19 +39,19 @@ def __init__(
        self.output_pageno_column_name_key = output_pageno_column_name_key
        self.output_bbox_column_name_key = output_bbox_column_name_key

-        chunker_kwargs = dict(include_metadata=True)
-        if min_chunk_len is not None:
-            chunker_kwargs["min_chunk_len"] = min_chunk_len
-        self._chunker = HierarchicalChunker(**chunker_kwargs)
+        self._chunker = HierarchicalChunker()

    def chunk(self, content: str) -> Iterator[dict]:
-        doc = DLDocument.model_validate_json(content)
+        doc = DoclingDocument.model_validate_json(content)
        for chunk in self._chunker.chunk(doc):
+            meta = DocMeta.model_validate(chunk.meta)
+            doc_item = meta.doc_items[0]
+            prov = doc_item.prov[0]
            yield {
                self.output_chunk_column_name: chunk.text,
-                self.output_jsonpath_column_name: chunk.path,
-                self.output_pageno_column_name_key: chunk.page,
-                self.output_bbox_column_name_key: chunk.bbox,
+                self.output_jsonpath_column_name: doc_item.self_ref,
+                self.output_pageno_column_name_key: prov.page_no,
+                self.output_bbox_column_name_key: prov.bbox.as_tuple(),
            }
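
For readers tracking the API migration, a minimal standalone sketch of the Docling v2 chunking flow used above (assumptions: `docling-core==2.3.0` is installed and `doc.json` holds a JSON-serialized `DoclingDocument`, e.g. as produced by pdf2parquet):

```python
from pathlib import Path

from docling_core.transforms.chunker import DocMeta, HierarchicalChunker
from docling_core.types.doc import DoclingDocument

# Load a Docling v2 document from its JSON serialization.
doc = DoclingDocument.model_validate_json(Path("doc.json").read_text())

chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):
    meta = DocMeta.model_validate(chunk.meta)  # v2 metadata replaces chunk.path/page/bbox
    doc_item = meta.doc_items[0]               # first document item backing the chunk
    prov = doc_item.prov[0]                    # provenance: page number and bounding box
    print(chunk.text[:60], doc_item.self_ref, prov.page_no, prov.bbox.as_tuple())
```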


@@ -38,6 +38,7 @@
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
    # doc_chunk params
+    # "doc_chunk_dl_min_chunk_len": 10,  # for testing the usage of the deprecated argument
    # "doc_chunk_chunking_type": "li_markdown",
    "doc_chunk_chunking_type": "dl_json",
    # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT,
17 changes: 7 additions & 10 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -26,7 +26,6 @@
content_column_name_key = "content_column_name"
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
-dl_min_chunk_len_key = "dl_min_chunk_len"
chunk_size_tokens_key = "chunk_size_tokens"
chunk_overlap_tokens_key = "chunk_overlap_tokens"
output_chunk_column_name_key = "output_chunk_column_name"
@@ -38,7 +37,6 @@
content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
-dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
@@ -59,7 +57,6 @@ def __str__(self):
default_content_column_name = "contents"
default_doc_id_column_name = "document_id"
default_chunking_type = chunking_types.DL_JSON
-default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_chunk_column_id = "chunk_id"
default_output_source_doc_id_column_name = "source_document_id"
@@ -95,7 +92,6 @@ def __init__(self, config: dict[str, Any]):
        self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

        # Parameters for Docling JSON chunking
-        self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
        self.output_jsonpath_column_name = config.get(
            output_jsonpath_column_name_key, default_output_jsonpath_column_name
        )
@@ -113,7 +109,6 @@ def __init__(self, config: dict[str, Any]):
        self.chunker: ChunkingExecutor
        if self.chunking_type == chunking_types.DL_JSON:
            self.chunker = DLJsonChunker(
-                min_chunk_len=self.dl_min_chunk_len,
                output_chunk_column_name=self.output_chunk_column_name,
                output_jsonpath_column_name=self.output_jsonpath_column_name,
                output_pageno_column_name_key=self.output_pageno_column_name_key,
@@ -202,11 +197,6 @@ def add_input_params(self, parser: ArgumentParser) -> None:
            default=default_doc_id_column_name,
            help="Name of the column containing the doc_id to be propagated in the output",
        )
-        parser.add_argument(
-            f"--{dl_min_chunk_len_cli_param}",
-            default=default_dl_min_chunk_len,
-            help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
-        )
        parser.add_argument(
            f"--{output_chunk_column_name_cli_param}",
            default=default_output_chunk_column_name,
@@ -244,6 +234,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
            type=int,
            help="Number of tokens overlapping between chunks for the fixed-sized chunker.",
        )
+        parser.add_argument(
+            f"--{cli_prefix}dl_min_chunk_len",
+            default=None,
+            help="Deprecated. This option is no longer considered.",
+        )

    def apply_input_params(self, args: Namespace) -> bool:
        """
@@ -254,5 +249,7 @@ def apply_input_params(self, args: Namespace) -> bool:
        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)

        self.params = self.params | captured
+        if self.params.get("dl_min_chunk_len") is not None:
+            self.logger.warning("The `dl_min_chunk_len` option is deprecated and will be ignored. Please stop using it; it will not be accepted anymore in future versions.")
        self.logger.info(f"doc_chunk parameters are : {self.params}")
        return True
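
A short sketch of the resulting CLI behavior (the configuration class name and import path are assumptions for illustration; the flag name and the warning come from the parser and `apply_input_params` above):

```python
from argparse import ArgumentParser

# Hypothetical driver: the class name and module are assumed, not taken from this diff.
from doc_chunk_transform import DocChunkTransformConfiguration

parser = ArgumentParser()
config = DocChunkTransformConfiguration()
config.add_input_params(parser)

# The deprecated flag is still parsed, but apply_input_params only logs a
# deprecation warning and otherwise ignores the value.
args = parser.parse_args(["--doc_chunk_dl_min_chunk_len", "10"])
config.apply_input_params(args)
```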
@@ -5,8 +5,8 @@
        "job name": "doc_chunk",
        "job type": "pure python",
        "job id": "job_id",
-        "start_time": "2024-10-18 14:05:09",
-        "end_time": "2024-10-18 14:05:11",
+        "start_time": "2024-10-30 18:38:40",
+        "end_time": "2024-10-30 18:38:40",
        "status": "success"
    },
    "code": {
@@ -18,7 +18,6 @@
        "chunking_type": "dl_json",
        "content_column_name": "contents",
        "doc_id_column_name": "document_id",
-        "dl_min_chunk_len": null,
        "output_chunk_column_name": "contents",
        "output_source_doc_id_column_name": "source_document_id",
        "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
        "num_processors": 0
    },
    "execution_stats": {
-        "cpus": 27.9,
+        "cpus": 19.5,
        "gpus": 0,
-        "memory": 25.75,
+        "memory": 27.48,
        "object_store": 0,
-        "execution time, min": 0.021
+        "execution time, min": 0.001
    },
    "job_output_stats": {
        "source_files": 1,
-        "source_size": 50276,
+        "source_size": 12073,
        "result_files": 1,
-        "result_size": 31223,
-        "processing_time": 1.266,
+        "result_size": 14363,
+        "processing_time": 0.043,
        "nfiles": 1,
-        "nrows": 88,
+        "nrows": 39,
        "source_doc_count": 1,
-        "result_doc_count": 88
+        "result_doc_count": 39
    },
    "source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
4 changes: 2 additions & 2 deletions transforms/language/doc_chunk/ray/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "dpk_doc_chunk_transform_ray"
version = "0.2.2.dev1"
version = "0.3.0"
requires-python = ">=3.10,<3.13"
description = "chunk documents Ray Transform"
license = {text = "Apache-2.0"}
@@ -11,7 +11,7 @@ authors = [
{ name = "Christoph Auer", email = "cau@zurich.ibm.com" },
]
dependencies = [
-    "dpk-doc-chunk-transform-python==0.2.2.dev1",
+    "dpk-doc-chunk-transform-python==0.3.0",
    "data-prep-toolkit[ray]==0.2.2.dev1",
]

21 changes: 10 additions & 11 deletions transforms/language/doc_chunk/ray/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
        "job name": "doc_chunk",
        "job type": "pure python",
        "job id": "job_id",
-        "start_time": "2024-10-18 14:05:09",
-        "end_time": "2024-10-18 14:05:11",
+        "start_time": "2024-10-30 18:38:40",
+        "end_time": "2024-10-30 18:38:40",
        "status": "success"
    },
    "code": {
@@ -18,7 +18,6 @@
        "chunking_type": "dl_json",
        "content_column_name": "contents",
        "doc_id_column_name": "document_id",
-        "dl_min_chunk_len": null,
        "output_chunk_column_name": "contents",
        "output_source_doc_id_column_name": "source_document_id",
        "output_jsonpath_column_name": "doc_jsonpath",
@@ -35,22 +34,22 @@
        "num_processors": 0
    },
    "execution_stats": {
-        "cpus": 27.9,
+        "cpus": 19.5,
        "gpus": 0,
-        "memory": 25.75,
+        "memory": 27.48,
        "object_store": 0,
-        "execution time, min": 0.021
+        "execution time, min": 0.001
    },
    "job_output_stats": {
        "source_files": 1,
-        "source_size": 50276,
+        "source_size": 12073,
        "result_files": 1,
-        "result_size": 31223,
-        "processing_time": 1.266,
+        "result_size": 14363,
+        "processing_time": 0.043,
        "nfiles": 1,
-        "nrows": 88,
+        "nrows": 39,
        "source_doc_count": 1,
-        "result_doc_count": 88
+        "result_doc_count": 39
    },
    "source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
Binary file modified transforms/language/doc_chunk/ray/test-data/input/test1.parquet
15 changes: 15 additions & 0 deletions transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py
@@ -39,8 +39,11 @@ def compute_exec_params_func(
    runtime_pipeline_id: str,
    runtime_job_id: str,
    runtime_code_location: dict,
+    pdf2parquet_batch_size: int,
    pdf2parquet_do_table_structure: bool,
    pdf2parquet_do_ocr: bool,
+    pdf2parquet_ocr_engine: str,
+    pdf2parquet_bitmap_area_threshold: float,
) -> dict:
from runtime_utils import KFPUtils

@@ -53,8 +56,11 @@
        "runtime_pipeline_id": runtime_pipeline_id,
        "runtime_job_id": runtime_job_id,
        "runtime_code_location": str(runtime_code_location),
+        "pdf2parquet_batch_size": pdf2parquet_batch_size,
        "pdf2parquet_do_table_structure": pdf2parquet_do_table_structure,
        "pdf2parquet_do_ocr": pdf2parquet_do_ocr,
+        "pdf2parquet_ocr_engine": pdf2parquet_ocr_engine,
+        "pdf2parquet_bitmap_area_threshold": pdf2parquet_bitmap_area_threshold,
    }


@@ -112,8 +118,11 @@ def pdf2parquet(
    runtime_pipeline_id: str = "pipeline_id",
    runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
    # pdf2parquet parameters
+    pdf2parquet_batch_size: int = -1,
    pdf2parquet_do_table_structure: bool = True,
    pdf2parquet_do_ocr: bool = False,
+    pdf2parquet_ocr_engine: str = "easyocr",
+    pdf2parquet_bitmap_area_threshold: float = 0.05,
    # additional parameters
    additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}',
):
@@ -150,8 +159,11 @@
    :param runtime_actor_options - actor options
    :param runtime_pipeline_id - pipeline id
    :param runtime_code_location - code location
+    :param pdf2parquet_batch_size - how many inputs to batch into one output table
    :param pdf2parquet_do_table_structure - run table structure model
    :param pdf2parquet_do_ocr - run ocr model
+    :param pdf2parquet_ocr_engine - which ocr engine
+    :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps
    :return: None
    """
# create clean_up task
@@ -169,8 +181,11 @@
        runtime_pipeline_id=runtime_pipeline_id,
        runtime_job_id=run_id,
        runtime_code_location=runtime_code_location,
+        pdf2parquet_batch_size=pdf2parquet_batch_size,
        pdf2parquet_do_table_structure=pdf2parquet_do_table_structure,
        pdf2parquet_do_ocr=pdf2parquet_do_ocr,
+        pdf2parquet_ocr_engine=pdf2parquet_ocr_engine,
+        pdf2parquet_bitmap_area_threshold=pdf2parquet_bitmap_area_threshold,
    )
ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2)
# start Ray cluster
15 changes: 15 additions & 0 deletions transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py
@@ -40,8 +40,11 @@ def compute_exec_params_func(
    runtime_pipeline_id: str,
    runtime_job_id: str,
    runtime_code_location: dict,
+    pdf2parquet_batch_size: int,
    pdf2parquet_do_table_structure: bool,
    pdf2parquet_do_ocr: bool,
+    pdf2parquet_ocr_engine: str,
+    pdf2parquet_bitmap_area_threshold: float,
) -> dict:
from runtime_utils import KFPUtils

@@ -55,8 +58,11 @@
        "runtime_pipeline_id": runtime_pipeline_id,
        "runtime_job_id": runtime_job_id,
        "runtime_code_location": str(runtime_code_location),
+        "pdf2parquet_batch_size": pdf2parquet_batch_size,
        "pdf2parquet_do_table_structure": pdf2parquet_do_table_structure,
        "pdf2parquet_do_ocr": pdf2parquet_do_ocr,
+        "pdf2parquet_ocr_engine": pdf2parquet_ocr_engine,
+        "pdf2parquet_bitmap_area_threshold": pdf2parquet_bitmap_area_threshold,
    }


@@ -116,8 +122,11 @@ def pdf2parquet(
    runtime_pipeline_id: str = "pipeline_id",
    runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
    # pdf2parquet parameters
+    pdf2parquet_batch_size: int = -1,
    pdf2parquet_do_table_structure: bool = True,
    pdf2parquet_do_ocr: bool = False,
+    pdf2parquet_ocr_engine: str = "easyocr",
+    pdf2parquet_bitmap_area_threshold: float = 0.05,
    # additional parameters
    additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}',
):
@@ -154,8 +163,11 @@
    :param runtime_actor_options - actor options
    :param runtime_pipeline_id - pipeline id
    :param runtime_code_location - code location
+    :param pdf2parquet_batch_size - how many inputs to batch into one output table
    :param pdf2parquet_do_table_structure - run table structure model
    :param pdf2parquet_do_ocr - run ocr model
+    :param pdf2parquet_ocr_engine - which ocr engine
+    :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps
    :return: None
    """
# create clean_up task
@@ -174,8 +186,11 @@
        runtime_pipeline_id=runtime_pipeline_id,
        runtime_job_id=run_id,
        runtime_code_location=runtime_code_location,
+        pdf2parquet_batch_size=pdf2parquet_batch_size,
        pdf2parquet_do_table_structure=pdf2parquet_do_table_structure,
        pdf2parquet_do_ocr=pdf2parquet_do_ocr,
+        pdf2parquet_ocr_engine=pdf2parquet_ocr_engine,
+        pdf2parquet_bitmap_area_threshold=pdf2parquet_bitmap_area_threshold,
    )
ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2)
# start Ray cluster
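
For reference, a sketch of launching the updated workflow with the new Docling v2 options (the KFP client call is illustrative; the parameter names and default values match the `pdf2parquet` signature above):

```python
import kfp

# Assumes a reachable Kubeflow Pipelines endpoint; argument values mirror the new defaults.
client = kfp.Client()
client.create_run_from_pipeline_func(
    pdf2parquet,
    arguments={
        "pdf2parquet_batch_size": -1,               # inputs to batch into one output table
        "pdf2parquet_do_table_structure": True,     # run the table-structure model
        "pdf2parquet_do_ocr": False,                # skip the OCR model
        "pdf2parquet_ocr_engine": "easyocr",        # OCR engine selection
        "pdf2parquet_bitmap_area_threshold": 0.05,  # threshold for bitmaps
    },
)
```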
2 changes: 1 addition & 1 deletion transforms/language/pdf2parquet/python/Dockerfile
@@ -32,7 +32,7 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .

# Download models
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
-RUN python -c 'from docling.document_converter import DocumentConverter; s=DocumentConverter.download_models_hf(); print(f"Models cached in {s}")'
+RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")'

# copy the main() entry point to the image
COPY --chown=dpk:root src/pdf2parquet_transform.py ./
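
The same model prefetch can also be run outside the image build, e.g. to warm a local cache before testing (this repeats the call from the updated Dockerfile; the printed path is whatever the library reports):

```python
# Prefetch the Docling v2 PDF pipeline models, as the updated Dockerfile does.
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

artifacts_path = StandardPdfPipeline.download_models_hf()
print(f"Models cached in {artifacts_path}")
```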