Skip to content

Commit

Permalink
doc_id and source_doc_id params in doc_chunk (#598)
Browse files Browse the repository at this point in the history
* doc_id and source_doc_id params in doc_chunk

This expands the doc_chunk parameters with the possibility
to tune the propagation of the doc_id from the input tables
to the results.

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* align document_id naming

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
  • Loading branch information
dolfim-ibm authored Sep 18, 2024
1 parent 5d56398 commit 1b8db48
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 140 deletions.
12 changes: 12 additions & 0 deletions transforms/language/doc_chunk/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ which provides the required JSON structure.

When using documents converted to Markdown, the transform leverages the [Llama Index](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) `MarkdownNodeParser`, which is relying on its internal Markdown splitting logic.

## Output format

The output parquet file will contain all the original columns, but the content column will be replaced with the individual chunks.


### Tracing the origin of the chunks

The transform allows tracing the origin of each chunk via the `source_document_id` column, which is set to the value of the `document_id` column (if present) in the input table.
The actual column names can be customized with the parameters described below.


## Running

Expand All @@ -21,8 +31,10 @@ The transform can be tuned with the following parameters.
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
| `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. |
| `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. |
Expand Down
23 changes: 22 additions & 1 deletion transforms/language/doc_chunk/python/src/doc_chunk_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,20 @@
# Short transform identifier; used to derive the CLI-argument prefix below.
short_name = "doc_chunk"
# All CLI parameters for this transform are namespaced as "doc_chunk_<key>".
cli_prefix = f"{short_name}_"
# Configuration-dictionary keys (un-prefixed).
content_column_name_key = "content_column_name"
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
output_chunk_column_name_key = "output_chunk_column_name"
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
output_bbox_column_name_key = "output_bbox_column_name"
# Prefixed command-line parameter names corresponding to the keys above,
# e.g. "doc_chunk_content_column_name".
content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"
Expand All @@ -48,9 +52,11 @@ def __str__(self):


# Default values for the configuration keys defined above.
# Input column holding the text to be chunked.
default_content_column_name = "contents"
# Input column holding the document id that is propagated to the output.
default_doc_id_column_name = "document_id"
# Default chunker: Docling JSON chunking.
default_chunking_type = chunking_types.DL_JSON
# None defers to the chunking library's own default minimum chunk length.
default_dl_min_chunk_len = None
# Output column for the chunk text (same name as the input content column).
default_output_chunk_column_name = "contents"
# Output column that records the originating document's id.
default_output_source_doc_id_column_name = "source_document_id"
# Output columns for chunk provenance within the source document.
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
default_output_bbox_column_name = "bbox"
Expand All @@ -76,7 +82,9 @@ def __init__(self, config: dict[str, Any]):
self.chunking_type = config.get(chunking_type_key, default_chunking_type)

self.content_column_name = config.get(content_column_name_key, default_content_column_name)
self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

# Parameters for Docling JSON chunking
self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
Expand Down Expand Up @@ -117,8 +125,11 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
for batch in table.to_batches():
for row in batch.to_pylist():
content: str = row[self.content_column_name]
new_row = {k: v for k, v in row.items() if k not in (self.content_column_name,)}
new_row = {k: v for k, v in row.items() if k not in (self.content_column_name, self.doc_id_column_name)}
if self.doc_id_column_name in row:
new_row[self.output_source_doc_id_column_name] = row[self.doc_id_column_name]
for chunk in self.chunker.chunk(content):
chunk[self.doc_id_column_name] = TransformUtils.str_to_hash(chunk[self.output_chunk_column_name])
data.append(
{
**new_row,
Expand Down Expand Up @@ -167,6 +178,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_content_column_name,
help="Name of the column containing the text to be chunked",
)
parser.add_argument(
f"--{doc_id_column_name_cli_param}",
default=default_doc_id_column_name,
help="Name of the column containing the doc_id to be propagated in the output",
)
parser.add_argument(
f"--{dl_min_chunk_len_cli_param}",
default=default_dl_min_chunk_len,
Expand All @@ -177,6 +193,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_output_chunk_column_name,
help="Column name to store the chunks",
)
parser.add_argument(
f"--{output_source_doc_id_column_name_cli_param}",
default=default_output_source_doc_id_column_name,
help="Column name to store the `document_id` from the input table",
)
parser.add_argument(
f"--{output_jsonpath_column_name_cli_param}",
default=default_output_jsonpath_column_name,
Expand Down
100 changes: 53 additions & 47 deletions transforms/language/doc_chunk/python/test-data/expected/metadata.json
Original file line number Diff line number Diff line change
@@ -1,48 +1,54 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-31 21:18:32",
"end_time": "2024-07-31 21:18:32",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "dl_json",
"content_column_name": "contents",
"output_chunk_column_name": "contents",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [".parquet"]
},
"job_output_stats": {
"source_files": 1,
"source_size": 50276,
"result_files": 1,
"result_size": 27400,
"processing_time": 0.051286935806274414,
"nfiles": 1,
"nrows": 88,
"source_doc_count": 1,
"result_doc_count": 88
},
"source": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-09-18 16:05:04",
"end_time": "2024-09-18 16:05:04",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "dl_json",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "contents",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 1,
"source_size": 50276,
"result_files": 1,
"result_size": 31246,
"processing_time": 0.071,
"nfiles": 1,
"nrows": 88,
"source_doc_count": 1,
"result_doc_count": 88
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,46 +1,54 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-30 18:27:44",
"end_time": "2024-07-30 18:27:44",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "li_markdown",
"content_column_name": "contents",
"output_chunk_column_name": "contents",
"output_jsonpath_column_name": "doc_jsonpath",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [".parquet"]
},
"job_output_stats": {
"source_files": 1,
"source_size": 23756,
"result_files": 1,
"result_size": 23781,
"processing_time": 0.014858007431030273,
"nfiles": 1,
"nrows": 18,
"source_doc_count": 1,
"result_doc_count": 18
},
"source": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit-outer/transforms/language/doc_chunk/python/test-data/input_md",
"type": "path"
},
"target": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit-outer/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-09-18 16:04:28",
"end_time": "2024-09-18 16:04:29",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "li_markdown",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "contents",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 1,
"source_size": 23756,
"result_files": 1,
"result_size": 25170,
"processing_time": 1.277,
"nfiles": 1,
"nrows": 18,
"source_doc_count": 1,
"result_doc_count": 18
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input_md",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
Loading

0 comments on commit 1b8db48

Please sign in to comment.