Skip to content

Commit

Permalink
doc_id and source_doc_id params in doc_chunk (#598)
Browse files Browse the repository at this point in the history
* doc_id and source_doc_id params in doc_chunk

This expands the doc_chunk parameters with the possibility
to tune the propagation of the doc_id from the input tables
to the results.

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* align document_id naming

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
  • Loading branch information
dolfim-ibm authored Sep 18, 2024
1 parent 5d56398 commit 1b8db48
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 140 deletions.
12 changes: 12 additions & 0 deletions transforms/language/doc_chunk/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ which provides the required JSON structure.

When using documents converted to Markdown, the transform leverages the [Llama Index](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) `MarkdownNodeParser`, which is relying on its internal Markdown splitting logic.

## Output format

The output parquet file will contain all the original columns, but the content column will be replaced with the individual chunks.


### Tracing the origin of the chunks

The transform allows tracing the origin of each chunk via the `source_document_id` column, which is set to the value of the `document_id` column (if present) in the input table.
The actual column names can be customized with the parameters described below.


## Running

Expand All @@ -21,8 +31,10 @@ The transform can be tuned with the following parameters.
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a `min_chunk_len=64`. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
| `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. |
| `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. |
Expand Down
23 changes: 22 additions & 1 deletion transforms/language/doc_chunk/python/src/doc_chunk_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,20 @@
# Short transform identifier; used to derive the CLI-argument prefix below.
short_name = "doc_chunk"
# All CLI parameters for this transform are namespaced as "doc_chunk_<key>".
cli_prefix = f"{short_name}_"
# Configuration-dictionary keys (un-prefixed).
content_column_name_key = "content_column_name"
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
output_chunk_column_name_key = "output_chunk_column_name"
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
output_bbox_column_name_key = "output_bbox_column_name"
# Prefixed command-line parameter names corresponding to the keys above,
# e.g. "doc_chunk_content_column_name".
content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"
Expand All @@ -48,9 +52,11 @@ def __str__(self):


# Default values for the configuration keys defined above.
# Input column holding the text to be chunked.
default_content_column_name = "contents"
# Input column holding the document id that is propagated to the output.
default_doc_id_column_name = "document_id"
# Default chunker: Docling JSON chunking.
default_chunking_type = chunking_types.DL_JSON
# None defers to the chunking library's own default minimum chunk length.
default_dl_min_chunk_len = None
# Output column for the chunk text (same name as the input content column).
default_output_chunk_column_name = "contents"
# Output column that records the originating document's id.
default_output_source_doc_id_column_name = "source_document_id"
# Output columns for chunk provenance within the source document.
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
default_output_bbox_column_name = "bbox"
Expand All @@ -76,7 +82,9 @@ def __init__(self, config: dict[str, Any]):
self.chunking_type = config.get(chunking_type_key, default_chunking_type)

self.content_column_name = config.get(content_column_name_key, default_content_column_name)
self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

# Parameters for Docling JSON chunking
self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
Expand Down Expand Up @@ -117,8 +125,11 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
for batch in table.to_batches():
for row in batch.to_pylist():
content: str = row[self.content_column_name]
new_row = {k: v for k, v in row.items() if k not in (self.content_column_name,)}
new_row = {k: v for k, v in row.items() if k not in (self.content_column_name, self.doc_id_column_name)}
if self.doc_id_column_name in row:
new_row[self.output_source_doc_id_column_name] = row[self.doc_id_column_name]
for chunk in self.chunker.chunk(content):
chunk[self.doc_id_column_name] = TransformUtils.str_to_hash(chunk[self.output_chunk_column_name])
data.append(
{
**new_row,
Expand Down Expand Up @@ -167,6 +178,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_content_column_name,
help="Name of the column containing the text to be chunked",
)
parser.add_argument(
f"--{doc_id_column_name_cli_param}",
default=default_doc_id_column_name,
help="Name of the column containing the doc_id to be propagated in the output",
)
parser.add_argument(
f"--{dl_min_chunk_len_cli_param}",
default=default_dl_min_chunk_len,
Expand All @@ -177,6 +193,11 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_output_chunk_column_name,
help="Column name to store the chunks",
)
parser.add_argument(
f"--{output_source_doc_id_column_name_cli_param}",
default=default_output_source_doc_id_column_name,
help="Column name to store the `document_id` from the input table",
)
parser.add_argument(
f"--{output_jsonpath_column_name_cli_param}",
default=default_output_jsonpath_column_name,
Expand Down
100 changes: 53 additions & 47 deletions transforms/language/doc_chunk/python/test-data/expected/metadata.json
Original file line number Diff line number Diff line change
@@ -1,48 +1,54 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-31 21:18:32",
"end_time": "2024-07-31 21:18:32",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "dl_json",
"content_column_name": "contents",
"output_chunk_column_name": "contents",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [".parquet"]
},
"job_output_stats": {
"source_files": 1,
"source_size": 50276,
"result_files": 1,
"result_size": 27400,
"processing_time": 0.051286935806274414,
"nfiles": 1,
"nrows": 88,
"source_doc_count": 1,
"result_doc_count": 88
},
"source": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-09-18 16:05:04",
"end_time": "2024-09-18 16:05:04",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "dl_json",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "contents",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 1,
"source_size": 50276,
"result_files": 1,
"result_size": 31246,
"processing_time": 0.071,
"nfiles": 1,
"nrows": 88,
"source_doc_count": 1,
"result_doc_count": 88
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,46 +1,54 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-30 18:27:44",
"end_time": "2024-07-30 18:27:44",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "li_markdown",
"content_column_name": "contents",
"output_chunk_column_name": "contents",
"output_jsonpath_column_name": "doc_jsonpath",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [".parquet"]
},
"job_output_stats": {
"source_files": 1,
"source_size": 23756,
"result_files": 1,
"result_size": 23781,
"processing_time": 0.014858007431030273,
"nfiles": 1,
"nrows": 18,
"source_doc_count": 1,
"result_doc_count": 18
},
"source": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit-outer/transforms/language/doc_chunk/python/test-data/input_md",
"type": "path"
},
"target": {
"name": "/Users/dol/scratch/dpk-dev/data-prep-kit-outer/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-09-18 16:04:28",
"end_time": "2024-09-18 16:04:29",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "li_markdown",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "contents",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 1,
"source_size": 23756,
"result_files": 1,
"result_size": 25170,
"processing_time": 1.277,
"nfiles": 1,
"nrows": 18,
"source_doc_count": 1,
"result_doc_count": 18
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input_md",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
Loading

0 comments on commit 1b8db48

Please sign in to comment.