Skip to content

Commit

Permalink
feat: support single metadata dictionary in TikaDocumentConverter (#6698)
Browse files Browse the repository at this point in the history

* reno

* converter

* test

* comment
  • Loading branch information
ZanSara authored Jan 9, 2024
1 parent 93b2aae commit 974d65f
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 17 deletions.
29 changes: 17 additions & 12 deletions haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from haystack.lazy_imports import LazyImport
from haystack import component, Document
from haystack.dataclasses import ByteStream
from haystack.components.converters.utils import get_bytestream_from_source
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata


with LazyImport("Run 'pip install tika'") as tika_import:
Expand All @@ -31,7 +31,10 @@ class TikaDocumentConverter:
from haystack.components.converters.tika import TikaDocumentConverter
converter = TikaDocumentConverter()
results = converter.run(sources=["sample.docx", "my_document.rtf", "archive.zip"])
results = converter.run(
sources=["sample.docx", "my_document.rtf", "archive.zip"],
meta={"date_added": datetime.now().isoformat()}
)
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the docx file.'
Expand All @@ -48,24 +51,26 @@ def __init__(self, tika_url: str = "http://localhost:9998/tika"):
self.tika_url = tika_url

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
def run(
self,
sources: List[Union[str, Path, ByteStream]],
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Convert files to Documents.
:param sources: List of file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""

documents = []
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Adds support for passing a single metadata dictionary to `TikaDocumentConverter`; its content is added to the metadata of all produced Documents.
13 changes: 8 additions & 5 deletions test/components/converters/test_tika_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,19 @@ def test_run(self, mock_tika_parser):
assert len(documents) == 1
assert documents[0].content == "Content of mock source"

def test_run_with_meta(self):
def test_run_with_meta(self, test_files_path):
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

converter = TikaDocumentConverter()
with patch("haystack.components.converters.tika.tika_parser.from_buffer"):
output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
document = output["documents"][0]
output = converter.run(
sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"}
)

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"author": "test_author", "language": "it"}
# check that the metadata from the sources is merged with that from the meta parameter
assert output["documents"][0].meta["author"] == "test_author"
assert output["documents"][0].meta["language"] == "it"
assert output["documents"][1].meta["language"] == "it"

def test_run_nonexistent_file(self, caplog):
component = TikaDocumentConverter()
Expand Down

0 comments on commit 974d65f

Please sign in to comment.