diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py index eed81fe306..51d4a59b95 100644 --- a/haystack/components/converters/tika.py +++ b/haystack/components/converters/tika.py @@ -6,7 +6,7 @@ from haystack.lazy_imports import LazyImport from haystack import component, Document from haystack.dataclasses import ByteStream -from haystack.components.converters.utils import get_bytestream_from_source +from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata with LazyImport("Run 'pip install tika'") as tika_import: @@ -31,7 +31,10 @@ class TikaDocumentConverter: from haystack.components.converters.tika import TikaDocumentConverter converter = TikaDocumentConverter() - results = converter.run(sources=["sample.docx", "my_document.rtf", "archive.zip"]) + results = converter.run( + sources=["sample.docx", "my_document.rtf", "archive.zip"], + meta={"date_added": datetime.now().isoformat()} + ) documents = results["documents"] print(documents[0].content) # 'This is a text from the docx file.' @@ -48,24 +51,26 @@ def __init__(self, tika_url: str = "http://localhost:9998/tika"): self.tika_url = tika_url @component.output_types(documents=List[Document]) - def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None): + def run( + self, + sources: List[Union[str, Path, ByteStream]], + meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + ): """ Convert files to Documents. :param sources: List of file paths or ByteStream objects. - :param meta: Optional list of metadata to attach to the Documents. - The length of the list must match the number of sources. Defaults to `None`. + :param meta: Optional metadata to attach to the Documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of sources, because the two lists will be zipped. + Defaults to `None`. :return: A dictionary containing a list of Document objects under the 'documents' key. """ - documents = [] + meta_list = normalize_metadata(meta=meta, sources_count=len(sources)) - if meta is None: - meta = [{}] * len(sources) - elif len(sources) != len(meta): - raise ValueError("The length of the metadata list must match the number of sources.") - - for source, metadata in zip(sources, meta): + for source, metadata in zip(sources, meta_list): try: bytestream = get_bytestream_from_source(source) except Exception as e: diff --git a/releasenotes/notes/single-meta-in-tikaconverter-89b454c451a2ed93.yaml b/releasenotes/notes/single-meta-in-tikaconverter-89b454c451a2ed93.yaml new file mode 100644 index 0000000000..fcd1671ad5 --- /dev/null +++ b/releasenotes/notes/single-meta-in-tikaconverter-89b454c451a2ed93.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Adds support for single metadata dictionary input in `TikaDocumentConverter`. diff --git a/test/components/converters/test_tika_doc_converter.py b/test/components/converters/test_tika_doc_converter.py index 23c1fa92f0..d98483c075 100644 --- a/test/components/converters/test_tika_doc_converter.py +++ b/test/components/converters/test_tika_doc_converter.py @@ -18,16 +18,19 @@ def test_run(self, mock_tika_parser): assert len(documents) == 1 assert documents[0].content == "Content of mock source" - def test_run_with_meta(self): + def test_run_with_meta(self, test_files_path): bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"}) converter = TikaDocumentConverter() with patch("haystack.components.converters.tika.tika_parser.from_buffer"): - output = converter.run(sources=[bytestream], meta=[{"language": "it"}]) - document = output["documents"][0] + output = converter.run( + sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"} + ) - # check that the metadata from the bytestream is merged with that from the meta parameter - assert document.meta == {"author": "test_author", "language": "it"} + # check that the metadata from the sources is merged with that from the meta parameter + assert output["documents"][0].meta["author"] == "test_author" + assert output["documents"][0].meta["language"] == "it" + assert output["documents"][1].meta["language"] == "it" def test_run_nonexistent_file(self, caplog): component = TikaDocumentConverter()