deepset-ai · ZanSara · Dec 21, 2023 · Dec 20, 2023 · Dec 20, 2023 · Dec 20, 2023
@@ -4,7 +4,7 @@
 
 from haystack import Document, component
 from haystack.dataclasses import ByteStream
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 
 
 logger = logging.getLogger(__name__)
@@ -38,25 +38,29 @@ def __init__(self, encoding: str = "utf-8"):
         self.encoding = encoding
 
     @component.output_types(documents=List[Document])
-    def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
         """
         Convert text files to Documents.
 
         :param sources: A list of paths to text files or ByteStream objects.
           Note that if an encoding is specified in the metadata of a ByteStream,
           it will override the component's default.
-        :param meta: Optional list of metadata to attach to the Documents.
-          The length of the list must match the number of sources. Defaults to `None`.
+        :param meta: Optional metadata to attach to the Documents.
+          This value can be either a list of dictionaries or a single dictionary.
+          If it's a single dictionary, its content is added to the metadata of all produced Documents.
+          If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+          Defaults to `None`.
         :return: A dictionary containing a list of Document objects under the 'documents' key.
         """
         documents = []
 
-        if meta is None:
-            meta = [{}] * len(sources)
-        elif len(sources) != len(meta):
-            raise ValueError("The length of the metadata list must match the number of sources.")
+        meta_list = normalize_metadata(meta, sources_count=len(sources))
 
-        for source, metadata in zip(sources, meta):
+        for source, metadata in zip(sources, meta_list):
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e:

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Union
+from typing import List, Union, Dict, Any, Optional
 
 from haystack.dataclasses import ByteStream
 
@@ -18,3 +18,25 @@ def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStre
         bs.metadata["file_path"] = str(source)
         return bs
     raise ValueError(f"Unsupported source type {type(source)}")
+
+
+def normalize_metadata(
+    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
+) -> List[Dict[str, Any]]:
+    """
+    Given all the possible value of the meta input for a converter (None, dictionary or list of dicts),
+    makes sure to return a list of dictionaries of the correct length for the converter to use.
+
+    :param meta: the meta input of the converter, as-is
+    :sources_count: the number of sources the converter received
+    :returns: a list of dictionaries of the make length as the sources list
+    """
+    if meta is None:
+        return [{}] * sources_count
+    if isinstance(meta, dict):
+        return [meta] * sources_count
+    if isinstance(meta, list):
+        if sources_count != len(meta):
+            raise ValueError("The length of the metadata list must match the number of sources.")
+        return meta
+    raise ValueError("meta must be either None, a dictionary or a list of dictionaries.")
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Adds support for single metadata dictionary input in `TextFileToDocument``.
@@ -0,0 +1,29 @@
+import pytest
+from haystack.components.converters.utils import normalize_metadata
+
+
+def test_normalize_metadata_None():
+    assert normalize_metadata(None, sources_count=1) == [{}]
+    assert normalize_metadata(None, sources_count=3) == [{}, {}, {}]
+
+
+def test_normalize_metadata_single_dict():
+    assert normalize_metadata({"a": 1}, sources_count=1) == [{"a": 1}]
+    assert normalize_metadata({"a": 1}, sources_count=3) == [{"a": 1}, {"a": 1}, {"a": 1}]
+
+
+def test_normalize_metadata_list_of_right_size():
+    assert normalize_metadata([{"a": 1}], sources_count=1) == [{"a": 1}]
+    assert normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=3) == [{"a": 1}, {"b": 2}, {"c": 3}]
+
+
+def test_normalize_metadata_list_of_wrong_size():
+    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
+        normalize_metadata([{"a": 1}], sources_count=3)
+    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
+        assert normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=1)
+
+
+def test_normalize_metadata_other_type():
+    with pytest.raises(ValueError, match="meta must be either None, a dictionary or a list of dictionaries."):
+        normalize_metadata(({"a": 1},), sources_count=1)