Skip to content

Commit

Permalink
feat: add support for single meta dict in TextFileToDocument (#6606)
Browse files Browse the repository at this point in the history
* add support for single meta dict

* reno

* reno

* mypy

* extract to function

* docstring

* mypy
  • Loading branch information
ZanSara authored Dec 21, 2023
1 parent 7cc6080 commit cf79aa1
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 10 deletions.
22 changes: 13 additions & 9 deletions haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from haystack import Document, component
from haystack.dataclasses import ByteStream
from haystack.components.converters.utils import get_bytestream_from_source
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -38,25 +38,29 @@ def __init__(self, encoding: str = "utf-8"):
self.encoding = encoding

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
def run(
self,
sources: List[Union[str, Path, ByteStream]],
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Convert text files to Documents.
:param sources: A list of paths to text files or ByteStream objects.
Note that if an encoding is specified in the metadata of a ByteStream,
it will override the component's default.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
documents = []

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")
meta_list = normalize_metadata(meta, sources_count=len(sources))

for source, metadata in zip(sources, meta):
for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
Expand Down
24 changes: 23 additions & 1 deletion haystack/components/converters/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Union
from typing import List, Union, Dict, Any, Optional

from haystack.dataclasses import ByteStream

Expand All @@ -18,3 +18,25 @@ def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStre
bs.meta["file_path"] = str(source)
return bs
raise ValueError(f"Unsupported source type {type(source)}")


def normalize_metadata(
    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
) -> List[Dict[str, Any]]:
    """
    Given all the possible values of the meta input for a converter (None, dictionary or list of dicts),
    makes sure to return a list of dictionaries of the correct length for the converter to use.

    :param meta: the meta input of the converter, as-is
    :param sources_count: the number of sources the converter received
    :returns: a list of dictionaries of the same length as the sources list
    :raises ValueError: if meta is a list whose length differs from sources_count,
        or if meta is neither None, a dictionary nor a list
    """
    if meta is None:
        # Build a fresh dict per source: `[{}] * n` would repeat the *same* dict object n times,
        # so mutating one entry would silently change all of them.
        return [{} for _ in range(sources_count)]
    if isinstance(meta, dict):
        # Shallow-copy for the same reason, and so that the caller's dictionary is never
        # modified through one of the returned entries.
        return [dict(meta) for _ in range(sources_count)]
    if isinstance(meta, list):
        if sources_count != len(meta):
            raise ValueError("The length of the metadata list must match the number of sources.")
        return meta
    raise ValueError("meta must be either None, a dictionary or a list of dictionaries.")
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Adds support for single metadata dictionary input in `TextFileToDocument`.
29 changes: 29 additions & 0 deletions test/components/converters/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pytest
from haystack.components.converters.utils import normalize_metadata


def test_normalize_metadata_None():
    """When ``meta`` is None the result holds one empty dictionary per source."""
    for count in (1, 3):
        expected = [{} for _ in range(count)]
        assert normalize_metadata(None, sources_count=count) == expected


def test_normalize_metadata_single_dict():
    """A single dictionary is repeated so that every source gets an equal entry."""
    for count in (1, 3):
        expected = [{"a": 1} for _ in range(count)]
        assert normalize_metadata({"a": 1}, sources_count=count) == expected


def test_normalize_metadata_list_of_right_size():
    """A list with exactly one entry per source is accepted and yields the same entries."""
    one_entry = [{"a": 1}]
    three_entries = [{"a": 1}, {"b": 2}, {"c": 3}]

    assert normalize_metadata(one_entry, sources_count=1) == [{"a": 1}]
    assert normalize_metadata(three_entries, sources_count=3) == [{"a": 1}, {"b": 2}, {"c": 3}]


def test_normalize_metadata_list_of_wrong_size():
    """A list that is shorter or longer than the number of sources is rejected with ValueError."""
    # Fewer metadata entries than sources.
    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
        normalize_metadata([{"a": 1}], sources_count=3)
    # More metadata entries than sources. The call is expected to raise, so it is not wrapped
    # in `assert`: an assertion on its return value would never be evaluated.
    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
        normalize_metadata([{"a": 1}, {"b": 2}, {"c": 3}], sources_count=1)


def test_normalize_metadata_other_type():
    """A value that is neither None, a dict nor a list (here a tuple) is rejected with ValueError."""
    expected_message = "meta must be either None, a dictionary or a list of dictionaries."
    tuple_meta = ({"a": 1},)
    with pytest.raises(ValueError, match=expected_message):
        normalize_metadata(tuple_meta, sources_count=1)

0 comments on commit cf79aa1

Please sign in to comment.