Reject unsupported metadata value types in SQLDocumentStore.

MetaDocumentORM gains a `validates("value")` hook that raises TypeError for
values whose type is not one of str, int, float, bool, bytes, bytearray or
NoneType. write_documents() now builds the metadata rows one by one, catches
that TypeError per field, logs it with the document id, and skips the field.
The new test expects both documents to be stored with only their valid
metadata fields.

NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line
breaks and indentation were reconstructed from the hunk line counts and Python
syntax -- verify the context lines against the target tree before applying.

diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py
index 7ada22beee..5248801cdc 100644
--- a/haystack/document_stores/sql.py
+++ b/haystack/document_stores/sql.py
@@ -21,7 +21,7 @@
         ForeignKeyConstraint,
     )
     from sqlalchemy.ext.declarative import declarative_base
-    from sqlalchemy.orm import relationship, sessionmaker
+    from sqlalchemy.orm import relationship, sessionmaker, validates
     from sqlalchemy.sql import case, null
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
@@ -73,6 +73,17 @@ class MetaDocumentORM(ORMBase):
         {},
     )  # type: ignore
 
+    valid_metadata_types = (str, int, float, bool, bytes, bytearray, type(None))
+
+    @validates("value")
+    def validate_value(self, key, value):
+        if not isinstance(value, self.valid_metadata_types):
+            raise TypeError(
+                f"Discarded metadata '{self.name}', since it has invalid type: {type(value).__name__}.\n"
+                f"SQLDocumentStore can accept and cast to string only the following types: {', '.join([el.__name__ for el in self.valid_metadata_types])}"
+            )
+        return value
+
 
 class LabelORM(ORMBase):
     __tablename__ = "label"
@@ -386,7 +397,12 @@ def write_documents(
             for doc in document_objects[i : i + batch_size]:
                 meta_fields = doc.meta or {}
                 vector_id = meta_fields.pop("vector_id", None)
-                meta_orms = [MetaDocumentORM(name=key, value=value) for key, value in meta_fields.items()]
+                meta_orms = []
+                for key, value in meta_fields.items():
+                    try:
+                        meta_orms.append(MetaDocumentORM(name=key, value=value))
+                    except TypeError as ex:
+                        logger.error(f"Document {doc.id} - {ex}")
                 doc_mapping = {
                     "id": doc.id,
                     "content": doc.to_dict()["content"],
diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py
index 6af5edeb85..fbdcb3cb3f 100644
--- a/test/document_stores/test_document_store.py
+++ b/test/document_stores/test_document_store.py
@@ -453,6 +453,30 @@ def test_write_document_meta(document_store: BaseDocumentStore):
     assert document_store.get_document_by_id("4").meta["meta_field"] == "test4"
 
 
+@pytest.mark.parametrize("document_store", ["sql"], indirect=True)
+def test_write_document_sql_invalid_meta(document_store: BaseDocumentStore):
+    documents = [
+        {
+            "content": "dict_with_invalid_meta",
+            "valid_meta_field": "test1",
+            "invalid_meta_field": [1, 2, 3],
+            "name": "filename1",
+            "id": "1",
+        },
+        Document(
+            content="document_object_with_invalid_meta",
+            meta={"valid_meta_field": "test2", "invalid_meta_field": [1, 2, 3], "name": "filename2"},
+            id="2",
+        ),
+    ]
+    document_store.write_documents(documents)
+    documents_in_store = document_store.get_all_documents()
+    assert len(documents_in_store) == 2
+
+    assert document_store.get_document_by_id("1").meta == {"name": "filename1", "valid_meta_field": "test1"}
+    assert document_store.get_document_by_id("2").meta == {"name": "filename2", "valid_meta_field": "test2"}
+
+
 def test_write_document_index(document_store: BaseDocumentStore):
     document_store.delete_index("haystack_test_one")
     document_store.delete_index("haystack_test_two")