Skip to content

Commit

Permalink
feat: store metadata using JSON in SQLDocumentStore (#3547)
Browse files Browse the repository at this point in the history
* add warnings

* make the field cachable

* review comment
  • Loading branch information
masci authored Nov 18, 2022
1 parent 1399681 commit ea75e2a
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 46 deletions.
39 changes: 20 additions & 19 deletions haystack/document_stores/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import logging
import itertools
import json
from uuid import uuid4

import numpy as np
Expand All @@ -20,9 +21,10 @@
JSON,
ForeignKeyConstraint,
UniqueConstraint,
TypeDecorator,
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker, validates
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.sql import case, null
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
Expand All @@ -38,6 +40,20 @@
Base = declarative_base() # type: Any


class ArrayType(TypeDecorator):
    """Column type that persists arbitrary JSON-serializable metadata values as JSON text.

    Backed by a plain ``String`` column, so compound values (lists, dicts) can be
    stored in the metadata table at the cost of SQL-side filtering on them.
    """

    impl = String
    cache_ok = True  # the type carries no per-instance state, so SQLAlchemy may cache it

    def process_bind_param(self, value, dialect):
        # Outbound: serialize the Python value to its JSON text representation.
        return json.dumps(value)

    def process_result_value(self, value, dialect):
        # Inbound: SQL NULL arrives as None and is passed through unchanged;
        # everything else is JSON text produced by process_bind_param.
        return value if value is None else json.loads(value)


class ORMBase(Base):
__abstract__ = True

Expand All @@ -64,7 +80,7 @@ class MetaDocumentORM(ORMBase):
__tablename__ = "meta_document"

name = Column(String(100), index=True)
value = Column(String(1000), index=True)
value = Column(ArrayType(100), index=True)
documents = relationship("DocumentORM", back_populates="meta")

document_id = Column(String(100), nullable=False, index=True)
Expand All @@ -76,17 +92,6 @@ class MetaDocumentORM(ORMBase):
{},
) # type: ignore

valid_metadata_types = (str, int, float, bool, bytes, bytearray, type(None))

@validates("value")
def validate_value(self, key, value):
if not isinstance(value, self.valid_metadata_types):
raise TypeError(
f"Discarded metadata '{self.name}', since it has invalid type: {type(value).__name__}.\n"
f"SQLDocumentStore can accept and cast to string only the following types: {', '.join([el.__name__ for el in self.valid_metadata_types])}"
)
return value


class LabelORM(ORMBase):
__tablename__ = "label"
Expand Down Expand Up @@ -298,6 +303,7 @@ def _query(
).filter_by(index=index)

if filters:
logger.warning("filters won't work on metadata fields containing compound data types")
parsed_filter = LogicalFilterClause.parse(filters)
select_ids = parsed_filter.convert_to_sql(MetaDocumentORM)
documents_query = documents_query.filter(DocumentORM.id.in_(select_ids))
Expand Down Expand Up @@ -402,12 +408,7 @@ def write_documents(
if "classification" in meta_fields:
meta_fields = self._flatten_classification_meta_fields(meta_fields)
vector_id = meta_fields.pop("vector_id", None)
meta_orms = []
for key, value in meta_fields.items():
try:
meta_orms.append(MetaDocumentORM(name=key, value=value))
except TypeError as ex:
logger.error("Document %s - %s", doc.id, ex)
meta_orms = [MetaDocumentORM(name=key, value=value) for key, value in meta_fields.items()]
doc_orm = DocumentORM(
id=doc.id,
content=doc.to_dict()["content"],
Expand Down
36 changes: 9 additions & 27 deletions test/document_stores/test_sql.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

import pytest

from haystack.document_stores.sql import SQLDocumentStore
Expand All @@ -24,28 +26,6 @@ def test_delete_index(self, ds, documents):
ds.delete_index(index="custom_index")
assert ds.get_document_count(index="custom_index") == 0

@pytest.mark.integration
def test_sql_write_document_invalid_meta(self, ds):
documents = [
{
"content": "dict_with_invalid_meta",
"valid_meta_field": "test1",
"invalid_meta_field": [1, 2, 3],
"name": "filename1",
"id": "1",
},
Document(
content="document_object_with_invalid_meta",
meta={"valid_meta_field": "test2", "invalid_meta_field": [1, 2, 3], "name": "filename2"},
id="2",
),
]
ds.write_documents(documents)
documents_in_store = ds.get_all_documents()
assert len(documents_in_store) == 2
assert ds.get_document_by_id("1").meta == {"name": "filename1", "valid_meta_field": "test1"}
assert ds.get_document_by_id("2").meta == {"name": "filename2", "valid_meta_field": "test2"}

@pytest.mark.integration
def test_sql_write_different_documents_same_vector_id(self, ds):
doc1 = {"content": "content 1", "name": "doc1", "id": "1", "vector_id": "vector_id"}
Expand Down Expand Up @@ -98,13 +78,15 @@ def test_sql_get_documents_using_nested_filters_about_classification(self, ds):
assert len(ds.get_all_documents(filters={"classification.score": {"$gt": 0.95}})) == 0
assert len(ds.get_all_documents(filters={"classification.label": ["LABEL_100"]})) == 0

# NOTE: the SQLDocumentStore behaves differently to the others when filters are applied.
# While this should be considered a bug, the relative tests are skipped in the meantime
# NOTE: the SQLDocumentStore marshals metadata values with JSON so querying
# using filters doesn't always work. While this should be considered a bug,
# the relative tests are either customized or skipped while we work on a fix.

@pytest.mark.skip
@pytest.mark.integration
def test_ne_filters(self, ds, documents):
pass
def test_ne_filters(self, ds, caplog):
with caplog.at_level(logging.WARNING):
ds.get_all_documents(filters={"year": {"$ne": "2020"}})
assert "filters won't work on metadata fields" in caplog.text

@pytest.mark.skip
@pytest.mark.integration
Expand Down

0 comments on commit ea75e2a

Please sign in to comment.