Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: store metadata using JSON in SQLDocumentStore #3547

Merged
merged 6 commits into from
Nov 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 20 additions & 19 deletions haystack/document_stores/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import logging
import itertools
import json
from uuid import uuid4

import numpy as np
Expand All @@ -20,9 +21,10 @@
JSON,
ForeignKeyConstraint,
UniqueConstraint,
TypeDecorator,
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker, validates
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.sql import case, null
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
Expand All @@ -38,6 +40,20 @@
Base = declarative_base() # type: Any


class ArrayType(TypeDecorator):
    """Column type that serializes arbitrary metadata values to JSON text.

    Values are stored in a plain ``String`` column as their JSON
    representation and deserialized back to Python objects on load. This
    lets the document store persist compound metadata (lists, dicts) that
    the previous ``String(1000)`` column rejected.
    """

    # Underlying storage is an ordinary String column holding JSON text.
    impl = String
    # The type's behavior depends only on its immutable configuration,
    # so SQLAlchemy may safely cache statements that use it.
    cache_ok = True

    def process_bind_param(self, value, dialect):
        """Serialize *value* to JSON text before it is written to the DB.

        ``None`` is passed through unchanged so absent values are stored as
        SQL NULL rather than the JSON string ``"null"`` — this keeps the
        type symmetric with :meth:`process_result_value`, which likewise
        passes ``None`` through untouched.
        """
        if value is None:
            return None
        return json.dumps(value)

    def process_result_value(self, value, dialect):
        """Deserialize the stored JSON text back into a Python value.

        SQL NULL arrives here as ``None`` and is returned as-is.
        """
        if value is not None:
            return json.loads(value)
        return value


class ORMBase(Base):
__abstract__ = True

Expand All @@ -64,7 +80,7 @@ class MetaDocumentORM(ORMBase):
__tablename__ = "meta_document"

name = Column(String(100), index=True)
value = Column(String(1000), index=True)
value = Column(ArrayType(100), index=True)
documents = relationship("DocumentORM", back_populates="meta")

document_id = Column(String(100), nullable=False, index=True)
Expand All @@ -76,17 +92,6 @@ class MetaDocumentORM(ORMBase):
{},
) # type: ignore

valid_metadata_types = (str, int, float, bool, bytes, bytearray, type(None))

@validates("value")
def validate_value(self, key, value):
if not isinstance(value, self.valid_metadata_types):
raise TypeError(
f"Discarded metadata '{self.name}', since it has invalid type: {type(value).__name__}.\n"
f"SQLDocumentStore can accept and cast to string only the following types: {', '.join([el.__name__ for el in self.valid_metadata_types])}"
)
return value


class LabelORM(ORMBase):
__tablename__ = "label"
Expand Down Expand Up @@ -298,6 +303,7 @@ def _query(
).filter_by(index=index)

if filters:
logger.warning("filters won't work on metadata fields containing compound data types")
parsed_filter = LogicalFilterClause.parse(filters)
select_ids = parsed_filter.convert_to_sql(MetaDocumentORM)
documents_query = documents_query.filter(DocumentORM.id.in_(select_ids))
Expand Down Expand Up @@ -402,12 +408,7 @@ def write_documents(
if "classification" in meta_fields:
meta_fields = self._flatten_classification_meta_fields(meta_fields)
vector_id = meta_fields.pop("vector_id", None)
meta_orms = []
for key, value in meta_fields.items():
try:
meta_orms.append(MetaDocumentORM(name=key, value=value))
except TypeError as ex:
logger.error("Document %s - %s", doc.id, ex)
meta_orms = [MetaDocumentORM(name=key, value=value) for key, value in meta_fields.items()]
doc_orm = DocumentORM(
id=doc.id,
content=doc.to_dict()["content"],
Expand Down
36 changes: 9 additions & 27 deletions test/document_stores/test_sql.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

import pytest

from haystack.document_stores.sql import SQLDocumentStore
Expand All @@ -24,28 +26,6 @@ def test_delete_index(self, ds, documents):
ds.delete_index(index="custom_index")
assert ds.get_document_count(index="custom_index") == 0

@pytest.mark.integration
def test_sql_write_document_invalid_meta(self, ds):
documents = [
{
"content": "dict_with_invalid_meta",
"valid_meta_field": "test1",
"invalid_meta_field": [1, 2, 3],
"name": "filename1",
"id": "1",
},
Document(
content="document_object_with_invalid_meta",
meta={"valid_meta_field": "test2", "invalid_meta_field": [1, 2, 3], "name": "filename2"},
id="2",
),
]
ds.write_documents(documents)
documents_in_store = ds.get_all_documents()
assert len(documents_in_store) == 2
assert ds.get_document_by_id("1").meta == {"name": "filename1", "valid_meta_field": "test1"}
assert ds.get_document_by_id("2").meta == {"name": "filename2", "valid_meta_field": "test2"}

@pytest.mark.integration
def test_sql_write_different_documents_same_vector_id(self, ds):
doc1 = {"content": "content 1", "name": "doc1", "id": "1", "vector_id": "vector_id"}
Expand Down Expand Up @@ -98,13 +78,15 @@ def test_sql_get_documents_using_nested_filters_about_classification(self, ds):
assert len(ds.get_all_documents(filters={"classification.score": {"$gt": 0.95}})) == 0
assert len(ds.get_all_documents(filters={"classification.label": ["LABEL_100"]})) == 0

# NOTE: the SQLDocumentStore behaves differently to the others when filters are applied.
# While this should be considered a bug, the relative tests are skipped in the meantime
# NOTE: the SQLDocumentStore marshals metadata values with JSON so querying
# using filters doesn't always work. While this should be considered a bug,
# the relative tests are either customized or skipped while we work on a fix.

@pytest.mark.skip
@pytest.mark.integration
def test_ne_filters(self, ds, documents):
pass
def test_ne_filters(self, ds, caplog):
with caplog.at_level(logging.WARNING):
ds.get_all_documents(filters={"year": {"$ne": "2020"}})
assert "filters won't work on metadata fields" in caplog.text

@pytest.mark.skip
@pytest.mark.integration
Expand Down