Skip to content

Commit

Permalink
feat(array): add the usage of context manager
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbp authored Mar 30, 2022
1 parent 972a6a7 commit f86043b
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 1 deletion.
10 changes: 10 additions & 0 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,16 @@ def __new__(
"""Create a Elastic-powered DocumentArray object."""
...

def __enter__(self):
    """Enter the ``with`` block and hand back this very object.

    :return: ``self``, so ``with da as da_open`` binds ``da_open`` to the same object as ``da``.
    """
    return self

def __exit__(self, *args, **kwargs):
    """
    Ensures that offset2ids are stored in the db after
    operations in the DocumentArray are performed.

    The save is unconditional: the arguments are ignored, so it also runs
    when the ``with`` body raised. Nothing is returned, so such an exception
    is not suppressed and keeps propagating to the caller.

    :param args: exception details supplied by the ``with`` statement; ignored.
    :param kwargs: ignored.
    """
    self._save_offset2ids()

def __new__(cls, *args, storage: str = 'memory', **kwargs):
if cls is DocumentArray:
if storage == 'memory':
Expand Down
32 changes: 32 additions & 0 deletions docs/advanced/document-store/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,38 @@ Using dataclass gives you better type-checking in IDE but requires an extra impo

## Known limitations


### Using a context manager to ensure data and offset synchronization

Modifications of a DocumentArray with a storage backend can be "lazy" and might not be reflected instantly in the DocumentArray.
To ensure that modifications are synchronized with the storage backend, one can use a context manager as in the following snippet:


```python
import numpy as np
from docarray import DocumentArray, Document

storage = 'sqlite'
table_name = 'Test'
connection = './test.db'

da = DocumentArray(
storage=storage,
config={'table_name': table_name, 'connection': connection},
)

with da as da_open:
da_open.append(Document(embedding=np.random.random(128)))
da_open.append(Document(embedding=np.random.random(128)))

# Any changes done to da inside the context manager should be synchronized with the backend now
print(f'len(da)={len(da)}')
print(f'len(da._offset2ids.ids)={len(da._offset2ids.ids)}')
```

This will ensure that the data and offsets are synchronized after the `with` statement.


### Out-of-array modification

One can not take a Document *out* from a DocumentArray and modify it, then expect its modification to be committed back to the DocumentArray.
Expand Down
55 changes: 54 additions & 1 deletion tests/unit/array/test_sequence.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
import uuid

import pytest
import tempfile

from docarray import Document
from docarray import Document, DocumentArray
from docarray.array.memory import DocumentArrayInMemory
from docarray.array.qdrant import DocumentArrayQdrant
from docarray.array.sqlite import DocumentArraySqlite
from docarray.array.storage.sqlite import SqliteConfig
from docarray.array.weaviate import DocumentArrayWeaviate
from docarray.array.elastic import DocumentArrayElastic
from docarray.array.storage.qdrant import QdrantConfig
from docarray.array.storage.weaviate import WeaviateConfig
from docarray.array.storage.elastic import ElasticConfig
import numpy as np

from tests.conftest import tmpfile


@pytest.mark.parametrize(
Expand Down Expand Up @@ -48,5 +55,51 @@ def test_append_extend(da_cls, config, start_storage):
da.append(Document())
da.append(Document())
assert len(da) == 2
# assert len(da._offset2ids.ids) == 2 will not work unless used in a context manager
da.extend([Document(), Document()])
assert len(da) == 4
# assert len(da._offset2ids.ids) == 4 will not work unless used in a context manager


def update_config_inplace(config, tmpdir, tmpfile):
    """Rewrite a storage config dict in place so each test run gets its own resources.

    A ``connection`` entry, when present, is first pointed at ``tmpfile``.
    Afterwards every present entry among ``table_name``, ``connection``,
    ``collection_name`` and ``index_name`` gets ``_<random uuid4 hex>``
    appended. Any other key (for example ``n_dim`` or ``name``) is left
    untouched.

    :param config: dict of backend settings; mutated in place.
    :param tmpdir: accepted for signature compatibility; not used here.
    :param tmpfile: path-like object whose ``str()`` becomes the connection target.
    :return: ``None``; the result is the mutation of ``config``.
    """
    # Redirect the database location before suffixing, so the suffix is
    # appended to the temporary path rather than to the placeholder value.
    if 'connection' in config:
        config['connection'] = str(tmpfile)

    for key in ('table_name', 'connection', 'collection_name', 'index_name'):
        if key not in config:
            continue
        unique_suffix = uuid.uuid4().hex
        config[key] = f'{config[key]}_{unique_suffix}'


@pytest.mark.parametrize(
    'storage, config',
    [
        ('sqlite', {'table_name': 'Test', 'connection': 'sqlite'}),
        ('weaviate', {'n_dim': 3, 'name': 'Weaviate'}),
        ('qdrant', {'n_dim': 3, 'collection_name': 'qdrant'}),
        ('elasticsearch', {'n_dim': 3, 'index_name': 'elasticsearch'}),
    ],
)
def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfile):
    """Check that leaving the ``with`` block persists offsets for a fresh handle.

    Two Documents are appended inside the context manager. Afterwards both the
    original DocumentArray and a second one opened with the same config must
    report two items and two offset ids.
    """
    # Work on a copy: the dicts in the parametrize list are built once at
    # collection time, and ``update_config_inplace`` mutates its argument.
    # The previous ``config = config`` was a no-op, so the shared literal got
    # rewritten on every run.
    config = dict(config)
    update_config_inplace(config, tmpdir, tmpfile)

    da = DocumentArray(storage=storage, config=config)

    with da as da_open:
        da_open.append(Document(embedding=np.random.random(3)))
        da_open.append(Document(embedding=np.random.random(3)))

    assert len(da) == 2
    assert len(da._offset2ids.ids) == 2

    # A second handle on the same backend resources must see the persisted state.
    da2 = DocumentArray(storage=storage, config=config)

    assert len(da2) == 2
    assert len(da2._offset2ids.ids) == 2

    del da
    del da2

0 comments on commit f86043b

Please sign in to comment.