semantic-search: increase maximum allowed chunk size, fix error on timeout #752

Merged 1 commit on Nov 17, 2021
21 changes: 16 additions & 5 deletions orangecontrib/text/semantic_search.py
@@ -2,14 +2,18 @@
import base64
import zlib
import sys
-from typing import Any, List, Optional, Tuple, Callable, Union
+from typing import Any, List, Optional, Callable, Union

import numpy as np

from Orange.misc.server_embedder import ServerEmbedderCommunicator
from Orange.util import dummy_callback

-MAX_PACKAGE_SIZE = 50000
+# maximum document size that we still send to the server
+MAX_PACKAGE_SIZE = 3000000
+# maximum size of a chunk - when one document is longer, send it as a chunk
+# with a single document
+MAX_CHUNK_SIZE = 50000
MIN_CHUNKS = 20


@@ -85,8 +89,13 @@ def cb(success=True):
return [None] * len(texts)

result = list()
-for chunk in result_:
-result.extend(chunk)
+assert len(result_) == len(chunks)
+for res_chunk, orig_chunk in zip(result_, chunks):
+if res_chunk is None:
+# when the embedder fails (timeout or other error) the result is None
+result.extend([None] * len(orig_chunk))
+else:
+result.extend(res_chunk)

results = list()
idx = 0
@@ -104,7 +113,9 @@ def _make_chunks(self, encoded_texts, sizes, depth=0):
chunk_sizes = np.array_split(sizes, MIN_CHUNKS if depth == 0 else 2)
result = list()
for i in range(len(chunks)):
-if np.sum(chunk_sizes[i]) > MAX_PACKAGE_SIZE:
+# checking that the chunk has more than one text prevents infinite recursion
+# when a single text is bigger than MAX_CHUNK_SIZE
+if len(chunks[i]) > 1 and np.sum(chunk_sizes[i]) > MAX_CHUNK_SIZE:
result.extend(self._make_chunks(chunks[i], chunk_sizes[i], depth + 1))
else:
result.append(chunks[i])
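A minimal, self-contained sketch (hypothetical helper name make_chunks; not the module's actual code) of why the guarded recursion above terminates: a chunk that already holds a single text is emitted as-is even when it exceeds MAX_CHUNK_SIZE, so one oversized document can no longer trigger endless re-splitting.

import numpy as np

MAX_CHUNK_SIZE = 50_000   # same value the PR introduces
MIN_CHUNKS = 20

def make_chunks(items, sizes, depth=0):
    # Split into MIN_CHUNKS pieces first, then halve on each recursion.
    chunks = np.array_split(np.asarray(items), MIN_CHUNKS if depth == 0 else 2)
    chunk_sizes = np.array_split(np.asarray(sizes), MIN_CHUNKS if depth == 0 else 2)
    result = []
    for chunk, sz in zip(chunks, chunk_sizes):
        # A one-item chunk is kept even if oversized; without this guard an
        # oversized single text would be re-split forever.
        if len(chunk) > 1 and np.sum(sz) > MAX_CHUNK_SIZE:
            result.extend(make_chunks(chunk, sz, depth + 1))
        else:
            result.append(list(chunk))
    return result

# 30 small texts plus one 120k-character text: the big one ends up alone in
# its own chunk and the recursion stops.
sizes = [1_000] * 30 + [120_000]
texts = [f"doc{i}" for i in range(len(sizes))]
chunks = make_chunks(texts, sizes)
assert sum(len(c) for c in chunks) == len(texts)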
12 changes: 12 additions & 0 deletions orangecontrib/text/tests/test_semantic_search.py
@@ -106,6 +106,18 @@ def test_success(self):
result = self.semantic_search(self.corpus.documents, QUERIES)
self.assertEqual(result, IDEAL_RESPONSE)

+# None is added three times since the server will repeat the request three
+# times on a None response
+@patch(PATCH_METHOD, make_dummy_post(iter(RESPONSE[:-1] + [None] * 3)))
+def test_none_result(self):
+"""
+It can happen that the embedding result for a chunk is None (the server
+fails to respond three times because of a timeout or another error).
+Make sure that the semantic search module can handle None responses.
+"""
+result = self.semantic_search(self.corpus.documents, QUERIES)
+self.assertEqual(result, IDEAL_RESPONSE[:-1] + [None])

@patch(PATCH_METHOD, make_dummy_post(RESPONSE[0]))
def test_success_chunks(self):
num_docs = len(self.corpus.documents)
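And a hedged sketch of the result merging that test_none_result exercises (merge_chunk_results is a hypothetical stand-in; in the PR this logic lives inline in SemanticSearch.__call__): a chunk whose embedding request ultimately failed contributes one None per document it contained, keeping the output aligned with the input documents.

from typing import List, Optional

def merge_chunk_results(chunks, chunk_results):
    # Each failed chunk (None) expands to one None per document it held, so
    # the flattened output keeps the original document order and length.
    assert len(chunk_results) == len(chunks)
    flat: List[Optional[list]] = []
    for res_chunk, orig_chunk in zip(chunk_results, chunks):
        if res_chunk is None:
            flat.extend([None] * len(orig_chunk))
        else:
            flat.extend(res_chunk)
    return flat

# Second chunk timed out; both of its documents map to None, mirroring the
# expectation in test_none_result above.
chunks = [["doc1", "doc2"], ["doc3", "doc4"]]
results = [[[(0, 0.9)], [(1, 0.8)]], None]
assert merge_chunk_results(chunks, results) == [[(0, 0.9)], [(1, 0.8)], None, None]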