Skip to content

Commit

Permalink
semantic-search: increase maximal allowed chunk size, fix error when …
Browse files Browse the repository at this point in the history
…timeout
  • Loading branch information
PrimozGodec committed Nov 17, 2021
1 parent b3511fa commit 0570321
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 5 deletions.
21 changes: 16 additions & 5 deletions orangecontrib/text/semantic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@
import base64
import zlib
import sys
from typing import Any, List, Optional, Tuple, Callable, Union
from typing import Any, List, Optional, Callable, Union

import numpy as np

from Orange.misc.server_embedder import ServerEmbedderCommunicator
from Orange.util import dummy_callback

MAX_PACKAGE_SIZE = 50000
# maximum document size that we still send to the server
MAX_PACKAGE_SIZE = 3000000
# maximum size of a chunk - when a single document is longer, send it as a
# chunk containing only that document
MAX_CHUNK_SIZE = 50000
MIN_CHUNKS = 20


Expand Down Expand Up @@ -85,8 +89,13 @@ def cb(success=True):
return [None] * len(texts)

result = list()
for chunk in result_:
result.extend(chunk)
assert len(result_) == len(chunks)
for res_chunk, orig_chunk in zip(result_, chunks):
if res_chunk is None:
# when embedder fails (Timeout or other error) result will be None
result.extend([None] * len(orig_chunk))
else:
result.extend(res_chunk)

results = list()
idx = 0
Expand All @@ -104,7 +113,9 @@ def _make_chunks(self, encoded_texts, sizes, depth=0):
chunk_sizes = np.array_split(sizes, MIN_CHUNKS if depth == 0 else 2)
result = list()
for i in range(len(chunks)):
if np.sum(chunk_sizes[i]) > MAX_PACKAGE_SIZE:
# requiring more than one text in the chunk prevents infinite recursion
# when a single text is bigger than MAX_CHUNK_SIZE
if len(chunks[i]) > 1 and np.sum(chunk_sizes[i]) > MAX_CHUNK_SIZE:
result.extend(self._make_chunks(chunks[i], chunk_sizes[i], depth + 1))
else:
result.append(chunks[i])
Expand Down
12 changes: 12 additions & 0 deletions orangecontrib/text/tests/test_semantic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,18 @@ def test_success(self):
result = self.semantic_search(self.corpus.documents, QUERIES)
self.assertEqual(result, IDEAL_RESPONSE)

# added None three times since the server will repeat the request on a None
# response three times
@patch(PATCH_METHOD, make_dummy_post(iter(RESPONSE[:-1] + [None] * 3)))
def test_none_result(self):
"""
It can happen that the result of an embedding for a chunk is None (the server
fails to respond three times because of a timeout or other error).
Make sure that semantic search module can handle None responses.
"""
result = self.semantic_search(self.corpus.documents, QUERIES)
self.assertEqual(result, IDEAL_RESPONSE[:-1] + [None])

@patch(PATCH_METHOD, make_dummy_post(RESPONSE[0]))
def test_success_chunks(self):
num_docs = len(self.corpus.documents)
Expand Down

0 comments on commit 0570321

Please sign in to comment.