fix: float32 matchidx overflow

jina-ai · Apr 13, 2020 · 4750555 · 4750555
1 parent 813738b
commit 4750555
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 6 deletions.
diff --git a/jina/drivers/prune.py b/jina/drivers/prune.py
@@ -7,10 +7,14 @@ class ChunkPruneDriver(BaseDriver):
     Removed fields are ``embedding``, ``raw_bytes``, ``blob``, ``text``.
     """
 
+    def __init__(self, pruned=('embedding', 'raw_bytes', 'blob', 'text'), *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.pruned = pruned
+
     def __call__(self, *args, **kwargs):
         for d in self.req.docs:
             for c in d.chunks:
-                for k in ('embedding', 'raw_bytes', 'blob', 'text'):
+                for k in self.pruned:
                     c.ClearField(k)
 
 
@@ -20,9 +24,13 @@ class DocPruneDriver(BaseDriver):
     Removed fields are ``chunks``
     """
 
+    def __init__(self, pruned=('chunks',), *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.pruned = pruned
+
     def __call__(self, *args, **kwargs):
         for d in self.req.docs:
-            for k in ('chunks',):
+            for k in self.pruned:
                 d.ClearField(k)
 
 

diff --git a/jina/drivers/score.py b/jina/drivers/score.py
@@ -23,7 +23,9 @@ def __call__(self, *args, **kwargs):
                     query_chunk_meta[c.chunk_id] = pb_obj2dict(c, exec.required_keys)
                     match_chunk_meta[k.match_chunk.chunk_id] = pb_obj2dict(k.match_chunk, exec.required_keys)
 
-            match_idx = np.array(match_idx, dtype=np.float32)
+            # the dtype must represent every np.uint32 value exactly; float32 cannot,
+            # so large chunk_ids would be silently corrupted without any error
+            match_idx = np.array(match_idx, dtype=np.float64)
 
             doc_idx = self.exec_fn(match_idx, query_chunk_meta, match_chunk_meta)
 

diff --git a/jina/peapods/gateway.py b/jina/peapods/gateway.py
@@ -112,7 +112,8 @@ def prefetch_req(num_req, fetch_to):
                     return False
 
                 with TimeContext(f'prefetching {self.args.prefetch} requests', self.logger):
-                    self.logger.info('if this takes too long, you may want to reduce "--prefetch"')
+                    self.logger.warning('if this takes too long, you may want to take smaller "--prefetch" or '
+                                        'ask client to reduce "--batch-size"')
                     is_req_empty = prefetch_req(self.args.prefetch, prefetch_task)
 
                 while not (zmqlet.msg_sent == zmqlet.msg_recv != 0 and is_req_empty):

diff --git a/jina/resources/executors.requests.DocPbIndexer.yml b/jina/resources/executors.requests.DocPbIndexer.yml
@@ -2,10 +2,13 @@ on:
   ControlRequest:
     - !ControlReqDriver {}
   SearchRequest:
-    - !DocPbIndexDriver
+    - !DocPbSearchDriver
       with:
         method: query
   IndexRequest:
-    - !DocPbSearchDriver
+    - !DocPruneDriver
+      with:
+        pruned: [raw_bytes]  # must be a sequence: the driver iterates it and clears each named field
+    - !DocPbIndexDriver
       with:
         method: add