Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update custom_el #481

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions annotators/custom_entity_linking/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
FROM tensorflow/tensorflow:1.15.2-gpu

WORKDIR /src

RUN apt-key del 7fa2af80 && \
# install cuda packages
rm -f /etc/apt/sources.list.d/cuda*.list && \
curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
-o cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb
RUN apt-get -y update
RUN apt-get install -y build-essential zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget llvm \
libncurses5-dev libncursesw5-dev xz-utils libffi-dev liblzma-dev

# install auxiliary packages for sqlite3
RUN apt-get -y update && apt-get install -y build-essential zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget llvm \
libncurses5-dev libncursesw5-dev xz-utils libffi-dev liblzma-dev && \
# install git and sqlite3
RUN apt-get -y update && \
apt-get install -y software-properties-common && \
apt-get install git -y && apt-get install -y sqlite3
apt-get update && apt-get install git -y

RUN apt-get install -y sqlite3

ARG LANGUAGE=EN
ENV LANGUAGE ${LANGUAGE}
Expand All @@ -25,15 +28,14 @@ ARG SED_ARG=" | "
ENV CONFIG=$CONFIG
ENV PORT=$PORT

COPY ./annotators/custom_entity_linking/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt
RUN python -c "import nltk; nltk.download('stopwords')"
COPY ./annotators/custom_entity_linking/requirements.txt ./requirements.txt
RUN pip install -r ./requirements.txt

COPY $SRC_DIR .

COPY $SRC_DIR /src

WORKDIR /src
RUN python -m deeppavlov install $CONFIG

RUN sed -i "s|$SED_ARG|g" "$CONFIG"

CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8153
CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:${PORT}
4 changes: 2 additions & 2 deletions annotators/custom_entity_linking/custom_entity_linking.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"chainer": {
"in": ["user_ids", "entity_substr", "entity_tags", "sentences"],
"in": ["user_ids", "entity_substr", "entity_tags"],
"pipe": [
{
"class_name": "src.entity_linking:EntityLinker",
"in": ["user_ids", "entity_substr", "entity_tags", "sentences"],
"in": ["user_ids", "entity_substr", "entity_tags"],
"out": ["entity_ids", "entity_conf", "entity_id_tags"],
"load_path": "{DOWNLOADS_PATH}/entity_linking_eng/custom_el_eng_dream",
"rank_in_runtime": true,
Expand Down
158 changes: 91 additions & 67 deletions annotators/custom_entity_linking/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,82 +37,106 @@ def add_entities():
return {}


def preprocess_hist_utt(context_batch):
    """Trim each dialog history down to (at most) its last two utterances.

    One-character utterances are dropped, and every kept utterance gets a
    trailing period unless it already ends in sentence-final punctuation.

    Args:
        context_batch: list of dialog histories, each a list of utterance
            strings (most recent last).

    Returns:
        A list of the same length as ``context_batch``; each element holds
        the last one or two normalized utterances, or an empty list when the
        history contains no usable utterance.
    """
    opt_context_batch = []
    for hist_utt in context_batch:
        # Drop degenerate one-character utterances (e.g. stray punctuation).
        hist_utt = [utt for utt in hist_utt if len(utt) > 1]
        # Bug fix: the original indexed hist_utt[-1] unconditionally and
        # raised IndexError when the filtered history was empty.
        if not hist_utt:
            opt_context_batch.append([])
            continue
        last_utt = hist_utt[-1]
        if last_utt[-1] not in {".", "!", "?"}:
            last_utt = f"{last_utt}."
        if len(hist_utt) > 1:
            prev_utt = hist_utt[-2]
            if prev_utt[-1] not in {".", "!", "?"}:
                prev_utt = f"{prev_utt}."
            opt_context_batch.append([prev_utt, last_utt])
        else:
            opt_context_batch.append([last_utt])
    return opt_context_batch


def define_entity_info(prex_info, context, substr_list, ids_list, conf_list, id_tags_list):
    """Assemble per-entity linking info, filtering out abstract candidates.

    Candidate ids whose tag starts with ``Abstract`` are kept only when the
    extracted relation is an abstract one and the substring is not preceded
    in the context by a determiner/possessive ("the", "my", "his", "her").

    Returns a list of dicts with keys ``entity_substr``, ``entity_ids``,
    ``confidences``, ``tokens_match_conf``, ``entity_id_tags``; substrings
    left with no surviving candidates are omitted.
    """
    # property_extraction output may arrive wrapped in a list; unwrap it.
    if isinstance(prex_info, list) and prex_info:
        prex_info = prex_info[0]
    triplet = prex_info.get("triplet", {}) if prex_info else {}
    rel = triplet.get("relation", triplet.get("property", ""))

    entity_info_list = []
    for substr, ids, confs, id_tags in zip(substr_list, ids_list, conf_list, id_tags_list):
        has_determiner = any(f" {word} {substr}" in context for word in ["the", "my", "his", "her"])
        is_abstract = rel.lower().replace("_", " ") in abstract_rels and not has_determiner

        # Keep a candidate unless it is abstract-tagged while the relation is not abstract.
        kept = [
            (entity_id, conf, id_tag)
            for entity_id, conf, id_tag in zip(ids, confs, id_tags)
            if is_abstract or not id_tag.startswith("Abstract")
        ]
        if kept:
            f_ids, f_confs, f_id_tags = (list(column) for column in zip(*kept))
            entity_info_list.append(
                {
                    "entity_substr": substr,
                    "entity_ids": f_ids,
                    "confidences": [float(elem[2]) for elem in f_confs],
                    "tokens_match_conf": [float(elem[0]) for elem in f_confs],
                    "entity_id_tags": f_id_tags,
                }
            )
    return entity_info_list


@app.route("/model", methods=["POST"])
def respond():
st_time = time.time()
user_ids = request.json.get("user_id", [""])
substr_batch = request.json.get("entity_substr", [[""]])
tags_batch = request.json.get("entity_tags", [["" for _ in substr_list] for substr_list in substr_batch])
entity_substr_batch = request.json.get("entity_substr", [[""]])
entity_tags_batch = request.json.get(
"entity_tags",
[["" for _ in entity_substr_list] for entity_substr_list in entity_substr_batch],
)
context_batch = request.json.get("context", [[""]])
prex_info_batch = request.json.get("property_extraction", [dict() for _ in substr_batch])
opt_context_batch = preprocess_hist_utt(context_batch)
prex_info_batch = request.json.get("property_extraction", [{} for _ in entity_substr_batch])
opt_context_batch = []
logger.info(f"init context: {context_batch}")
for hist_uttr in context_batch:
if len(hist_uttr) == 1:
opt_context_batch.append(hist_uttr[0])
else:
prev_uttr = hist_uttr[-2]
cur_uttr = hist_uttr[-1]
is_q = (
any([prev_uttr.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]])
or "?" in prev_uttr
)
if is_q and len(cur_uttr.split()) < 3:
opt_context_batch.append(f"{prev_uttr} {cur_uttr}")
else:
opt_context_batch.append(cur_uttr)

entity_info_batch = [[dict()] for _ in substr_batch]
logger.info(f"context batch: {opt_context_batch}")
entity_info_batch = [[{}] for _ in entity_substr_batch]
try:
substr_batch, ids_batch, conf_batch, id_tags_batch = el(user_ids, substr_batch, tags_batch, opt_context_batch)
(
entity_substr_batch,
entity_ids_batch,
conf_batch,
entity_id_tags_batch,
) = el(user_ids, entity_substr_batch, entity_tags_batch)
entity_info_batch = []
for (substr_list, ids_list, conf_list, id_tags_list, prex_info, context) in zip(
substr_batch, ids_batch, conf_batch, id_tags_batch, prex_info_batch, opt_context_batch
for (
entity_substr_list,
entity_ids_list,
conf_list,
entity_id_tags_list,
prex_info,
context,
) in zip(
entity_substr_batch,
entity_ids_batch,
conf_batch,
entity_id_tags_batch,
prex_info_batch,
opt_context_batch,
):
if context:
context = " ".join(context)
else:
context = ""
entity_info_list = define_entity_info(prex_info, context, substr_list, ids_list, conf_list, id_tags_list)
entity_info_list = []
triplets = {}
if isinstance(prex_info, list) and prex_info:
prex_info = prex_info[0]
if prex_info:
triplets = prex_info.get("triplets", {})
obj2rel_dict = {}
for triplet in triplets:
obj = triplet["object"].lower()
if "relation" in triplet:
rel = triplet["relation"]
elif "property" in triplet:
rel = triplet["property"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

может else: rel = "" для красоты?

else:
rel = ""
obj2rel_dict[obj] = rel
for entity_substr, entity_ids, confs, entity_id_tags in zip(
entity_substr_list,
entity_ids_list,
conf_list,
entity_id_tags_list,
):
entity_info = {}
entity_substr = entity_substr.lower()
logger.info(f"context -- {context}")
context = context.lower()
curr_rel = obj2rel_dict.get(entity_substr, "")
is_abstract = curr_rel.lower().replace("_", " ") in abstract_rels and not any(
[f" {word} {entity_substr}" in context for word in ["the", "my", "his", "her"]]
)

f_entity_ids, f_confs, f_entity_id_tags = [], [], []
for entity_id, conf, entity_id_tag in zip(entity_ids, confs, entity_id_tags):
if entity_id_tag.startswith("Abstract") and not is_abstract:
pass
else:
f_entity_ids.append(entity_id)
f_confs.append(conf)
f_entity_id_tags.append(entity_id_tag)

if f_entity_ids and entity_substr in context:
entity_info["entity_substr"] = entity_substr
entity_info["entity_ids"] = f_entity_ids
entity_info["confidences"] = [float(elem[2]) for elem in f_confs]
entity_info["tokens_match_conf"] = [float(elem[0]) for elem in f_confs]
entity_info["entity_id_tags"] = f_entity_id_tags
entity_info_list.append(entity_info)
entity_info_batch.append(entity_info_list)
except Exception as e:
sentry_sdk.capture_exception(e)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Environment for the custom_entity_linking annotator container.
# CONFIG is the DeepPavlov pipeline config consumed by server.py.
CONFIG: custom_entity_linking.json
# NOTE(review): 8153 is also hard-coded in the service descriptor's
# build args and port mapping — keep the three in sync.
SERVICE_PORT: 8153
SRC_DIR: annotators/custom_entity_linking/
SERVICE_NAME: custom_entity_linking
# Flask application module (server.py) loaded via `flask run` / gunicorn.
FLASK_APP: server
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name: custom-entity-linking
endpoints:
  - model
  - add_entities
compose:
  env_file:
    - .env
  build:
    args:
      CONFIG: custom_entity_linking.json
      PORT: 8153
      SRC_DIR: annotators/custom_entity_linking
    context: ./
    dockerfile: annotators/custom_entity_linking/Dockerfile
  deploy:
    resources:
      limits:
        memory: 128M
      reservations:
        memory: 128M
  volumes:
    - "./annotators/custom_entity_linking:/src"
    - "~/.deeppavlov:/root/.deeppavlov"
  ports:
    - 8153:8153
proxy: null
Loading