feat: partial db lookup, document entities lookup and render (#145)

New features:
- Look up entities in documents
- Make a cropped SVG of figures
- Make an SVG of a document page with a bbox overlay
- Expose partial lookup in the patents DB

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
DS4SD · Nov 24, 2023 · 106de7c · 106de7c
1 parent 0073b37
commit 106de7c
Show file tree

Hide file tree

Showing 4 changed files with 140 additions and 22 deletions.
diff --git a/deepsearch/chemistry/queries/molecules.py b/deepsearch/chemistry/queries/molecules.py
@@ -94,6 +94,7 @@ def MoleculeQuery(
 def MoleculesInPatentsQuery(
     patents: Union[str, List[str]],
     num_items: int = 10,
+    partial_lookup: bool = False,
 ) -> Query:
     """
     List all molecules contained in a list of patents.
@@ -117,6 +118,7 @@ def MoleculesInPatentsQuery(
                     for v in patents
                 ]
             },
+            "partial_references": partial_lookup,
             "limit": num_items,
         },
         coordinates=KnowledgeDbResource(),

diff --git a/deepsearch/documents/core/export.py b/deepsearch/documents/core/export.py
@@ -2,28 +2,7 @@
 
 from tabulate import tabulate
 
-
-def _resolve_item(item, doc):
-    """
-    Return the resolved item in the document.
-    If the item is referencing another part of the document, it will be retrieved.
-    """
-
-    # TODO: improve the function for handling proper json path (not only two segments)
-    try:
-        if "$ref" in item:
-            parts = item["$ref"].split("/")
-            citem = doc[parts[1]][int(parts[2])]
-        elif "__ref" in item:
-            parts = item["__ref"].split("/")
-            citem = doc[parts[1]][int(parts[2])]
-        else:
-            citem = item
-
-        return citem
-
-    except KeyError:
-        return None
+from deepsearch.documents.core.lookup import _resolve_item
 
 
 def export_to_markdown(document: Dict[str, Any]) -> str:

diff --git a/deepsearch/documents/core/lookup.py b/deepsearch/documents/core/lookup.py
@@ -0,0 +1,75 @@
+from typing import Any, Dict, List
+
+from pydantic import BaseModel
+
+
+def _resolve_item(item, doc):
+    """
+    Return the resolved item in the document.
+
+    If the item is referencing another part of the document (it has a "$ref"
+    or a "__ref" key), the referenced entry is retrieved from the document,
+    otherwise the item itself is returned.
+
+    Returns None when the reference cannot be resolved: unknown section,
+    index out of range, or a reference which is not of the form
+    "<prefix>/<section>/<index>".
+    """
+
+    # TODO: improve the function for handling proper json path (not only two segments)
+    for ref_key in ("$ref", "__ref"):
+        if ref_key not in item:
+            continue
+
+        parts = item[ref_key].split("/")
+        try:
+            return doc[parts[1]][int(parts[2])]
+        except (KeyError, IndexError, ValueError):
+            # KeyError: unknown section; IndexError: too few path segments or
+            # index out of range; ValueError: the index is not an integer.
+            return None
+
+    return item
+
+
+class EntitiesLookup:
+    """
+    Lookup of the entities found in the "main-text" items of a document.
+
+    The lookup is built once, when the object is created, and maps each entity
+    (both its "match" and its "original" value) to the items mentioning it.
+    """
+
+    class _MatchedEntry(BaseModel):
+        # Position of the item in document["main-text"]
+        index: int
+        # Path of the item in the document, i.e. "main-text.<index>"
+        doc_path: str
+        # Value of the "type" field of the resolved item
+        type: str
+        # The resolved item
+        content: Dict[str, Any]
+
+    def __init__(self, document: Dict[str, Any]):
+        self.document = document
+        self._lookup: Dict[
+            str, Dict[str, List[EntitiesLookup._MatchedEntry]]
+        ] = {}  # {entity_type: {entity_instance: [reference, ...]}}
+        self._build()
+
+    def _build(self):
+        """
+        Build the internal lookup structure for entities in a document.
+
+        Items which cannot be resolved or which have no "entities" are skipped.
+        """
+        for ix, item in enumerate(self.document["main-text"]):
+            item = _resolve_item(item, self.document)
+            # _resolve_item returns None when a reference cannot be resolved
+            if item is None or "entities" not in item:
+                continue
+
+            me = EntitiesLookup._MatchedEntry(
+                index=ix,
+                doc_path=f"main-text.{ix}",
+                type=item["type"],
+                content=item,
+            )
+
+            for entity_type, entities in item["entities"].items():
+                type_lookup = self._lookup.setdefault(entity_type, {})
+
+                for entity in entities:
+                    # A set, such that the item is added only once when the
+                    # "match" and the "original" values are the same
+                    matches = {entity["match"], entity["original"]}
+                    for match in matches:
+                        type_lookup.setdefault(match, []).append(me)
+
+    def get(self, *, entity_type: str, entity: str) -> List[_MatchedEntry]:
+        """
+        Lookup where a given entity is mentioned in a document.
+
+        Returns an empty list when the entity type or the entity is unknown.
+        """
+        return self._lookup.get(entity_type, {}).get(entity, [])
diff --git a/deepsearch/documents/core/render.py b/deepsearch/documents/core/render.py
@@ -0,0 +1,62 @@
+from typing import Any, Dict
+
+
+def get_figure_svg(document: Dict[str, Any], figure: Dict[str, Any]):
+    """
+    Generates a SVG which crops the figure from the image of the document page.
+
+    The page number and the bounding-box are read from the first "prov" entry
+    of the figure. An empty string is returned when the document has no
+    dimensions or no image for that page.
+    """
+    # Imported here such that the function does not depend on module-level imports
+    from xml.sax.saxutils import quoteattr
+
+    page_no = figure["prov"][0]["page"]
+    doc_page_dims = next(
+        (p for p in document["page-dimensions"] if p["page"] == page_no), None
+    )
+    if doc_page_dims is None:
+        return ""
+
+    s3_page_image = next(
+        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
+    )
+    if s3_page_image is None:
+        return ""
+
+    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
+    x1, y1, x2, y2 = figure["prov"][0]["bbox"]
+
+    # quoteattr() escapes the URL (e.g. the "&" of a query string) and adds the
+    # surrounding quotes, such that the attribute is always well-formed
+    page_url = quoteattr(str(s3_page_image["url"]))
+
+    # NOTE(review): "ph - y2" flips the y axis, which presumes that the bbox
+    # is given with a bottom-left origin - confirm with the document format
+    svg = f"""
+    <svg viewBox="{x1} {ph - y2} {x2 - x1} {y2 - y1}">
+        <image width="{pw}" height="{ph}" href={page_url} />
+    </svg>
+    """
+    return svg
+
+
+def get_page_svg_with_item(document: Dict[str, Any], item: Dict[str, Any]):
+    """
+    Generates a SVG which overlays the bounding-box of the item with the image of the page.
+
+    The page number and the bounding-box are read from the first "prov" entry
+    of the item. An empty string is returned when the document has no
+    dimensions or no image for that page.
+    """
+    # Imported here such that the function does not depend on module-level imports
+    from xml.sax.saxutils import quoteattr
+
+    page_no = item["prov"][0]["page"]
+    doc_page_dims = next(
+        (p for p in document["page-dimensions"] if p["page"] == page_no), None
+    )
+    if doc_page_dims is None:
+        return ""
+
+    s3_page_image = next(
+        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
+    )
+    if s3_page_image is None:
+        return ""
+
+    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
+    x1, y1, x2, y2 = item["prov"][0]["bbox"]
+
+    # quoteattr() escapes the URL (e.g. the "&" of a query string) and adds the
+    # surrounding quotes, such that the attribute is always well-formed
+    page_url = quoteattr(str(s3_page_image["url"]))
+
+    # NOTE(review): "ph - y2" flips the y axis, which presumes that the bbox
+    # is given with a bottom-left origin - confirm with the document format
+    svg = f"""
+    <svg viewBox="0 0 {pw} {ph}">
+        <image width="{pw}" height="{ph}" href={page_url} />
+        <rect x="{x1}" y="{ph - y2}" width="{x2 - x1}" height="{y2 - y1}" style="stroke-width:0.03;stroke:rgb(0,0,255);fill-opacity:0" />
+    </svg>
+    """
+    return svg