Skip to content

Commit

Permalink
feat: partial db lookup, document entities lookup and render (#145)
Browse files Browse the repository at this point in the history
New features
- Lookup entities in documents
- Make cropped SVG of figures
- Make SVG of a document page with a bbox overlay
- Expose partial lookup in patents DB

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
  • Loading branch information
dolfim-ibm authored Nov 24, 2023
1 parent 0073b37 commit 106de7c
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 22 deletions.
2 changes: 2 additions & 0 deletions deepsearch/chemistry/queries/molecules.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def MoleculeQuery(
def MoleculesInPatentsQuery(
patents: Union[str, List[str]],
num_items: int = 10,
partial_lookup: bool = False,
) -> Query:
"""
List all molecules contained in a list of patents.
Expand All @@ -117,6 +118,7 @@ def MoleculesInPatentsQuery(
for v in patents
]
},
"partial_references": partial_lookup,
"limit": num_items,
},
coordinates=KnowledgeDbResource(),
Expand Down
23 changes: 1 addition & 22 deletions deepsearch/documents/core/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,7 @@

from tabulate import tabulate


def _resolve_item(item, doc):
"""
Return the resolved item in the document.
If the item is referencing another part of the document, it will be retrieved.
"""

# TODO: improve the function for handling proper json path (not only two segments)
try:
if "$ref" in item:
parts = item["$ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
elif "__ref" in item:
parts = item["__ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
else:
citem = item

return citem

except KeyError:
return None
from deepsearch.documents.core.lookup import _resolve_item


def export_to_markdown(document: Dict[str, Any]) -> str:
Expand Down
75 changes: 75 additions & 0 deletions deepsearch/documents/core/lookup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Any, Dict, List

from pydantic import BaseModel


def _resolve_item(item, doc):
"""
Return the resolved item in the document.
If the item is referencing another part of the document, it will be retrieved.
"""

# TODO: improve the function for handling proper json path (not only two segments)
try:
if "$ref" in item:
parts = item["$ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
elif "__ref" in item:
parts = item["__ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
else:
citem = item

return citem

except KeyError:
return None


class EntitiesLookup:
    """
    Index of the entities annotated on the "main-text" items of a document.

    The index is built once at construction time and maps
    entity type -> entity string -> list of the items mentioning it.
    An entity is indexed under both its "match" and its "original" string.
    """

    class _MatchedEntry(BaseModel):
        # Position of the item in document["main-text"].
        index: int
        # Dotted path of the item, in the form "main-text.<index>".
        doc_path: str
        # Value of the resolved item's "type" field.
        type: str
        # The resolved item itself.
        content: Dict[str, Any]

    def __init__(self, document: Dict[str, Any]):
        """
        Store the document and build the lookup structure for it.
        """
        self.document = document
        self._lookup: Dict[
            str, Dict[str, List[EntitiesLookup._MatchedEntry]]
        ] = {}  # {entity_type: {entity_instance: [reference, ...]}}
        self._build()

    def _build(self):
        """
        Build the internal lookup structure for entities in a document.
        """
        for ix, item in enumerate(self.document["main-text"]):
            item = _resolve_item(item, self.document)
            # _resolve_item returns None for references it cannot resolve;
            # such items carry no entities and must not abort the build.
            if item is None or "entities" not in item:
                continue

            me = EntitiesLookup._MatchedEntry(
                index=ix,
                doc_path=f"main-text.{ix}",
                type=item["type"],
                content=item,
            )

            for entity_type, entities in item["entities"].items():
                by_instance = self._lookup.setdefault(entity_type, {})

                for entity in entities:
                    # A set, so that an entity whose "match" equals its
                    # "original" is registered only once for this item.
                    for match in {entity["match"], entity["original"]}:
                        by_instance.setdefault(match, []).append(me)

    def get(self, *, entity_type: str, entity: str) -> List[_MatchedEntry]:
        """
        Lookup where a given entity is mentioned in a document.

        Returns the matched entries for `entity` of type `entity_type`, or an
        empty list when the type or the entity is not present in the index.
        """
        return self._lookup.get(entity_type, {}).get(entity, [])
62 changes: 62 additions & 0 deletions deepsearch/documents/core/render.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from typing import Any, Dict


def get_figure_svg(document: Dict[str, Any], figure: Dict[str, Any]) -> str:
    """
    Generate an SVG which crops the figure from the image of the document page.

    The page number and bounding box are read from the first entry of
    `figure["prov"]`. The page size comes from `document["page-dimensions"]`
    and the page image URL from `document["_s3_data"]["pdf-images"]`.

    Returns the SVG markup, or an empty string when the document has no
    dimensions or no image for the page of the figure.
    """
    from html import escape

    prov = figure["prov"][0]
    page_no = prov["page"]

    doc_page_dims = next(
        (p for p in document["page-dimensions"] if p["page"] == page_no), None
    )
    if doc_page_dims is None:
        return ""

    s3_page_image = next(
        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
    )
    if s3_page_image is None:
        return ""

    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
    x1, y1, x2, y2 = prov["bbox"]

    # Escape the URL so that "&" in a query string or a stray quote cannot
    # break the attribute.
    page_url = escape(str(s3_page_image["url"]))

    # The viewBox top edge is ph - y2, i.e. the bbox y axis is flipped
    # relative to the SVG one (presumably a bottom-left page origin; confirm).
    # All attribute values are quoted so the markup is also well-formed XML.
    svg = f"""
    <svg viewBox="{x1} {ph - y2} {x2 - x1} {y2 - y1}">
        <image width="{pw}" height="{ph}" href="{page_url}" />
    </svg>
    """
    return svg


def get_page_svg_with_item(document: Dict[str, Any], item: Dict[str, Any]) -> str:
    """
    Generate an SVG which overlays the bounding box of the item on the page image.

    The page number and bounding box are read from the first entry of
    `item["prov"]`. The page size comes from `document["page-dimensions"]`
    and the page image URL from `document["_s3_data"]["pdf-images"]`.

    Returns the SVG markup, or an empty string when the document has no
    dimensions or no image for the page of the item.
    """
    from html import escape

    prov = item["prov"][0]
    page_no = prov["page"]

    doc_page_dims = next(
        (p for p in document["page-dimensions"] if p["page"] == page_no), None
    )
    if doc_page_dims is None:
        return ""

    s3_page_image = next(
        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
    )
    if s3_page_image is None:
        return ""

    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
    x1, y1, x2, y2 = prov["bbox"]

    # Escape the URL so that "&" in a query string or a stray quote cannot
    # break the attribute.
    page_url = escape(str(s3_page_image["url"]))

    # The rectangle's top edge is ph - y2, i.e. the bbox y axis is flipped
    # relative to the SVG one (presumably a bottom-left page origin; confirm).
    # All attribute values are quoted so the markup is also well-formed XML.
    svg = f"""
    <svg viewBox="0 0 {pw} {ph}">
        <image width="{pw}" height="{ph}" href="{page_url}" />
        <rect x="{x1}" y="{ph - y2}" width="{x2 - x1}" height="{y2 - y1}" style="stroke-width:0.03;stroke:rgb(0,0,255);fill-opacity:0" />
    </svg>
    """
    return svg

0 comments on commit 106de7c

Please sign in to comment.