Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: partial db lookup, document entities lookup and render #145

Merged
merged 2 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions deepsearch/chemistry/queries/molecules.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def MoleculeQuery(
def MoleculesInPatentsQuery(
patents: Union[str, List[str]],
num_items: int = 10,
partial_lookup: bool = False,
) -> Query:
"""
List all molecules contained in a list of patents.
Expand All @@ -117,6 +118,7 @@ def MoleculesInPatentsQuery(
for v in patents
]
},
"partial_references": partial_lookup,
"limit": num_items,
},
coordinates=KnowledgeDbResource(),
Expand Down
23 changes: 1 addition & 22 deletions deepsearch/documents/core/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,7 @@

from tabulate import tabulate


def _resolve_item(item, doc):
"""
Return the resolved item in the document.
If the item is referencing another part of the document, it will be retrieved.
"""

# TODO: improve the function for handling proper json path (not only two segments)
try:
if "$ref" in item:
parts = item["$ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
elif "__ref" in item:
parts = item["__ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
else:
citem = item

return citem

except KeyError:
return None
from deepsearch.documents.core.lookup import _resolve_item


def export_to_markdown(document: Dict[str, Any]) -> str:
Expand Down
75 changes: 75 additions & 0 deletions deepsearch/documents/core/lookup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Any, Dict, List

from pydantic import BaseModel


def _resolve_item(item, doc):
"""
Return the resolved item in the document.
If the item is referencing another part of the document, it will be retrieved.
"""

# TODO: improve the function for handling proper json path (not only two segments)
try:
if "$ref" in item:
parts = item["$ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
elif "__ref" in item:
parts = item["__ref"].split("/")
citem = doc[parts[1]][int(parts[2])]
else:
citem = item

return citem

except KeyError:
return None


class EntitiesLookup:
    """
    Index of the entities annotated on the main-text items of a document.

    The index is built once at construction time from ``document["main-text"]``
    and maps ``entity_type -> entity string -> list of matched entries``.
    Each entity is indexed under both its "match" and its "original" value.
    """

    class _MatchedEntry(BaseModel):
        # Position of the item within document["main-text"].
        index: int
        # Dotted path of the item, formatted as "main-text.<index>".
        doc_path: str
        # Value of the resolved item's "type" field.
        type: str
        # The resolved item itself.
        content: Dict[str, Any]

    def __init__(self, document: Dict[str, Any]):
        """
        Store the document and build the lookup structure from it.
        """
        self.document = document
        self._lookup: Dict[
            str, Dict[str, List[EntitiesLookup._MatchedEntry]]
        ] = {}  # {entity_type: {entity_instance: [reference, ...]}}
        self._build()

    def _build(self):
        """
        Build the internal lookup structure for entities in a document.
        """
        for ix, item in enumerate(self.document["main-text"]):
            item = _resolve_item(item, self.document)
            # _resolve_item returns None for a reference it cannot resolve;
            # such items are skipped instead of failing the whole build.
            if item is None or "entities" not in item:
                continue

            me = EntitiesLookup._MatchedEntry(
                index=ix,
                doc_path=f"main-text.{ix}",
                type=item["type"],
                content=item,
            )

            for entity_type, entities in item["entities"].items():
                by_entity = self._lookup.setdefault(entity_type, {})

                for entity in entities:
                    # A set so that an entity whose "match" equals its
                    # "original" is registered only once for this item.
                    for match in {entity["match"], entity["original"]}:
                        by_entity.setdefault(match, []).append(me)

    def get(self, *, entity_type: str, entity: str) -> List[_MatchedEntry]:
        """
        Lookup where a given entity is mentioned in a document.

        Returns the matched entries recorded for ``entity`` under
        ``entity_type``, or an empty list when either is unknown.
        """
        return self._lookup.get(entity_type, {}).get(entity, [])
62 changes: 62 additions & 0 deletions deepsearch/documents/core/render.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from typing import Any, Dict


def get_figure_svg(document: Dict[str, Any], figure: Dict[str, Any]):
    """
    Generates a SVG which crops the figure from the image of the document page.

    The page number and bounding box are read from ``figure["prov"][0]``, the
    page size from ``document["page-dimensions"]`` and the page image URL from
    ``document["_s3_data"]["pdf-images"]``.

    Returns the SVG markup as a string, or "" when the page has no entry in
    the page dimensions or in the page images.
    """
    # Local import so the block does not depend on the module's import list.
    from html import escape

    page_no = figure["prov"][0]["page"]

    doc_page_dims = next(
        (p for p in document["page-dimensions"] if p["page"] == page_no), None
    )
    if doc_page_dims is None:
        return ""

    s3_page_image = next(
        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
    )
    if s3_page_image is None:
        return ""

    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
    x1, y1, x2, y2 = figure["prov"][0]["bbox"]

    # Escape the URL for use inside a double-quoted attribute: a raw "&" or
    # '"' (e.g. in a query string) would otherwise produce broken markup.
    page_url = escape(str(s3_page_image["url"]), quote=True)

    # The viewBox y origin is computed as `ph - y2`; presumably the bbox uses
    # a bottom-left origin while SVG uses top-left -- confirm against the
    # document schema.
    # All attribute values are quoted: unquoted values are not well-formed XML.
    svg = f"""
    <svg viewBox="{x1} {ph - y2} {x2 - x1} {y2 - y1}">
        <image width="{pw}" height="{ph}" href="{page_url}" />
    </svg>
    """
    return svg


def get_page_svg_with_item(document: Dict[str, Any], item: Dict[str, Any]):
    """
    Generates a SVG which overlays the bounding-box of the item with the image of the page.

    The page number and bounding box are read from ``item["prov"][0]``, the
    page size from ``document["page-dimensions"]`` and the page image URL from
    ``document["_s3_data"]["pdf-images"]``.

    Returns the SVG markup as a string, or "" when the page has no entry in
    the page dimensions or in the page images.
    """
    # Local import so the block does not depend on the module's import list.
    from html import escape

    page_no = item["prov"][0]["page"]

    doc_page_dims = next(
        (p for p in document["page-dimensions"] if p["page"] == page_no), None
    )
    if doc_page_dims is None:
        return ""

    s3_page_image = next(
        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
    )
    if s3_page_image is None:
        return ""

    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
    x1, y1, x2, y2 = item["prov"][0]["bbox"]

    # Escape the URL for use inside a double-quoted attribute: a raw "&" or
    # '"' (e.g. in a query string) would otherwise produce broken markup.
    page_url = escape(str(s3_page_image["url"]), quote=True)

    # The rectangle's y is computed as `ph - y2`; presumably the bbox uses a
    # bottom-left origin while SVG uses top-left -- confirm against the
    # document schema.
    # All attribute values are quoted: unquoted values are not well-formed XML.
    svg = f"""
    <svg viewBox="0 0 {pw} {ph}">
        <image width="{pw}" height="{ph}" href="{page_url}" />
        <rect x="{x1}" y="{ph - y2}" width="{x2 - x1}" height="{y2 - y1}" style="stroke-width:0.03;stroke:rgb(0,0,255);fill-opacity:0" />
    </svg>
    """
    return svg