-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: partial db lookup, document entities lookup and render (#145)
New features - Lookup entities in documents - Make cropped SVG of figures - Make SVG of a document page with a bbox overlay - Expose partial lookup in patents DB --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
- Loading branch information
1 parent
0073b37
commit 106de7c
Showing
4 changed files
with
140 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from typing import Any, Dict, List | ||
|
||
from pydantic import BaseModel | ||
|
||
|
||
def _resolve_item(item, doc): | ||
""" | ||
Return the resolved item in the document. | ||
If the item is referencing another part of the document, it will be retrieved. | ||
""" | ||
|
||
# TODO: improve the function for handling proper json path (not only two segments) | ||
try: | ||
if "$ref" in item: | ||
parts = item["$ref"].split("/") | ||
citem = doc[parts[1]][int(parts[2])] | ||
elif "__ref" in item: | ||
parts = item["__ref"].split("/") | ||
citem = doc[parts[1]][int(parts[2])] | ||
else: | ||
citem = item | ||
|
||
return citem | ||
|
||
except KeyError: | ||
return None | ||
|
||
|
||
class EntitiesLookup:
    """
    Lookup table of the entities found in the ``main-text`` items of a document.

    The table is built once at construction time and maps
    ``entity_type -> entity_instance -> [matched entries]``.
    """

    class _MatchedEntry(BaseModel):
        # Position of the item in the document's "main-text" list.
        index: int
        # Path of the item in the document, formatted as "main-text.<index>".
        doc_path: str
        # Value of the "type" field of the resolved item.
        type: str
        # The resolved item itself.
        content: Dict[str, Any]

    def __init__(self, document: Dict[str, Any]):
        """
        Store the document and build the lookup structure.

        Args:
            document: The document; it must contain a ``"main-text"`` list.
        """
        self.document = document
        self._lookup: Dict[
            str, Dict[str, List[EntitiesLookup._MatchedEntry]]
        ] = {}  # {entity_type: {entity_instance: [reference, ...]}}
        self._build()

    def _build(self):
        """
        Build the internal lookup structure for entities in a document.
        """
        for ix, item in enumerate(self.document["main-text"]):
            item = _resolve_item(item, self.document)
            # _resolve_item returns None for references it cannot resolve;
            # such items carry no entities and must not abort the build.
            if item is None or "entities" not in item:
                continue

            me = EntitiesLookup._MatchedEntry(
                index=ix,
                doc_path=f"main-text.{ix}",
                type=item["type"],
                content=item,
            )

            for entity_type, entities in item["entities"].items():
                by_instance = self._lookup.setdefault(entity_type, {})

                for entity in entities:
                    # Index the entry under both the "match" and the
                    # "original" value; the set avoids registering the same
                    # entry twice when the two values are identical.
                    for match in {entity["match"], entity["original"]}:
                        by_instance.setdefault(match, []).append(me)

    def get(self, *, entity_type: str, entity: str) -> List[_MatchedEntry]:
        """
        Lookup where a given entity is mentioned in a document.

        Args:
            entity_type: The type of entity to search for.
            entity: The entity instance, compared against the indexed
                "match" and "original" values.

        Returns:
            The matched entries, or an empty list when the type or the
            instance is unknown.
        """
        return self._lookup.get(entity_type, {}).get(entity, [])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
from typing import Any, Dict | ||
|
||
|
||
def get_figure_svg(document: Dict[str, Any], figure: Dict[str, Any]):
    """
    Generate an SVG which crops the figure from the image of the document page.

    The page and bounding box are taken from the first provenance entry of
    the figure (``figure["prov"][0]``). The SVG ``viewBox`` is set to the
    bounding box, so only that region of the page image is visible.

    Args:
        document: The document, providing ``"page-dimensions"`` and
            ``"_s3_data"["pdf-images"]`` entries keyed by ``"page"``.
        figure: The figure item to crop.

    Returns:
        The SVG markup as a string, or ``""`` when the page dimensions or
        the page image of the figure's page are not available.
    """
    # Local import so the block does not depend on file-level imports.
    from html import escape

    prov = figure["prov"][0]
    page_no = prov["page"]

    doc_page_dims = next(
        (p for p in document["page-dimensions"] if p["page"] == page_no), None
    )
    if doc_page_dims is None:
        return ""

    s3_page_image = next(
        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
    )
    if s3_page_image is None:
        return ""

    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
    x1, y1, x2, y2 = prov["bbox"]

    # Escape the URL: query strings commonly contain "&", which is not
    # allowed unescaped inside an XML attribute value.
    page_url = escape(s3_page_image["url"], quote=True)

    # The top edge is computed as ph - y2, i.e. the bbox y values are
    # flipped with respect to the SVG axis, which grows downwards.
    svg = f"""
    <svg viewBox="{x1} {ph - y2} {x2 - x1} {y2 - y1}">
        <image width="{pw}" height="{ph}" href="{page_url}" />
    </svg>
    """
    return svg
|
||
|
||
def get_page_svg_with_item(document: Dict[str, Any], item: Dict[str, Any]):
    """
    Generate an SVG which overlays the bounding-box of the item with the image of the page.

    The page and bounding box are taken from the first provenance entry of
    the item (``item["prov"][0]``). The SVG shows the full page image with a
    blue, unfilled rectangle drawn at the bounding box.

    Args:
        document: The document, providing ``"page-dimensions"`` and
            ``"_s3_data"["pdf-images"]`` entries keyed by ``"page"``.
        item: The document item to highlight.

    Returns:
        The SVG markup as a string, or ``""`` when the page dimensions or
        the page image of the item's page are not available.
    """
    # Local import so the block does not depend on file-level imports.
    from html import escape

    prov = item["prov"][0]
    page_no = prov["page"]

    doc_page_dims = next(
        (p for p in document["page-dimensions"] if p["page"] == page_no), None
    )
    if doc_page_dims is None:
        return ""

    s3_page_image = next(
        (p for p in document["_s3_data"]["pdf-images"] if p["page"] == page_no), None
    )
    if s3_page_image is None:
        return ""

    pw, ph = doc_page_dims["width"], doc_page_dims["height"]
    x1, y1, x2, y2 = prov["bbox"]

    # Escape the URL: query strings commonly contain "&", which is not
    # allowed unescaped inside an XML attribute value.
    page_url = escape(s3_page_image["url"], quote=True)

    # The rectangle's top edge is ph - y2, i.e. the bbox y values are
    # flipped with respect to the SVG axis, which grows downwards.
    svg = f"""
    <svg viewBox="0 0 {pw} {ph}">
        <image width="{pw}" height="{ph}" href="{page_url}" />
        <rect x="{x1}" y="{ph - y2}" width="{x2 - x1}" height="{y2 - y1}" style="stroke-width:0.03;stroke:rgb(0,0,255);fill-opacity:0" />
    </svg>
    """
    return svg