Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] feat: use author known identifiers in import API #10110

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
115 changes: 88 additions & 27 deletions openlibrary/catalog/add_book/load_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from openlibrary.catalog.utils import author_dates_match, flip_name, key_int
from openlibrary.core.helpers import extract_year
from openlibrary.utils import extract_numeric_id_from_olid, uniq

if TYPE_CHECKING:
from openlibrary.plugins.upstream.models import Author
Expand Down Expand Up @@ -144,8 +145,58 @@ def walk_redirects(obj, seen):
seen.add(obj['key'])
return obj

# Try for an 'exact' (case-insensitive) name match, but fall back to alternate_names,
# then last name with identical birth and death dates (that are not themselves `None` or '').
def get_redirected_authors(authors: list["Author"]):
if any(a.type.key != '/type/author' for a in authors):
seen: set[dict] = set()
all_authors = [
walk_redirects(a, seen) for a in authors if a['key'] not in seen
]
return all_authors
return authors

# Look for OL ID first.
if (key := author.get("ol_id")) and (
reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))
):
# Always match on OL ID, even if remote identifiers don't match.
return get_redirected_authors([web.ctx.site.get(k) for k in reply])
# Try other identifiers next.
if identifiers := author.get("identifiers"):
queries = []
matched_authors = []
# Get all the authors that match any incoming identifier.
for identifier, val in identifiers.items():
queries.append({"type": "/type/author", f"remote_ids.{identifier}~": val})
for query in queries:
if reply := list(web.ctx.site.things(query)):
matched_authors.extend(
get_redirected_authors([web.ctx.site.get(k) for k in reply])
)
matched_authors = uniq(matched_authors)
# The match is whichever one has the most identifiers in common AND does not have more conflicts than matched identifiers.
highest_matches = 0
selected_match = None
for a in matched_authors:
# merge_remote_ids will be in #10092
_, matches = a.merge_remote_ids(identifiers)
if matches > highest_matches:
selected_match = a
highest_matches = matches
elif (
matches == highest_matches
and matches > 0
and selected_match is not None
):
# Prioritize the lower OL ID when matched identifiers are equal
selected_match = (
a
if extract_numeric_id_from_olid(a.key)
< extract_numeric_id_from_olid(selected_match.key)
else selected_match
)
if highest_matches > 0 and selected_match is not None:
return [selected_match]
# Fall back to name/date matching, which we did before introducing identifiers.
name = author["name"].replace("*", r"\*")
queries = [
{"type": "/type/author", "name~": name},
Expand All @@ -157,37 +208,18 @@ def walk_redirects(obj, seen):
"death_date~": f"*{extract_year(author.get('death_date', '')) or -1}*",
}, # Use `-1` to ensure an empty string from extract_year doesn't match empty dates.
]
things = []
for query in queries:
if reply := list(web.ctx.site.things(query)):
things = get_redirected_authors([web.ctx.site.get(k) for k in reply])
break

authors = [web.ctx.site.get(k) for k in reply]
if any(a.type.key != '/type/author' for a in authors):
seen: set[dict] = set()
authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen]
return authors


def find_entity(author: dict[str, Any]) -> "Author | None":
"""
Looks for an existing Author record in OL
and returns it if found.

:param dict author: Author import dict {"name": "Some One"}
:return: Existing Author record if found, or None.
"""
assert isinstance(author, dict)
things = find_author(author)
if author.get('entity_type', 'person') != 'person':
return things[0] if things else None
match = []
seen = set()
for a in things:
key = a['key']
if key in seen:
continue
seen.add(key)
orig_key = key
assert a.type.key == '/type/author'
if 'birth_date' in author and 'birth_date' not in a:
continue
Expand All @@ -197,10 +229,28 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
continue
match.append(a)
if not match:
return None
return []
if len(match) == 1:
return match[0]
return pick_from_matches(author, match)
return [match[0]]
return [pick_from_matches(author, match)]


def find_entity(author: dict[str, Any]) -> "Author | None":
"""
Looks for an existing Author record in OL
and returns it if found.

:param dict author: Author import dict {"name": "Some One"}
:return: Existing Author record if found, or None.
"""
assert isinstance(author, dict)
things = find_author(author)
if "identifiers" in author:
for index, t in enumerate(things):
# merge_remote_ids will be in #10092
t.remote_ids, _ = t.merge_remote_ids(author["identifiers"])
things[index] = t
return things[0] if things else None


def remove_author_honorifics(name: str) -> str:
Expand Down Expand Up @@ -248,9 +298,20 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
new['death_date'] = author['death_date']
return new
a = {'type': {'key': '/type/author'}}
for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date':
for f in (
'name',
'title',
'personal_name',
'birth_date',
'death_date',
'date',
'remote_ids',
):
if f in author:
a[f] = author[f]
# Import record hitting endpoing should list external IDs under "identifiers", but needs to be "remote_ids" when going into the DB.
if "identifiers" in author:
a["remote_ids"] = author["identifiers"]
return a


Expand Down
74 changes: 71 additions & 3 deletions openlibrary/catalog/add_book/tests/test_load_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,77 @@ def test_author_wildcard_match_with_no_matches_creates_author_with_wildcard(
new_author_name = import_author(author)
assert author["name"] == new_author_name["name"]

def test_first_match_priority_name_and_dates(self, mock_site):
def test_first_match_ol_key(self, mock_site):
"""
Highest priority match is name, birth date, and death date.
Highest priority match is OL key.
"""
self.add_three_existing_authors(mock_site)

# Author with VIAF
author = {
"name": "William H. Brewer",
"key": "/authors/OL3A",
"type": {"key": "/type/author"},
"remote_ids": {"viaf": "12345678"},
}

# Another author with VIAF
author_different_key = {
"name": "William Brewer",
"key": "/authors/OL4A",
"type": {"key": "/type/author"},
"remote_ids": {"viaf": "87654321"},
}

mock_site.save(author)
mock_site.save(author_different_key)

# Look for exact match on OL ID, regardless of other fields.
# We ideally shouldn't ever have a case where different authors have the same VIAF, but this demonstrates priority.
searched_author = {
"name": "William H. Brewer",
"key": "/authors/OL4A",
"identifiers": {"viaf": "12345678"},
}
found = import_author(searched_author)
assert found.key == author_different_key["key"]

def test_second_match_remote_identifier(self, mock_site):
"""
Next highest priority match is any other remote identifier, such as VIAF, Goodreads ID, Amazon ID, etc.
"""
self.add_three_existing_authors(mock_site)

# Author with VIAF
author = {
"name": "William H. Brewer",
"key": "/authors/OL3A",
"type": {"key": "/type/author"},
"remote_ids": {"viaf": "12345678"},
}

# Another author with VIAF
author_different_viaf = {
"name": "William Brewer",
"key": "/authors/OL4A",
"type": {"key": "/type/author"},
"remote_ids": {"viaf": "87654321"},
}

mock_site.save(author)
mock_site.save(author_different_viaf)

# Look for exact match on VIAF, regardless of name field.
searched_author = {
"name": "William Brewer",
"identifiers": {"viaf": "12345678"},
}
found = import_author(searched_author)
assert found.key == author["key"]

def test_third_match_priority_name_and_dates(self, mock_site):
"""
Next highest priority match is name, birth date, and death date.
"""
self.add_three_existing_authors(mock_site)

Expand Down Expand Up @@ -202,7 +270,7 @@ def test_non_matching_birth_death_creates_new_author(self, mock_site):
assert isinstance(found, dict)
assert found["death_date"] == searched_and_not_found_author["death_date"]

def test_second_match_priority_alternate_names_and_dates(self, mock_site):
def test_match_priority_alternate_names_and_dates(self, mock_site):
"""
Matching, as a unit, alternate name, birth date, and death date, get
second match priority.
Expand Down
26 changes: 26 additions & 0 deletions openlibrary/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from openlibrary.core.ratings import Ratings
from openlibrary.core.vendors import get_amazon_metadata
from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
from openlibrary.plugins.upstream.utils import get_author_config
from openlibrary.utils import extract_numeric_id_from_olid
from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13

Expand Down Expand Up @@ -807,6 +808,31 @@ def get_edition_count(self):
def get_lists(self, limit=50, offset=0, sort=True):
return self._get_lists(limit=limit, offset=offset, sort=sort)

def merge_remote_ids(
self, incoming_ids: dict[str, str]
) -> tuple[dict[str, str], int]:
"""Returns the author's remote IDs merged with a given remote IDs object, as well as a count for how many IDs had conflicts.
If incoming_ids is empty, or if there are more conflicts than matches, no merge will be attempted, and the output will be (author.remote_ids, -1).
"""
output = {**self.remote_ids}
if len(incoming_ids.items()) == 0:
return output, -1
# Count
matches = 0
conflicts = 0
for id in get_author_config():
identifier: str = id.name
if identifier in output and identifier in incoming_ids:
if output[identifier] != incoming_ids[identifier]:
conflicts = conflicts + 1
else:
output[identifier] = incoming_ids[identifier]
matches = matches + 1
if conflicts > matches:
# TODO: Raise this to librarians, somehow.
return self.remote_ids, -1
return output, matches


class User(Thing):
DEFAULT_PREFERENCES = {
Expand Down
13 changes: 9 additions & 4 deletions openlibrary/plugins/importapi/import_edition_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,15 @@ def add_list(self, key, val):
self.edition_dict[key] = [val]

def add_author(self, key, val):
# We don't know birth_date or death_date.
# Should name and personal_name be the same value?
author_dict = {'personal_name': val, 'name': val, 'entity_type': 'person'}
self.add_list('authors', author_dict)
if isinstance(val, dict):
author_dict = val
if "name" in author_dict:
author_dict['personal_name'] = author_dict['name']
self.add_list('authors', author_dict)
else:
self.add_list(
'authors', {'personal_name': val, 'name': val, 'entity_type': 'person'}
)

def add_illustrator(self, key, val):
self.add_list('contributions', val + ' (Illustrator)')
Expand Down
Loading