internetarchive · pidgezero-one · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024
diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
@@ -4,6 +4,7 @@
 
 from openlibrary.catalog.utils import author_dates_match, flip_name, key_int
 from openlibrary.core.helpers import extract_year
+from openlibrary.utils import extract_numeric_id_from_olid, uniq
 
 if TYPE_CHECKING:
     from openlibrary.plugins.upstream.models import Author
@@ -144,8 +145,58 @@ def walk_redirects(obj, seen):
             seen.add(obj['key'])
         return obj
 
-    # Try for an 'exact' (case-insensitive) name match, but fall back to alternate_names,
-    # then last name with identical birth and death dates (that are not themselves `None` or '').
+    def get_redirected_authors(authors: list["Author"]):
+        if any(a.type.key != '/type/author' for a in authors):
+            seen: set[dict] = set()
+            all_authors = [
+                walk_redirects(a, seen) for a in authors if a['key'] not in seen
+            ]
+            return all_authors
+        return authors
+
+    # Look for OL ID first.
+    if (key := author.get("ol_id")) and (
+        reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))
+    ):
+        # Always match on OL ID, even if remote identifiers don't match.
+        return get_redirected_authors([web.ctx.site.get(k) for k in reply])
+    # Try other identifiers next.
+    if identifiers := author.get("identifiers"):
+        queries = []
+        matched_authors = []
+        # Get all the authors that match any incoming identifier.
+        for identifier, val in identifiers.items():
+            queries.append({"type": "/type/author", f"remote_ids.{identifier}~": val})
+        for query in queries:
+            if reply := list(web.ctx.site.things(query)):
+                matched_authors.extend(
+                    get_redirected_authors([web.ctx.site.get(k) for k in reply])
+                )
+        matched_authors = uniq(matched_authors)
+        # The match is whichever one has the most identifiers in common AND does not have more conflicts than matched identifiers.
+        highest_matches = 0
+        selected_match = None
+        for a in matched_authors:
+            # merge_remote_ids will be in #10092
+            _, matches = a.merge_remote_ids(identifiers)
+            if matches > highest_matches:
+                selected_match = a
+                highest_matches = matches
+            elif (
+                matches == highest_matches
+                and matches > 0
+                and selected_match is not None
+            ):
+                # Prioritize the lower OL ID when matched identifiers are equal
+                selected_match = (
+                    a
+                    if extract_numeric_id_from_olid(a.key)
+                    < extract_numeric_id_from_olid(selected_match.key)
+                    else selected_match
+                )
+        if highest_matches > 0 and selected_match is not None:
+            return [selected_match]
+    # Fall back to name/date matching, which we did before introducing identifiers.
     name = author["name"].replace("*", r"\*")
     queries = [
         {"type": "/type/author", "name~": name},
@@ -157,37 +208,18 @@ def walk_redirects(obj, seen):
             "death_date~": f"*{extract_year(author.get('death_date', '')) or -1}*",
         },  # Use `-1` to ensure an empty string from extract_year doesn't match empty dates.
     ]
+    things = []
     for query in queries:
         if reply := list(web.ctx.site.things(query)):
+            things = get_redirected_authors([web.ctx.site.get(k) for k in reply])
             break
-
-    authors = [web.ctx.site.get(k) for k in reply]
-    if any(a.type.key != '/type/author' for a in authors):
-        seen: set[dict] = set()
-        authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen]
-    return authors
-
-
-def find_entity(author: dict[str, Any]) -> "Author | None":
-    """
-    Looks for an existing Author record in OL
-    and returns it if found.
-
-    :param dict author: Author import dict {"name": "Some One"}
-    :return: Existing Author record if found, or None.
-    """
-    assert isinstance(author, dict)
-    things = find_author(author)
-    if author.get('entity_type', 'person') != 'person':
-        return things[0] if things else None
     match = []
     seen = set()
     for a in things:
         key = a['key']
         if key in seen:
             continue
         seen.add(key)
-        orig_key = key
         assert a.type.key == '/type/author'
         if 'birth_date' in author and 'birth_date' not in a:
             continue
@@ -197,10 +229,28 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
             continue
         match.append(a)
     if not match:
-        return None
+        return []
     if len(match) == 1:
-        return match[0]
-    return pick_from_matches(author, match)
+        return [match[0]]
+    return [pick_from_matches(author, match)]
+
+
+def find_entity(author: dict[str, Any]) -> "Author | None":
+    """
+    Looks for an existing Author record in OL
+    and returns it if found.
+
+    :param dict author: Author import dict {"name": "Some One"}
+    :return: Existing Author record if found, or None.
+    """
+    assert isinstance(author, dict)
+    things = find_author(author)
+    if "identifiers" in author:
+        for index, t in enumerate(things):
+            # merge_remote_ids will be in #10092
+            t.remote_ids, _ = t.merge_remote_ids(author["identifiers"])
+            things[index] = t
+    return things[0] if things else None
 
 
 def remove_author_honorifics(name: str) -> str:
@@ -248,9 +298,20 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
             new['death_date'] = author['death_date']
         return new
     a = {'type': {'key': '/type/author'}}
-    for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date':
+    for f in (
+        'name',
+        'title',
+        'personal_name',
+        'birth_date',
+        'death_date',
+        'date',
+        'remote_ids',
+    ):
         if f in author:
             a[f] = author[f]
+    # Import record hitting endpoing should list external IDs under "identifiers", but needs to be "remote_ids" when going into the DB.
+    if "identifiers" in author:
+        a["remote_ids"] = author["identifiers"]
     return a
 
 

diff --git a/openlibrary/catalog/add_book/tests/test_load_book.py b/openlibrary/catalog/add_book/tests/test_load_book.py
@@ -137,9 +137,77 @@ def test_author_wildcard_match_with_no_matches_creates_author_with_wildcard(
         new_author_name = import_author(author)
         assert author["name"] == new_author_name["name"]
 
-    def test_first_match_priority_name_and_dates(self, mock_site):
+    def test_first_match_ol_key(self, mock_site):
         """
-        Highest priority match is name, birth date, and death date.
+        Highest priority match is OL key.
+        """
+        self.add_three_existing_authors(mock_site)
+
+        # Author with VIAF
+        author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL3A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "12345678"},
+        }
+
+        # Another author with VIAF
+        author_different_key = {
+            "name": "William Brewer",
+            "key": "/authors/OL4A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "87654321"},
+        }
+
+        mock_site.save(author)
+        mock_site.save(author_different_key)
+
+        # Look for exact match on OL ID, regardless of other fields.
+        # We ideally shouldn't ever have a case where different authors have the same VIAF, but this demonstrates priority.
+        searched_author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL4A",
+            "identifiers": {"viaf": "12345678"},
+        }
+        found = import_author(searched_author)
+        assert found.key == author_different_key["key"]
+
+    def test_second_match_remote_identifier(self, mock_site):
+        """
+        Next highest priority match is any other remote identifier, such as VIAF, Goodreads ID, Amazon ID, etc.
+        """
+        self.add_three_existing_authors(mock_site)
+
+        # Author with VIAF
+        author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL3A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "12345678"},
+        }
+
+        # Another author with VIAF
+        author_different_viaf = {
+            "name": "William Brewer",
+            "key": "/authors/OL4A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "87654321"},
+        }
+
+        mock_site.save(author)
+        mock_site.save(author_different_viaf)
+
+        # Look for exact match on VIAF, regardless of name field.
+        searched_author = {
+            "name": "William Brewer",
+            "identifiers": {"viaf": "12345678"},
+        }
+        found = import_author(searched_author)
+        assert found.key == author["key"]
+
+    def test_third_match_priority_name_and_dates(self, mock_site):
+        """
+        Next highest priority match is name, birth date, and death date.
         """
         self.add_three_existing_authors(mock_site)
 
@@ -202,7 +270,7 @@ def test_non_matching_birth_death_creates_new_author(self, mock_site):
         assert isinstance(found, dict)
         assert found["death_date"] == searched_and_not_found_author["death_date"]
 
-    def test_second_match_priority_alternate_names_and_dates(self, mock_site):
+    def test_match_priority_alternate_names_and_dates(self, mock_site):
         """
         Matching, as a unit, alternate name, birth date, and death date, get
         second match priority.

diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py
@@ -32,6 +32,7 @@
 from openlibrary.core.ratings import Ratings
 from openlibrary.core.vendors import get_amazon_metadata
 from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
+from openlibrary.plugins.upstream.utils import get_author_config
 from openlibrary.utils import extract_numeric_id_from_olid
 from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13
 
@@ -807,6 +808,31 @@ def get_edition_count(self):
     def get_lists(self, limit=50, offset=0, sort=True):
         return self._get_lists(limit=limit, offset=offset, sort=sort)
 
+    def merge_remote_ids(
+        self, incoming_ids: dict[str, str]
+    ) -> tuple[dict[str, str], int]:
+        """Returns the author's remote IDs merged with a given remote IDs object, as well as a count for how many IDs had conflicts.
+        If incoming_ids is empty, or if there are more conflicts than matches, no merge will be attempted, and the output will be (author.remote_ids, -1).
+        """
+        output = {**self.remote_ids}
+        if len(incoming_ids.items()) == 0:
+            return output, -1
+        # Count
+        matches = 0
+        conflicts = 0
+        for id in get_author_config():
+            identifier: str = id.name
+            if identifier in output and identifier in incoming_ids:
+                if output[identifier] != incoming_ids[identifier]:
+                    conflicts = conflicts + 1
+                else:
+                    output[identifier] = incoming_ids[identifier]
+                    matches = matches + 1
+        if conflicts > matches:
+            # TODO: Raise this to librarians, somehow.
+            return self.remote_ids, -1
+        return output, matches
+
 
 class User(Thing):
     DEFAULT_PREFERENCES = {

diff --git a/openlibrary/plugins/importapi/import_edition_builder.py b/openlibrary/plugins/importapi/import_edition_builder.py
@@ -100,10 +100,15 @@ def add_list(self, key, val):
             self.edition_dict[key] = [val]
 
     def add_author(self, key, val):
-        # We don't know birth_date or death_date.
-        # Should name and personal_name be the same value?
-        author_dict = {'personal_name': val, 'name': val, 'entity_type': 'person'}
-        self.add_list('authors', author_dict)
+        if isinstance(val, dict):
+            author_dict = val
+            if "name" in author_dict:
+                author_dict['personal_name'] = author_dict['name']
+            self.add_list('authors', author_dict)
+        else:
+            self.add_list(
+                'authors', {'personal_name': val, 'name': val, 'entity_type': 'person'}
+            )
 
     def add_illustrator(self, key, val):
         self.add_list('contributions', val + ' (Illustrator)')