imports: add importer for ISBNdb #8511

Merged · 13 commits · Nov 18, 2023

Changes from all commits

2 changes: 2 additions & 0 deletions compose.override.yaml
@@ -21,6 +21,8 @@ services:
# The above volume mounts are required so that the local dev bind mount below
# does not clobber the data generated inside the image / container
- .:/openlibrary
environment:
- LOCAL_DEV=true
depends_on:
- db
- infobase
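For context, the diff only adds the flag; nothing here shows how LOCAL_DEV is consumed, so the snippet below is a hypothetical sketch of a typical reader:

import os

# Hypothetical consumer: treat the compose-provided flag as a boolean.
LOCAL_DEV = os.environ.get("LOCAL_DEV", "").lower() in ("1", "true", "yes")

if LOCAL_DEV:
    # e.g. enable dev-only behaviour such as verbose logging
    print("local development mode")
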
22 changes: 17 additions & 5 deletions openlibrary/catalog/add_book/__init__.py
@@ -765,11 +765,12 @@ def load_data(
def normalize_import_record(rec: dict) -> None:
"""
Normalize the import record by:
- - Verifying required fields
- - Ensuring source_records is a list
- - Splitting subtitles out of the title field
- - Cleaning all ISBN and LCCN fields ('bibids'), and
- - Deduplicate authors.
+ - Verifying required fields;
+ - Ensuring source_records is a list;
+ - Splitting subtitles out of the title field;
+ - Cleaning all ISBN and LCCN fields ('bibids');
+ - Deduplicating authors; and
+ - Removing throw-away data used for validation.

NOTE: This function modifies the passed-in rec in place.
"""
@@ -801,6 +802,17 @@ def normalize_import_record(rec: dict) -> None:
# deduplicate authors
rec['authors'] = uniq(rec.get('authors', []), dicthash)

# Validation by parse_data(), prior to calling load(), requires facially
# valid publishers, authors, and publish_date. If data are unavailable, we
# provide throw-away data which validates. We use ["????"] as an override,
# but this must be removed prior to import.
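# For example (values illustrative), a staged record such as
#     {'title': 'A Title', 'publishers': ['????'], 'publish_date': '????'}
# leaves this block as just {'title': 'A Title'}.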
if rec.get('publishers') == ["????"]:
rec.pop('publishers')
if rec.get('authors') == [{"name": "????"}]:
rec.pop('authors')
if rec.get('publish_date') == "????":
rec.pop('publish_date')


def validate_record(rec: dict) -> None:
"""
35 changes: 35 additions & 0 deletions openlibrary/catalog/add_book/tests/test_add_book.py
@@ -1475,3 +1475,38 @@ def test_future_publication_dates_are_deleted(self, year, expected):
normalize_import_record(rec=rec)
result = 'publish_date' in rec
assert result == expected

@pytest.mark.parametrize(
'rec, expected',
[
(
{
'title': 'first title',
'source_records': ['ia:someid'],
'publishers': ['????'],
'authors': [{'name': '????'}],
'publish_date': '????',
},
{'title': 'first title', 'source_records': ['ia:someid']},
),
(
{
'title': 'second title',
'source_records': ['ia:someid'],
'publishers': ['a publisher'],
'authors': [{'name': 'an author'}],
'publish_date': '2000',
},
{
'title': 'second title',
'source_records': ['ia:someid'],
'publishers': ['a publisher'],
'authors': [{'name': 'an author'}],
'publish_date': '2000',
},
),
],
)
def test_dummy_data_to_satisfy_parse_data_is_removed(self, rec, expected):
normalize_import_record(rec=rec)
assert rec == expected
117 changes: 114 additions & 3 deletions openlibrary/core/imports.py
@@ -1,7 +1,8 @@
"""Interface to import queue.
"""
from collections import defaultdict
- from typing import Any
+ from collections.abc import Iterable
+ from typing import TYPE_CHECKING, Any, Final

import logging
import datetime
@@ -10,14 +11,23 @@
import json

from psycopg2.errors import UndefinedTable, UniqueViolation
from pydantic import ValidationError
from web.db import ResultSet
from web.utils import Storage

from . import db

import contextlib
from openlibrary.catalog import add_book
from openlibrary.core import cache

logger = logging.getLogger("openlibrary.imports")

STAGED_SOURCES: Final = ('amazon', 'idb')

if TYPE_CHECKING:
from openlibrary.core.models import Edition


class Batch(web.storage):
@staticmethod
@@ -64,6 +74,7 @@ def normalize_items(self, items):
'batch_id': self.id,
# Partner bots set ia_id to eg "partner:978..."
'ia_id': item.get('ia_id'),
'status': item.get('status', 'pending'),
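# Status now flows through from the caller: a bulk source (e.g. the new
# ISBNdb importer this PR supports) can pass status='staged' to queue
# records for later on-demand import instead of the default 'pending'.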
'data': json.dumps(item.get('data'), sort_keys=True)
if item.get('data')
else None,
@@ -107,8 +118,108 @@ def get_items(self, status="pending"):
class ImportItem(web.storage):
@staticmethod
def find_pending(limit=1000):
- result = db.where("import_item", status="pending", order="id", limit=limit)
- return map(ImportItem, result)
+ if result := db.where("import_item", status="pending", order="id", limit=limit):
+ return map(ImportItem, result)
+
+ return None

@staticmethod
def find_staged_or_pending(
identifiers: Iterable[str], sources: Iterable[str] = STAGED_SOURCES
) -> ResultSet:
"""
Find staged or pending items in import_item matching the ia_id identifiers.

Given a list of ISBNs as identifiers, this creates a list of `ia_ids`
and queries the import_item table for them.

Generated `ia_ids` have the form `{source}:{identifier}` for each `source`
in `sources` and `identifier` in `identifiers`.
"""
ia_ids = [
f"{source}:{identifier}" for identifier in identifiers for source in sources
]

query = (
"SELECT * "
"FROM import_item "
"WHERE status IN ('staged', 'pending') "
"AND ia_id IN $ia_ids"
)
return db.query(query, vars={'ia_ids': ia_ids})

@staticmethod
def import_first_staged(
identifiers: list[str], sources: Iterable[str] = STAGED_SOURCES
) -> "Edition | None":
"""
Import the first staged item in import_item matching the ia_id identifiers.

This changes the status of matching ia_id identifiers to prevent a
race condition that can result in duplicate imports.
"""
ia_ids = [
f"{source}:{identifier}" for identifier in identifiers for source in sources
]

query_start_processing = (
"UPDATE import_item "
"SET status = 'processing' "
"WHERE status = 'staged' "
"AND ia_id IN $ia_ids "
"RETURNING *"
)
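# The single UPDATE ... RETURNING statement claims matching rows and
# returns them atomically, so two workers racing on the same ia_ids
# cannot both import the same staged record.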

# TODO: Would it be better to update by the specific ID, given we have
# the IDs? If this approach works generally, it could work for both
# `staged` and `pending` by making a dictionary of the original `status`
# values and restoring them all by `id`, save for the one upon which
# import was attempted.
query_finish_processing = (
"UPDATE import_item "
"SET status = 'staged' "
"WHERE status = 'processing' "
"AND ia_id IN $ia_ids"
)

if in_process_items := db.query(
query_start_processing, vars={'ia_ids': ia_ids}
):
item: ImportItem = ImportItem(in_process_items[0])
try:
return item.single_import()
except Exception: # noqa: BLE001
return None
finally:
db.query(query_finish_processing, vars={'ia_ids': ia_ids})

return None

def single_import(self) -> "Edition | None":
"""Import the item using load(), swallow errors, update status, and return the Edition if any."""
try:
# Avoids a circular import issue.
from openlibrary.plugins.importapi.code import parse_data

edition, _ = parse_data(self.data.encode('utf-8'))
if edition:
reply = add_book.load(edition)
if reply.get('success') and 'edition' in reply:
edition = reply['edition']
self.set_status(edition['status'], ol_key=edition['key']) # type: ignore[index]
return web.ctx.site.get(edition['key']) # type: ignore[index]
else:
error_code = reply.get('error_code', 'unknown-error')
self.set_status("failed", error=error_code)

except ValidationError:
self.set_status("failed", error="invalid-value")
return None
except Exception: # noqa: BLE001
self.set_status("failed", error="unknown-error")
return None

return None

@staticmethod
def find_by_identifier(identifier):
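Taken together, the new methods give callers a two-step staged-import flow. A minimal sketch (the identifiers are invented, and this assumes the usual Open Library web/db runtime context):

from openlibrary.core.imports import ImportItem

isbns = ["9781558595880", "1558595880"]  # hypothetical ISBN-13/ISBN-10 pair

# 1. Is anything for these identifiers already queued?
queued = ImportItem.find_staged_or_pending(identifiers=isbns)

# 2. If a staged record exists, claim it and import it on demand;
#    returns the imported Edition, or None.
edition = ImportItem.import_first_staged(identifiers=isbns)
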
38 changes: 21 additions & 17 deletions openlibrary/core/models.py
@@ -2,6 +2,7 @@
"""
from datetime import datetime, timedelta
import logging

import web
import json
import requests
@@ -15,20 +16,23 @@

# TODO: fix this. openlibrary.core should not import plugins.
from openlibrary import accounts
- from openlibrary.utils import extract_numeric_id_from_olid
- from openlibrary.core.helpers import private_collection_in
- from openlibrary.core.bookshelves import Bookshelves
+ from openlibrary.catalog import add_book
from openlibrary.core.booknotes import Booknotes
+ from openlibrary.core.bookshelves import Bookshelves
+ from openlibrary.core.helpers import private_collection_in
+ from openlibrary.core.imports import ImportItem
from openlibrary.core.observations import Observations
from openlibrary.core.ratings import Ratings
- from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical
from openlibrary.core.vendors import create_edition_from_amazon_metadata
+ from openlibrary.utils import extract_numeric_id_from_olid
+ from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical

# Seed might look unused, but removing it causes an error :/
from openlibrary.core.lists.model import ListMixin, Seed
from . import cache, waitinglist

import urllib
from pydantic import ValidationError

from .ia import get_metadata_direct
from .waitinglist import WaitingLoan
@@ -366,11 +370,12 @@ def get_ia_download_link(self, suffix):
return f"https://archive.org/download/{self.ocaid}/{filename}"

@classmethod
- def from_isbn(cls, isbn: str, retry: bool = False):
- """Attempts to fetch an edition by isbn, or if no edition is found,
- attempts to import from amazon
- :rtype: edition|None
- :return: an open library work for this isbn
+ def from_isbn(cls, isbn: str) -> "Edition | None":  # type: ignore[return]
+ """
+ Attempt to fetch an edition by ISBN; if no edition is found, check the
+ import_item table for a match; as a last resort, attempt to import
+ from Amazon.
+ :return: an Open Library edition for this ISBN, or None.
"""
isbn = canonical(isbn)

@@ -381,23 +386,22 @@ def from_isbn(cls, isbn: str, retry: bool = False):
if isbn13 is None:
return None # consider raising ValueError
isbn10 = isbn_13_to_isbn_10(isbn13)
isbns = [isbn13, isbn10] if isbn10 is not None else [isbn13]

# Attempt to fetch book from OL
- for isbn in [isbn13, isbn10]:
+ for isbn in isbns:
if isbn:
matches = web.ctx.site.things(
{"type": "/type/edition", 'isbn_%s' % len(isbn): isbn}
)
if matches:
return web.ctx.site.get(matches[0])

- # Attempt to create from amazon, then fetch from OL
- retries = 5 if retry else 0
- key = (isbn10 or isbn13) and create_edition_from_amazon_metadata(
- isbn10 or isbn13, retries=retries
- )
- if key:
- return web.ctx.site.get(key)
+ # Attempt to fetch the book from the import_item table
+ if result := ImportItem.import_first_staged(identifiers=isbns):
+ return result
+
+ # TODO: Final step - call affiliate server, with retry code migrated there.

def is_ia_scan(self):
metadata = self.get_ia_meta_fields()
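With the retry parameter gone, callers simply pass an ISBN; a sketch of the new call pattern (the ISBN value is illustrative):

from openlibrary.core.models import Edition

# Canonicalizes the ISBN, checks existing editions, then staged imports.
edition = Edition.from_isbn("978-1-55859-588-0")
if edition:
    print(edition.key)  # an Open Library edition key
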
11 changes: 1 addition & 10 deletions openlibrary/plugins/importapi/code.py
@@ -130,16 +130,7 @@ def POST(self):
data = web.data()

try:
- edition, format = parse_data(data)
- # Validation requires valid publishers and authors.
- # If data unavailable, provide throw-away data which validates
- # We use ["????"] as an override pattern
- if edition.get('publishers') == ["????"]:
- edition.pop('publishers')
- if edition.get('authors') == [{"name": "????"}]:
- edition.pop('authors')
- if edition.get('publish_date') == "????":
- edition.pop('publish_date')
+ edition, _ = parse_data(data)

except DataError as e:
return self.error(str(e), 'Failed to parse import data')
2 changes: 1 addition & 1 deletion openlibrary/plugins/openlibrary/code.py
@@ -471,7 +471,7 @@ def GET(self, isbn):
ext += '?' + web.ctx.env['QUERY_STRING']

try:
- if ed := Edition.from_isbn(isbn, retry=True):
+ if ed := Edition.from_isbn(isbn):
return web.found(ed.key + ext)
except Exception as e:
logger.error(e)