imports: add importer for ISBNdb #8511

Merged · 13 commits · Nov 18, 2023

Changes from all commits

2 changes: 2 additions & 0 deletions compose.override.yaml
@@ -21,6 +21,8 @@ services:
# The above volume mounts are required so that the local dev bind mount below
# does not clobber the data generated inside the image / container
- .:/openlibrary
environment:
- LOCAL_DEV=true
depends_on:
- db
- infobase
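For context, the diff only adds the flag; nothing here shows how LOCAL_DEV is consumed, so the snippet below is a hypothetical sketch of a typical reader:

import os

# Hypothetical consumer: treat the compose-provided flag as a boolean.
LOCAL_DEV = os.environ.get("LOCAL_DEV", "").lower() in ("1", "true", "yes")

if LOCAL_DEV:
    # e.g. enable dev-only behaviour such as verbose logging
    print("local development mode")
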
22 changes: 17 additions & 5 deletions openlibrary/catalog/add_book/__init__.py
@@ -765,11 +765,12 @@ def load_data(
def normalize_import_record(rec: dict) -> None:
"""
Normalize the import record by:
- - Verifying required fields
- - Ensuring source_records is a list
- - Splitting subtitles out of the title field
- - Cleaning all ISBN and LCCN fields ('bibids'), and
- - Deduplicate authors.
+ - Verifying required fields;
+ - Ensuring source_records is a list;
+ - Splitting subtitles out of the title field;
+ - Cleaning all ISBN and LCCN fields ('bibids');
+ - Deduplicating authors; and
+ - Removing throw-away data used for validation.

NOTE: This function modifies the passed-in rec in place.
"""
@@ -801,6 +802,17 @@ def normalize_import_record(rec: dict) -> None:
# deduplicate authors
rec['authors'] = uniq(rec.get('authors', []), dicthash)

# Validation by parse_data(), prior to calling load(), requires facially
# valid publishers, authors, and publish_date. If data are unavailable, we
# provide throw-away data which validates. We use ["????"] as an override,
# but this must be removed prior to import.
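# For example (values illustrative), a staged record such as
#     {'title': 'A Title', 'publishers': ['????'], 'publish_date': '????'}
# leaves this block as just {'title': 'A Title'}.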
if rec.get('publishers') == ["????"]:
rec.pop('publishers')
if rec.get('authors') == [{"name": "????"}]:
rec.pop('authors')
if rec.get('publish_date') == "????":
rec.pop('publish_date')


def validate_record(rec: dict) -> None:
"""
35 changes: 35 additions & 0 deletions openlibrary/catalog/add_book/tests/test_add_book.py
@@ -1475,3 +1475,38 @@ def test_future_publication_dates_are_deleted(self, year, expected):
normalize_import_record(rec=rec)
result = 'publish_date' in rec
assert result == expected

@pytest.mark.parametrize(
'rec, expected',
[
(
{
'title': 'first title',
'source_records': ['ia:someid'],
'publishers': ['????'],
'authors': [{'name': '????'}],
'publish_date': '????',
},
{'title': 'first title', 'source_records': ['ia:someid']},
),
(
{
'title': 'second title',
'source_records': ['ia:someid'],
'publishers': ['a publisher'],
'authors': [{'name': 'an author'}],
'publish_date': '2000',
},
{
'title': 'second title',
'source_records': ['ia:someid'],
'publishers': ['a publisher'],
'authors': [{'name': 'an author'}],
'publish_date': '2000',
},
),
],
)
def test_dummy_data_to_satisfy_parse_data_is_removed(self, rec, expected):
normalize_import_record(rec=rec)
assert rec == expected
117 changes: 114 additions & 3 deletions openlibrary/core/imports.py
@@ -1,7 +1,8 @@
"""Interface to import queue.
"""
from collections import defaultdict
- from typing import Any
+ from collections.abc import Iterable
+ from typing import TYPE_CHECKING, Any, Final

import logging
import datetime
@@ -10,14 +11,23 @@
import json

from psycopg2.errors import UndefinedTable, UniqueViolation
from pydantic import ValidationError
from web.db import ResultSet
from web.utils import Storage

from . import db

import contextlib
from openlibrary.catalog import add_book
from openlibrary.core import cache

logger = logging.getLogger("openlibrary.imports")

STAGED_SOURCES: Final = ('amazon', 'idb')

if TYPE_CHECKING:
from openlibrary.core.models import Edition


class Batch(web.storage):
@staticmethod
@@ -64,6 +74,7 @@ def normalize_items(self, items):
'batch_id': self.id,
# Partner bots set ia_id to eg "partner:978..."
'ia_id': item.get('ia_id'),
'status': item.get('status', 'pending'),
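# Status now flows through from the caller: a bulk source (e.g. the new
# ISBNdb importer this PR supports) can pass status='staged' to queue
# records for later on-demand import instead of the default 'pending'.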
'data': json.dumps(item.get('data'), sort_keys=True)
if item.get('data')
else None,
@@ -107,8 +118,108 @@ def get_items(self, status="pending"):
class ImportItem(web.storage):
@staticmethod
def find_pending(limit=1000):
- result = db.where("import_item", status="pending", order="id", limit=limit)
- return map(ImportItem, result)
+ if result := db.where("import_item", status="pending", order="id", limit=limit):
+ return map(ImportItem, result)
+
+ return None

@staticmethod
def find_staged_or_pending(
identifiers: Iterable[str], sources: Iterable[str] = STAGED_SOURCES
) -> ResultSet:
"""
Find staged or pending items in import_item matching the ia_id identifiers.

Given a list of ISBNs as identifiers, this creates a list of `ia_ids`
and queries the import_item table for them.

Generated `ia_ids` have the form `{source}:{identifier}` for each `source`
in `sources` and `identifier` in `identifiers`.
"""
ia_ids = [
f"{source}:{identifier}" for identifier in identifiers for source in sources
]

query = (
"SELECT * "
"FROM import_item "
"WHERE status IN ('staged', 'pending') "
"AND ia_id IN $ia_ids"
)
return db.query(query, vars={'ia_ids': ia_ids})

@staticmethod
def import_first_staged(
identifiers: list[str], sources: Iterable[str] = STAGED_SOURCES
) -> "Edition | None":
"""
Import the first staged item in import_item matching the ia_id identifiers.

This changes the status of matching ia_id identifiers to prevent a
race condition that can result in duplicate imports.
"""
ia_ids = [
f"{source}:{identifier}" for identifier in identifiers for source in sources
]

query_start_processing = (
"UPDATE import_item "
"SET status = 'processing' "
"WHERE status = 'staged' "
"AND ia_id IN $ia_ids "
"RETURNING *"
)
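# The single UPDATE ... RETURNING statement claims matching rows and
# returns them atomically, so two workers racing on the same ia_ids
# cannot both import the same staged record.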

# TODO: Would it be better to update by the specific ID, given we have
# the IDs? If this approach works generally, it could work for both
# `staged` and `pending` by making a dictionary of the original `status`
# values and restoring them all by `id`, save for the one upon which
# import was attempted.
query_finish_processing = (
"UPDATE import_item "
"SET status = 'staged' "
"WHERE status = 'processing' "
"AND ia_id IN $ia_ids"
)

if in_process_items := db.query(
query_start_processing, vars={'ia_ids': ia_ids}
):
item: ImportItem = ImportItem(in_process_items[0])
try:
return item.single_import()
except Exception: # noqa: BLE001
return None
finally:
db.query(query_finish_processing, vars={'ia_ids': ia_ids})

return None

def single_import(self) -> "Edition | None":
"""Import the item using load(), swallow errors, update status, and return the Edition if any."""
try:
# Avoids a circular import issue.
from openlibrary.plugins.importapi.code import parse_data

edition, _ = parse_data(self.data.encode('utf-8'))
if edition:
reply = add_book.load(edition)
if reply.get('success') and 'edition' in reply:
edition = reply['edition']
self.set_status(edition['status'], ol_key=edition['key']) # type: ignore[index]
return web.ctx.site.get(edition['key']) # type: ignore[index]
else:
error_code = reply.get('error_code', 'unknown-error')
self.set_status("failed", error=error_code)

except ValidationError:
self.set_status("failed", error="invalid-value")
return None
except Exception: # noqa: BLE001
self.set_status("failed", error="unknown-error")
return None

return None

@staticmethod
def find_by_identifier(identifier):
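Taken together, the new methods give callers a two-step staged-import flow. A minimal sketch (the identifiers are invented, and this assumes the usual Open Library web/db runtime context):

from openlibrary.core.imports import ImportItem

isbns = ["9781558595880", "1558595880"]  # hypothetical ISBN-13/ISBN-10 pair

# 1. Is anything for these identifiers already queued?
queued = ImportItem.find_staged_or_pending(identifiers=isbns)

# 2. If a staged record exists, claim it and import it on demand;
#    returns the imported Edition, or None.
edition = ImportItem.import_first_staged(identifiers=isbns)
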
38 changes: 21 additions & 17 deletions openlibrary/core/models.py
@@ -2,6 +2,7 @@
"""
from datetime import datetime, timedelta
import logging

import web
import json
import requests
@@ -15,20 +16,23 @@

# TODO: fix this. openlibrary.core should not import plugins.
from openlibrary import accounts
- from openlibrary.utils import extract_numeric_id_from_olid
- from openlibrary.core.helpers import private_collection_in
- from openlibrary.core.bookshelves import Bookshelves
+ from openlibrary.catalog import add_book
from openlibrary.core.booknotes import Booknotes
+ from openlibrary.core.bookshelves import Bookshelves
+ from openlibrary.core.helpers import private_collection_in
+ from openlibrary.core.imports import ImportItem
from openlibrary.core.observations import Observations
from openlibrary.core.ratings import Ratings
- from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical
from openlibrary.core.vendors import create_edition_from_amazon_metadata
+ from openlibrary.utils import extract_numeric_id_from_olid
+ from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical

# Seed might look unused, but removing it causes an error :/
from openlibrary.core.lists.model import ListMixin, Seed
from . import cache, waitinglist

import urllib
from pydantic import ValidationError

from .ia import get_metadata_direct
from .waitinglist import WaitingLoan
@@ -366,11 +370,12 @@ def get_ia_download_link(self, suffix):
return f"https://archive.org/download/{self.ocaid}/{filename}"

@classmethod
- def from_isbn(cls, isbn: str, retry: bool = False):
- """Attempts to fetch an edition by isbn, or if no edition is found,
- attempts to import from amazon
- :rtype: edition|None
- :return: an open library work for this isbn
+ def from_isbn(cls, isbn: str) -> "Edition | None":  # type: ignore[return]
+ """
+ Attempt to fetch an edition by ISBN; if no edition is found, check the
+ import_item table for a match; as a last resort, attempt to import
+ from Amazon.
+ :return: an Open Library edition for this ISBN, or None.
"""
isbn = canonical(isbn)

@@ -381,23 +386,22 @@ def from_isbn(cls, isbn: str, retry: bool = False):
if isbn13 is None:
return None # consider raising ValueError
isbn10 = isbn_13_to_isbn_10(isbn13)
isbns = [isbn13, isbn10] if isbn10 is not None else [isbn13]

# Attempt to fetch book from OL
- for isbn in [isbn13, isbn10]:
+ for isbn in isbns:
if isbn:
matches = web.ctx.site.things(
{"type": "/type/edition", 'isbn_%s' % len(isbn): isbn}
)
if matches:
return web.ctx.site.get(matches[0])

- # Attempt to create from amazon, then fetch from OL
- retries = 5 if retry else 0
- key = (isbn10 or isbn13) and create_edition_from_amazon_metadata(
- isbn10 or isbn13, retries=retries
- )
- if key:
- return web.ctx.site.get(key)
+ # Attempt to fetch the book from the import_item table
+ if result := ImportItem.import_first_staged(identifiers=isbns):
+ return result
+
+ # TODO: Final step - call affiliate server, with retry code migrated there.

def is_ia_scan(self):
metadata = self.get_ia_meta_fields()
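With the retry parameter gone, callers simply pass an ISBN; a sketch of the new call pattern (the ISBN value is illustrative):

from openlibrary.core.models import Edition

# Canonicalizes the ISBN, checks existing editions, then staged imports.
edition = Edition.from_isbn("978-1-55859-588-0")
if edition:
    print(edition.key)  # an Open Library edition key
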
11 changes: 1 addition & 10 deletions openlibrary/plugins/importapi/code.py
@@ -130,16 +130,7 @@ def POST(self):
data = web.data()

try:
- edition, format = parse_data(data)
- # Validation requires valid publishers and authors.
- # If data unavailable, provide throw-away data which validates
- # We use ["????"] as an override pattern
- if edition.get('publishers') == ["????"]:
- edition.pop('publishers')
- if edition.get('authors') == [{"name": "????"}]:
- edition.pop('authors')
- if edition.get('publish_date') == "????":
- edition.pop('publish_date')
+ edition, _ = parse_data(data)

except DataError as e:
return self.error(str(e), 'Failed to parse import data')
2 changes: 1 addition & 1 deletion openlibrary/plugins/openlibrary/code.py
@@ -471,7 +471,7 @@ def GET(self, isbn):
ext += '?' + web.ctx.env['QUERY_STRING']

try:
- if ed := Edition.from_isbn(isbn, retry=True):
+ if ed := Edition.from_isbn(isbn):
return web.found(ed.key + ext)
except Exception as e:
logger.error(e)