diff --git a/openlibrary/book_providers.py b/openlibrary/book_providers.py index ecca74f1c5d..097cbec822f 100644 --- a/openlibrary/book_providers.py +++ b/openlibrary/book_providers.py @@ -1,5 +1,8 @@ -from typing import TypedDict, Literal, cast, TypeVar, Generic +from dataclasses import dataclass +import logging from collections.abc import Callable, Iterator +from typing import TypedDict, Literal, cast, TypeVar, Generic +from urllib import parse import web from web import uniq @@ -10,6 +13,13 @@ from openlibrary.utils import OrderedEnum, multisort_best +logger = logging.getLogger("openlibrary.book_providers") + +AcquisitionAccessLiteral = Literal[ + 'sample', 'buy', 'open-access', 'borrow', 'subscribe' +] + + class EbookAccess(OrderedEnum): # Keep in sync with solr/conf/enumsConfig.xml ! NO_EBOOK = 0 @@ -21,6 +31,103 @@ class EbookAccess(OrderedEnum): def to_solr_str(self): return self.name.lower() + @staticmethod + def from_acquisition_access(literal: AcquisitionAccessLiteral) -> 'EbookAccess': + if literal == 'sample': + # We need to update solr to handle these! Requires full reindex + return EbookAccess.PRINTDISABLED + elif literal == 'buy': + return EbookAccess.NO_EBOOK + elif literal == 'open-access': + return EbookAccess.PUBLIC + elif literal == 'borrow': + return EbookAccess.BORROWABLE + elif literal == 'subscribe': + return EbookAccess.NO_EBOOK + else: + raise ValueError(f'Unknown access literal: {literal}') + + +@dataclass +class Acquisition: + """ + Acquisition represents a book resource found on another website, such as + Standard Ebooks. 
+ + Wording inspired by OPDS; see https://specs.opds.io/opds-1.2#23-acquisition-feeds + """ + + access: AcquisitionAccessLiteral + format: Literal['web', 'pdf', 'epub', 'audio'] + price: str | None + url: str + provider_name: str | None = None + + @property + def ebook_access(self) -> EbookAccess: + return EbookAccess.from_acquisition_access(self.access) + + @staticmethod + def from_json(json: dict) -> 'Acquisition': + if 'href' in json: + # OPDS-style provider + return Acquisition.from_opds_json(json) + elif 'url' in json: + # We have an inconsistency in our API + html_access: dict[str, AcquisitionAccessLiteral] = { + 'read': 'open-access', + 'listen': 'open-access', + 'buy': 'buy', + 'borrow': 'borrow', + 'preview': 'sample', + } + access = json.get('access', 'open-access') + if access in html_access: + access = html_access[access] + # Pressbooks/OL-style + return Acquisition( + access=access, + format=json.get('format', 'web'), + price=json.get('price'), + url=json['url'], + provider_name=json.get('provider_name'), + ) + else: + raise ValueError(f'Unknown ebook acquisition format: {json}') + + @staticmethod + def from_opds_json(json: dict) -> 'Acquisition': + if json.get('properties', {}).get('indirectAcquisition', None): + mimetype = json['properties']['indirectAcquisition'][0]['type'] + else: + mimetype = json['type'] + + fmt: Literal['web', 'pdf', 'epub', 'audio'] = 'web' + if mimetype.startswith('audio/'): + fmt = 'audio' + elif mimetype == 'application/pdf': + fmt = 'pdf' + elif mimetype == 'application/epub+zip': + fmt = 'epub' + elif mimetype == 'text/html': + fmt = 'web' + else: + logger.warning(f'Unknown mimetype: {mimetype}') + fmt = 'web' + + if json.get('properties', {}).get('price', None): + price = f"{json['properties']['price']['value']} {json['properties']['price']['currency']}" + else: + price = None + + return Acquisition( + access=json['rel'].split('/')[-1], + format=fmt, + price=price, + url=json['href'], + provider_name=json.get('name'), + ) 
def get_acquisitions(
    self,
    edition: Edition,
) -> list[Acquisition]:
    """
    Build the acquisitions described by the edition's `providers` entries.

    Returns an empty list when the edition has no providers.
    """
    provider_records = edition.providers
    if not provider_records:
        return []
    return [Acquisition.from_json(dict(record)) for record in provider_records]
class DirectProvider(AbstractBookProvider):
    """
    Provider for editions that link straight to an external website through
    their `providers` field rather than through an identifier.
    """

    short_name = 'direct'
    # Direct links are read from `providers`, not from an identifiers key.
    identifier_key = None

    @property
    def db_selector(self):
        return "providers.url"

    @property
    def solr_key(self):
        # TODO: Not implemented yet
        return None

    @staticmethod
    def _all_acquisitions(ed_or_solr: Edition | dict) -> list[Acquisition]:
        """Parse every `providers` entry; empty when the field is missing or empty."""
        return [
            Acquisition.from_json(provider)
            for provider in (ed_or_solr.get('providers') or [])
        ]

    @classmethod
    def _readable_acquisitions(cls, ed_or_solr: Edition | dict) -> list[Acquisition]:
        """Return the acquisitions whose access is at least PRINTDISABLED."""
        return [
            acquisition
            for acquisition in cls._all_acquisitions(ed_or_solr)
            if acquisition.ebook_access >= EbookAccess.PRINTDISABLED
        ]

    def get_identifiers(self, ed_or_solr: Edition | dict) -> list[str]:
        """
        Return the URLs of the readable acquisitions on an edition.

        TODO: Not implemented for search/solr yet; records without a
        `providers` field yield an empty list.
        """
        return [
            acquisition.url
            for acquisition in self._readable_acquisitions(ed_or_solr)
        ]

    def render_read_button(
        self, ed_or_solr: Edition | dict, analytics_attr: Callable[[str], str]
    ):
        """
        Render the read button for the readable acquisition with the highest
        access level, or return an empty string when there is none.
        """
        # `max` returns the first of several equally-ranked acquisitions, which
        # matches taking the head of a stable descending sort.
        acquisition = max(
            self._readable_acquisitions(ed_or_solr),
            key=lambda acq: acq.ebook_access,
            default=None,
        )
        if acquisition is None:
            return ''

        # Pre-process the url so ParseResult.netloc is always the domain; only
        # netloc is used. A bare "http" prefix test would mistake hosts such as
        # "httpbin.org" for urls that already carry a scheme.
        has_scheme = acquisition.url.lower().startswith(('http://', 'https://'))
        url = acquisition.url if has_scheme else "https://" + acquisition.url
        domain = parse.urlparse(url).netloc
        return render_template(
            self.get_template_path('read_button'), acquisition, domain
        )

    def render_download_options(self, edition: Edition, extra_args: list | None = None):
        # Return an empty string until #9581 is addressed.
        return ""

    def get_access(
        self,
        edition: dict,
        metadata: TProviderMetadata | None = None,
    ) -> EbookAccess:
        """
        Return the access level of the edition.

        This is the highest access level among the edition's providers, so it
        agrees with the acquisition chosen by `render_read_button`. An edition
        without providers has no ebook.
        """
        return max(
            (acq.ebook_access for acq in self._all_acquisitions(edition)),
            default=EbookAccess.NO_EBOOK,
        )
book_providers/cita_press_read_button.html +#: book_providers/direct_read_button.html #: book_providers/gutenberg_read_button.html #: book_providers/librivox_read_button.html #: book_providers/openstax_read_button.html @@ -836,6 +838,7 @@ msgid "Currently Reading" msgstr "" #: LoanReadForm.html ReadButton.html book_providers/cita_press_read_button.html +#: book_providers/direct_read_button.html #: book_providers/gutenberg_read_button.html #: book_providers/openstax_read_button.html #: book_providers/standard_ebooks_read_button.html books/custom_carousel.html @@ -2855,6 +2858,7 @@ msgid "" msgstr "" #: book_providers/cita_press_read_button.html +#: book_providers/direct_read_button.html #: book_providers/gutenberg_read_button.html #: book_providers/librivox_read_button.html #: book_providers/openstax_read_button.html @@ -2863,6 +2867,17 @@ msgstr "" msgid "Learn more" msgstr "" +#: book_providers/direct_read_button.html +msgid "Read free online" +msgstr "" + +#: book_providers/direct_read_button.html +#, python-format +msgid "" +"This book is freely available from %s, an external " +"third-party book provider." 
+msgstr "" + #: book_providers/gutenberg_download_options.html msgid "Download an HTML from Project Gutenberg" msgstr "" @@ -3892,10 +3907,6 @@ msgstr "" msgid "Provider Name" msgstr "" -#: ReadButton.html books/edit/edition.html -msgid "Listen" -msgstr "" - #: books/edit/edition.html msgid "Buy" msgstr "" @@ -7129,6 +7140,10 @@ msgstr "" msgid "Read ebook from Internet Archive" msgstr "" +#: ReadButton.html +msgid "Listen" +msgstr "" + #: ReadMore.html msgid "Read more" msgstr "" diff --git a/openlibrary/plugins/upstream/borrow.py b/openlibrary/plugins/upstream/borrow.py index 0f7d03cbcd6..bf17089bb7e 100644 --- a/openlibrary/plugins/upstream/borrow.py +++ b/openlibrary/plugins/upstream/borrow.py @@ -112,7 +112,7 @@ class borrow(delegate.page): def GET(self, key): return self.POST(key) - def POST(self, key): + def POST(self, key): # noqa: PLR0915 """Called when the user wants to borrow the edition""" i = web.input( @@ -130,6 +130,19 @@ def POST(self, key): if not edition: raise web.notfound() + from openlibrary.book_providers import get_book_provider + + # Direct to the first web book if at least one is available. 
+ if ( + action in ["borrow", "read"] + and (provider := get_book_provider(edition)) + and provider.short_name != "ia" + and (acquisitions := provider.get_acquisitions(edition)) + and acquisitions[0].access == "open-access" + ): + stats.increment('ol.loans.webbook') + raise web.seeother(acquisitions[0].url) + archive_url = get_bookreader_stream_url(edition.ocaid) + '?ref=ol' if i._autoReadAloud is not None: archive_url += '&_autoReadAloud=show' diff --git a/openlibrary/plugins/worksearch/code.py b/openlibrary/plugins/worksearch/code.py index 99db1a1ca44..f122de43c65 100644 --- a/openlibrary/plugins/worksearch/code.py +++ b/openlibrary/plugins/worksearch/code.py @@ -130,7 +130,7 @@ def execute_solr_query( public(has_solr_editions_enabled) -def run_solr_query( +def run_solr_query( # noqa: PLR0912 scheme: SearchScheme, param: dict | None = None, rows=100, @@ -216,7 +216,9 @@ def run_solr_query( q = f'{q} {params_q}' if q else params_q if q: - solr_fields = set(fields or scheme.default_fetched_fields) + solr_fields = ( + set(fields or scheme.default_fetched_fields) - scheme.non_solr_fields + ) if 'editions' in solr_fields: solr_fields.remove('editions') solr_fields.add('editions:[subquery]') @@ -236,6 +238,12 @@ def run_solr_query( solr_result = response.json() if response else None end_time = time.time() duration = end_time - start_time + + if solr_result is not None: + non_solr_fields = set(fields) & scheme.non_solr_fields + if non_solr_fields: + scheme.add_non_solr_fields(non_solr_fields, solr_result) + return SearchResponse.from_solr_result(solr_result, sort, url, time=duration) @@ -301,14 +309,18 @@ def do_search( :param spellcheck_count: Not really used; should probably drop """ + fields = WorkSearchScheme.default_fetched_fields | {'editions', 'providers'} if web.cookies(sfw="").sfw == 'yes': - fields = list( - WorkSearchScheme.default_fetched_fields | {'editions'} | {'subject'} - ) - else: - fields = list(WorkSearchScheme.default_fetched_fields | {'editions'}) + 
def add_non_solr_fields(self, solr_fields: set[str], solr_result: dict) -> None:
    """
    Hook for augmenting `solr_result` with fields that are not stored in solr.

    The base scheme provides no implementation and always raises
    NotImplementedError.
    """
    raise NotImplementedError
def add_non_solr_fields(self, non_solr_fields: set[str], solr_result: dict) -> None:
    """
    Augment the edition docs in `solr_result` with data from the db.

    Only the `providers` and `description` fields are filled in; any other
    requested field is ignored. `solr_result` is modified in place.

    :param non_solr_fields: names of the requested fields not stored in solr
    :param solr_result: raw solr response whose edition docs are augmented
    """
    edition_docs = [
        ed_doc
        for doc in solr_result['response']['docs']
        for ed_doc in doc.get('editions', {}).get('docs', [])
    ]
    if not edition_docs:
        # Nothing to augment, so skip the db round trip entirely.
        return

    from openlibrary.book_providers import get_book_provider
    from openlibrary.plugins.upstream.models import Edition

    edition_keys = [ed_doc['key'] for ed_doc in edition_docs]
    # Set membership keeps the filter below linear in the number of editions.
    requested_keys = set(edition_keys)
    editions = cast(list[Edition], web.ctx.site.get_many(edition_keys))
    ed_key_to_record = {ed.key: ed for ed in editions if ed.key in requested_keys}

    for ed_doc in edition_docs:
        # `ed` could be `None` if the record has been deleted and Solr not yet updated.
        if not (ed := ed_key_to_record.get(ed_doc['key'])):
            continue

        for field in non_solr_fields:
            if field == 'providers':
                # Providers come from the edition's book provider, not from
                # the raw field value.
                if provider := get_book_provider(ed):
                    ed_doc[field] = [
                        p.__dict__ for p in provider.get_acquisitions(ed)
                    ]
                continue

            val = getattr(ed, field)
            if isinstance(val, infogami.infobase.client.Nothing):
                continue
            if field == 'description':
                # A description is either a plain string or an object with `.value`.
                ed_doc[field] = val if isinstance(val, str) else val.value
+ +$elif acquisition.access == 'sample': + + +$if render_once('direct-provider-toast'): + diff --git a/openlibrary/templates/book_providers/ia_download_options.html b/openlibrary/templates/book_providers/ia_download_options.html index a32e226ba72..99516e94513 100644 --- a/openlibrary/templates/book_providers/ia_download_options.html +++ b/openlibrary/templates/book_providers/ia_download_options.html @@ -4,7 +4,7 @@$_("Download Options")