Merge pull request #6591 from cdrini/6379/feature/tbp-hasfulltext

Add access solr fields to non-IA book providers
internetarchive · Jun 15, 2022 · 823b566
2 parents 7768c98 + 32e867e
commit 823b566
Show file tree

Hide file tree

Showing 3 changed files with 111 additions and 73 deletions.
diff --git a/openlibrary/book_providers.py b/openlibrary/book_providers.py
@@ -1,15 +1,36 @@
-from typing import Optional, Union, Literal, Iterator, cast
+from typing import Optional, TypedDict, Union, Literal, Iterator, cast, TypeVar, Generic
 
 import web
 from web import uniq
 
 from openlibrary.app import render_template
 from openlibrary.plugins.upstream.models import Edition
 from openlibrary.plugins.upstream.utils import get_coverstore_public_url
-from openlibrary.utils import multisort_best
+from openlibrary.utils import OrderedEnum, multisort_best
 
 
-class AbstractBookProvider:
+class EbookAccess(OrderedEnum):
+    # Keep in sync with solr/conf/enumsConfig.xml !
+    NO_EBOOK = 0
+    UNCLASSIFIED = 1
+    PRINTDISABLED = 2
+    BORROWABLE = 3
+    PUBLIC = 4
+
+    def to_solr_str(self):
+        return self.name.lower()
+
+
+class IALiteMetadata(TypedDict):
+    boxid: set[str]
+    collection: set[str]
+    access_restricted_item: Optional[Literal['true', 'false']]
+
+
+TProviderMetadata = TypeVar('TProviderMetadata')
+
+
+class AbstractBookProvider(Generic[TProviderMetadata]):
     short_name: str
 
     """
@@ -76,8 +97,19 @@ def is_own_ocaid(self, ocaid: str) -> bool:
         """Whether the ocaid is an archive of content from this provider"""
         return False
 
+    def get_access(
+        self,
+        edition: dict,
+        metadata: TProviderMetadata = None,
+    ) -> EbookAccess:
+        """
+        Return the access level of the edition.
+        """
+        # Most providers are for public-only ebooks right now
+        return EbookAccess.PUBLIC
 
-class InternetArchiveProvider(AbstractBookProvider):
+
+class InternetArchiveProvider(AbstractBookProvider[IALiteMetadata]):
     short_name = 'ia'
     identifier_key = 'ocaid'
 
@@ -126,6 +158,25 @@ def render_download_options(self, edition: Edition, extra_args: list = None):
         else:
             return ''
 
+    def get_access(self, edition: dict, metadata: IALiteMetadata = None) -> EbookAccess:
+        if not metadata:
+            if edition.get('ocaid'):
+                return EbookAccess.UNCLASSIFIED
+            else:
+                return EbookAccess.NO_EBOOK
+
+        collections = metadata.get('collection', set())
+        access_restricted_item = metadata.get('access_restricted_item') == "true"
+
+        if 'inlibrary' in collections:
+            return EbookAccess.BORROWABLE
+        elif 'printdisabled' in collections:
+            return EbookAccess.PRINTDISABLED
+        elif access_restricted_item or not collections:
+            return EbookAccess.UNCLASSIFIED
+        else:
+            return EbookAccess.PUBLIC
+
 
 class LibriVoxProvider(AbstractBookProvider):
     short_name = 'librivox'
@@ -232,7 +283,10 @@ def get_provider_order(prefer_ia=False) -> list[AbstractBookProvider]:
     default_order = prefer_ia_provider_order if prefer_ia else PROVIDER_ORDER
 
     provider_order = default_order
-    provider_overrides = web.input(providerPref=None).providerPref
+    provider_overrides = None
+    # Need this to work in test environments
+    if 'env' in web.ctx:
+        provider_overrides = web.input(providerPref=None, _method='GET').providerPref
     if provider_overrides:
         new_order: list[AbstractBookProvider] = []
         for name in provider_overrides.split(','):

diff --git a/openlibrary/plugins/worksearch/code.py b/openlibrary/plugins/worksearch/code.py
@@ -22,7 +22,6 @@
 from openlibrary.plugins.inside.code import fulltext_search
 from openlibrary.plugins.openlibrary.processors import urlsafe
 from openlibrary.plugins.upstream.utils import get_language_name, urlencode
-from openlibrary.solr.update_work import get_solr_next
 from openlibrary.solr.solr_types import SolrDocument
 from openlibrary.utils import escape_bracket
 from openlibrary.utils.ddc import (

diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py
@@ -6,7 +6,7 @@
 from json import JSONDecodeError
 from math import ceil
 from statistics import median
-from typing import Iterable, Literal, List, Optional, cast, TypedDict, Any, Union
+from typing import Iterable, Literal, List, Optional, cast, Any, Union
 
 import httpx
 import requests
@@ -23,6 +23,7 @@
 import web
 
 from openlibrary import config
+import openlibrary.book_providers as bp
 from openlibrary.catalog.utils.query import set_query_host, base_url as get_ol_base_url
 from openlibrary.core import helpers as h
 from openlibrary.plugins.upstream.utils import safeget
@@ -32,7 +33,7 @@
     ExternalDataProvider,
 )
 from openlibrary.solr.solr_types import SolrDocument
-from openlibrary.utils import uniq, OrderedEnum
+from openlibrary.utils import uniq
 from openlibrary.utils.ddc import normalize_ddc, choose_sorting_ddc
 from openlibrary.utils.isbn import opposite_isbn
 from openlibrary.utils.lcc import short_lcc_to_sortable_lcc, choose_sorting_lcc
@@ -100,13 +101,7 @@ def extract_edition_olid(key: str) -> str:
     return m.group(1)
 
 
-class IALiteMetadata(TypedDict):
-    boxid: set[str]
-    collection: set[str]
-    access_restricted_item: Optional[Literal['true', 'false']]
-
-
-def get_ia_collection_and_box_id(ia: str) -> Optional[IALiteMetadata]:
+def get_ia_collection_and_box_id(ia: str) -> Optional['bp.IALiteMetadata']:
     """
     Get the collections and boxids of the provided IA id
 
@@ -310,32 +305,6 @@ def datetimestr_to_int(datestr):
     return int(time.mktime(t.timetuple()))
 
 
-class EbookAccess(OrderedEnum):
-    # Keep in sync with solr/conf/enumsConfig.xml !
-    NO_EBOOK = 0
-    UNCLASSIFIED = 1
-    PRINTDISABLED = 2
-    BORROWABLE = 3
-    PUBLIC = 4
-
-    def to_solr_str(self):
-        return self.name.lower()
-
-
-def get_ia_access_enum(
-    collections: set[str],
-    access_restricted_item: bool,
-) -> EbookAccess:
-    if 'inlibrary' in collections:
-        return EbookAccess.BORROWABLE
-    elif 'printdisabled' in collections:
-        return EbookAccess.PRINTDISABLED
-    elif access_restricted_item or not collections:
-        return EbookAccess.UNCLASSIFIED
-    else:
-        return EbookAccess.PUBLIC
-
-
 class SolrProcessor:
     """Processes data to into a form suitable for adding to works solr."""
 
@@ -562,7 +531,7 @@ def build_data(
         self,
         w: dict,
         editions: list[dict],
-        ia_metadata: dict[str, Optional[IALiteMetadata]],
+        ia_metadata: dict[str, Optional['bp.IALiteMetadata']],
     ) -> dict:
         """
         Get the Solr document to insert for the provided work.
@@ -716,53 +685,61 @@ def get_last_modified(self, work, editions):
     @staticmethod
     def get_ebook_info(
         editions: list[dict],
-        ia_metadata: dict[str, Optional[IALiteMetadata]],
+        ia_metadata: dict[str, Optional['bp.IALiteMetadata']],
     ) -> dict:
         """
         Add ebook information from the editions to the work Solr document.
         """
         ebook_info: dict[str, Any] = {}
+        ia_provider = cast(
+            bp.InternetArchiveProvider, bp.get_book_provider_by_name('ia')
+        )
+
+        # Default values
+        best_access = bp.EbookAccess.NO_EBOOK
+        ebook_count = 0
+
+        for edition in editions:
+            provider = bp.get_book_provider(edition)
+            if provider is None:
+                continue
+
+            if provider == ia_provider:
+                access = provider.get_access(
+                    edition, ia_metadata.get(edition['ocaid'].strip())
+                )
+            else:
+                access = provider.get_access(edition)
+
+            if access > best_access:
+                best_access = access
+
+            if access > bp.EbookAccess.UNCLASSIFIED:
+                ebook_count += 1
+
+        ebook_info["ebook_count_i"] = ebook_count
+        if get_solr_next():
+            ebook_info["ebook_access"] = best_access.to_solr_str()
+        ebook_info["has_fulltext"] = best_access > bp.EbookAccess.UNCLASSIFIED
+        ebook_info["public_scan_b"] = best_access == bp.EbookAccess.PUBLIC
+
+        # IA-specific stuff
 
         def get_ia_sorting_key(ed: dict) -> tuple[int, str]:
             ocaid = ed['ocaid'].strip()
-            md = ia_metadata.get(ocaid)
-            access = EbookAccess.UNCLASSIFIED
-            if md is not None:
-                access = get_ia_access_enum(
-                    md.get('collection', set()),
-                    md.get('access_restricted_item') == "true",
-                )
+            access = ia_provider.get_access(ed, ia_metadata.get(ocaid))
             return (
                 # -1 to sort in reverse and make public first
                 -1 * access.value,
                 # De-prioritize google scans because they are lower quality
                 '0: non-goog' if not ocaid.endswith('goog') else '1: goog',
             )
 
+        # Store identifiers sorted by most-accessible first.
         ia_eds = sorted((e for e in editions if 'ocaid' in e), key=get_ia_sorting_key)
         ebook_info['ia'] = [e['ocaid'].strip() for e in ia_eds]
-        ebook_info["ebook_count_i"] = len(ia_eds)
-
-        # Default values
-        ebook_info["has_fulltext"] = False
-        ebook_info["public_scan_b"] = False
-        if get_solr_next():
-            ebook_info["ebook_access"] = EbookAccess.NO_EBOOK.to_solr_str()
 
         if ia_eds:
-            best_ed = ia_eds[0]
-            best_access = get_ia_access_enum(
-                best_ed.get('ia_collection', []),
-                best_ed.get('access_restricted_item') == "true",
-            )
-            if get_solr_next():
-                ebook_info["ebook_access"] = best_access.to_solr_str()
-
-            if best_access > EbookAccess.UNCLASSIFIED:
-                ebook_info["has_fulltext"] = True
-            if best_access == EbookAccess.PUBLIC:
-                ebook_info['public_scan_b'] = True
-
             all_collection = sorted(
                 uniq(
                     c
@@ -776,7 +753,11 @@ def get_ia_sorting_key(ed: dict) -> tuple[int, str]:
             if all_collection:
                 ebook_info['ia_collection_s'] = ';'.join(all_collection)
 
-            if best_access > EbookAccess.PRINTDISABLED:
+            # --- These should be deprecated and removed ---
+            best_ed = ia_eds[0]
+            best_ocaid = best_ed['ocaid'].strip()
+            best_access = ia_provider.get_access(best_ed, ia_metadata.get(best_ocaid))
+            if best_access > bp.EbookAccess.PRINTDISABLED:
                 ebook_info['lending_edition_s'] = extract_edition_olid(best_ed['key'])
                 ebook_info['lending_identifier_s'] = best_ed['ocaid']
 
@@ -787,12 +768,13 @@ def get_ia_sorting_key(ed: dict) -> tuple[int, str]:
             ]
             if printdisabled:
                 ebook_info['printdisabled_s'] = ';'.join(printdisabled)
+            # ^^^ These should be deprecated and removed ^^^
         return ebook_info
 
 
 async def build_data(
     w: dict,
-    ia_metadata: dict[str, Optional[IALiteMetadata]] = None,
+    ia_metadata: dict[str, Optional['bp.IALiteMetadata']] = None,
 ) -> SolrDocument:
     """
     Construct the Solr document to insert into Solr for the given work
@@ -814,7 +796,10 @@ async def build_data(
 
 
 def build_data2(
-    w: dict, editions: list[dict], authors, ia: dict[str, Optional[IALiteMetadata]]
+    w: dict,
+    editions: list[dict],
+    authors,
+    ia: dict[str, Optional['bp.IALiteMetadata']],
 ) -> SolrDocument:
     """
     Construct the Solr document to insert into Solr for the given work