Merge pull request #6559 from cdrini/solr-availability

Add sortable enum field to solr for ebook access
internetarchive · Jun 15, 2022 · 7768c98 · 7768c98
2 parents fa6ff90 + aef9cf3
commit 7768c98
Show file tree

Hide file tree

Showing 6 changed files with 117 additions and 33 deletions.
diff --git a/conf/solr/conf/enumsConfig.xml b/conf/solr/conf/enumsConfig.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<enumsConfig>
+  <enum name="ebookAccess">
+    <value>no_ebook</value>
+    <value>unclassified</value>
+    <value>printdisabled</value>
+    <value>borrowable</value>
+    <value>public</value>
+  </enum>
+</enumsConfig>
diff --git a/conf/solr/conf/managed-schema b/conf/solr/conf/managed-schema
@@ -164,6 +164,7 @@
     <field name="ia_count" type="pint"/>
     <field name="oclc" type="text_en_splitting" multiValued="true"/>
     <field name="isbn" type="string" multiValued="true"/>
+    <field name="ebook_access" type="ebookAccessLevel" multiValued="false"/>
 
     <!-- Classifications -->
     <field name="lcc" type="string" multiValued="true" />
@@ -385,6 +386,9 @@
     <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
     <fieldType name="binary" class="solr.BinaryField"/>
 
+    <!-- Enum field type for ebook access; values sort in the order they are listed in enumsConfig.xml -->
+    <fieldType name="ebookAccessLevel" class="solr.EnumFieldType" docValues="true" enumsConfig="enumsConfig.xml" enumName="ebookAccess"/>
+
     <!-- solr.TextField allows the specification of custom text analyzers
          specified as a tokenizer and a list of token filters. Different
          analyzers may be specified for indexing and querying.

diff --git a/openlibrary/solr/solr_types.py b/openlibrary/solr/solr_types.py
@@ -1,4 +1,5 @@
 # This file is auto-generated by types_generator.py
+# fmt: off
 from typing import Literal, TypedDict, Optional
 
 
@@ -30,6 +31,7 @@ class SolrDocument(TypedDict):
     ia_count: Optional[int]
     oclc: Optional[list[str]]
     isbn: Optional[list[str]]
+    ebook_access: Optional[Literal['no_ebook', 'unclassified', 'printdisabled', 'borrowable', 'public']]
     lcc: Optional[list[str]]
     lcc_sort: Optional[str]
     ddc: Optional[list[str]]
@@ -67,3 +69,5 @@ class SolrDocument(TypedDict):
     top_work: Optional[str]
     top_subjects: Optional[list[str]]
     subject_type: Optional[str]
+
+# fmt: on
diff --git a/openlibrary/solr/types_generator.py b/openlibrary/solr/types_generator.py
@@ -32,8 +32,30 @@ def generate():
 
         if name in OVERRIDES:
             python_type = OVERRIDES[name]
-        else:
+        elif typ in type_map:
             python_type = type_map[typ]
+        elif (
+            field_type := solr_schema.find(f".//fieldType[@name='{typ}']")
+        ) is not None:
+            field_class = field_type.get('class')
+            if field_class == 'solr.EnumFieldType':
+                enumsConfigFile = field_type.get('enumsConfig')
+                enumsConfig = ET.parse(
+                    os.path.join(root, '../../conf/solr/conf/', enumsConfigFile)
+                )
+                enum_values = [
+                    el.text
+                    for el in enumsConfig.findall(
+                        f".//enum[@name='{field_type.get('enumName')}']/value"
+                    )
+                ]
+                python_type = f"Literal[{', '.join(map(repr, enum_values))}]"
+            else:
+                raise Exception(f"Unknown field type class {field_class}")
+        else:
+            raise Exception(f"Unknown field type {typ}")
+
+        if name not in OVERRIDES:
             if multivalued:
                 python_type = f"list[{python_type}]"
             if not required:
@@ -47,11 +69,14 @@ def generate():
 
     body = '\n'.join(python_fields)
     python = f"""# This file is auto-generated by types_generator.py
+# fmt: off
 from typing import Literal, TypedDict, Optional
 
 
 class SolrDocument(TypedDict):
-{body}"""
+{body}
+
+# fmt: on"""
 
     return python
 

diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py
@@ -3,7 +3,6 @@
 import logging
 import os
 import re
-from enum import IntEnum
 from json import JSONDecodeError
 from math import ceil
 from statistics import median
@@ -33,7 +32,7 @@
     ExternalDataProvider,
 )
 from openlibrary.solr.solr_types import SolrDocument
-from openlibrary.utils import uniq
+from openlibrary.utils import uniq, OrderedEnum
 from openlibrary.utils.ddc import normalize_ddc, choose_sorting_ddc
 from openlibrary.utils.isbn import opposite_isbn
 from openlibrary.utils.lcc import short_lcc_to_sortable_lcc, choose_sorting_lcc
@@ -311,6 +310,32 @@ def datetimestr_to_int(datestr):
     return int(time.mktime(t.timetuple()))
 
 
+class EbookAccess(OrderedEnum):  # Ebook access levels; members compare by value
+    # Keep names and order in sync with conf/solr/conf/enumsConfig.xml !
+    NO_EBOOK = 0
+    UNCLASSIFIED = 1
+    PRINTDISABLED = 2
+    BORROWABLE = 3
+    PUBLIC = 4
+
+    def to_solr_str(self):  # Lowercased member name, e.g. PUBLIC -> 'public'
+        return self.name.lower()
+
+
+def get_ia_access_enum(  # Classify access from collections + restricted flag
+    collections: set[str],  # only membership and emptiness are tested
+    access_restricted_item: bool,
+) -> EbookAccess:
+    if 'inlibrary' in collections:  # checked first, so it wins over 'printdisabled'
+        return EbookAccess.BORROWABLE
+    elif 'printdisabled' in collections:
+        return EbookAccess.PRINTDISABLED
+    elif access_restricted_item or not collections:  # restricted, or no collections
+        return EbookAccess.UNCLASSIFIED
+    else:  # unrestricted, non-empty, and neither collection above is present
+        return EbookAccess.PUBLIC
+
+
 class SolrProcessor:
     """Processes data to into a form suitable for adding to works solr."""
 
@@ -364,7 +389,9 @@ def process_editions(self, w, editions, ia_metadata, identifiers):
                 e['public_scan'] = ('lendinglibrary' not in collection) and (
                     'printdisabled' not in collection
                 )
-                e['access_restricted_item'] = ia_meta_fields.get('access_restricted_item', False)
+                e['access_restricted_item'] = ia_meta_fields.get(
+                    'access_restricted_item', False
+                )
 
             if 'identifiers' in e:
                 for k, id_list in e['identifiers'].items():
@@ -696,36 +723,18 @@ def get_ebook_info(
         """
         ebook_info: dict[str, Any] = {}
 
-        class AvailabilityEnum(IntEnum):
-            PUBLIC = 1
-            BORROWABLE = 2
-            PRINTDISABLED = 3
-            UNCLASSIFIED = 4
-
-        def get_ia_availability_enum(
-            collections: set[str],
-            access_restricted_item: bool,
-        ) -> AvailabilityEnum:
-            if 'inlibrary' in collections:
-                return AvailabilityEnum.BORROWABLE
-            elif 'printdisabled' in collections:
-                return AvailabilityEnum.PRINTDISABLED
-            elif access_restricted_item or not collections:
-                return AvailabilityEnum.UNCLASSIFIED
-            else:
-                return AvailabilityEnum.PUBLIC
-
-        def get_ia_sorting_key(ed: dict) -> tuple[AvailabilityEnum, str]:
+        def get_ia_sorting_key(ed: dict) -> tuple[int, str]:
             ocaid = ed['ocaid'].strip()
             md = ia_metadata.get(ocaid)
-            availability = AvailabilityEnum.UNCLASSIFIED
+            access = EbookAccess.UNCLASSIFIED
             if md is not None:
-                availability = get_ia_availability_enum(
+                access = get_ia_access_enum(
                     md.get('collection', set()),
                     md.get('access_restricted_item') == "true",
                 )
             return (
-                availability,
+                # -1 to sort in reverse and make public first
+                -1 * access.value,
                 # De-prioritize google scans because they are lower quality
                 '0: non-goog' if not ocaid.endswith('goog') else '1: goog',
             )
@@ -734,16 +743,24 @@ def get_ia_sorting_key(ed: dict) -> tuple[AvailabilityEnum, str]:
         ebook_info['ia'] = [e['ocaid'].strip() for e in ia_eds]
         ebook_info["ebook_count_i"] = len(ia_eds)
 
-        # These should always be set, for some reason.
+        # Default values
         ebook_info["has_fulltext"] = False
         ebook_info["public_scan_b"] = False
+        if get_solr_next():
+            ebook_info["ebook_access"] = EbookAccess.NO_EBOOK.to_solr_str()
 
         if ia_eds:
-            best_availability = get_ia_sorting_key(ia_eds[0])[0]
             best_ed = ia_eds[0]
-            if best_availability < AvailabilityEnum.UNCLASSIFIED:
+            best_access = get_ia_access_enum(
+                best_ed.get('ia_collection', []),
+                best_ed.get('access_restricted_item') == "true",
+            )
+            if get_solr_next():
+                ebook_info["ebook_access"] = best_access.to_solr_str()
+
+            if best_access > EbookAccess.UNCLASSIFIED:
                 ebook_info["has_fulltext"] = True
-            if best_availability == AvailabilityEnum.PUBLIC:
+            if best_access == EbookAccess.PUBLIC:
                 ebook_info['public_scan_b'] = True
 
             all_collection = sorted(
@@ -759,7 +776,7 @@ def get_ia_sorting_key(ed: dict) -> tuple[AvailabilityEnum, str]:
             if all_collection:
                 ebook_info['ia_collection_s'] = ';'.join(all_collection)
 
-            if best_availability < AvailabilityEnum.PRINTDISABLED:
+            if best_access > EbookAccess.PRINTDISABLED:
                 ebook_info['lending_edition_s'] = extract_edition_olid(best_ed['key'])
                 ebook_info['lending_identifier_s'] = best_ed['ocaid']
 

diff --git a/openlibrary/utils/__init__.py b/openlibrary/utils/__init__.py
@@ -1,5 +1,6 @@
 """Generic utilities"""
 
+from enum import Enum
 import re
 from subprocess import PIPE, Popen, STDOUT
 from typing import TypeVar, Iterable, Literal, Callable, Optional
@@ -182,3 +183,26 @@ def is_number(s):
 def get_software_version():  # -> str:
     cmd = "git rev-parse --short HEAD --".split()
     return str(Popen(cmd, stdout=PIPE, stderr=STDOUT).stdout.read().decode().strip())
+
+
+# See https://docs.python.org/3/howto/enum.html#orderedenum
+class OrderedEnum(Enum):  # Enum whose members are ordered by .value within one class
+    def __ge__(self, other):
+        if self.__class__ is other.__class__:  # only compare members of the same enum
+            return self.value >= other.value
+        return NotImplemented  # defer to Python's fallback for unsupported operands
+
+    def __gt__(self, other):
+        if self.__class__ is other.__class__:  # same-enum guard, as in __ge__
+            return self.value > other.value
+        return NotImplemented
+
+    def __le__(self, other):
+        if self.__class__ is other.__class__:  # same-enum guard, as in __ge__
+            return self.value <= other.value
+        return NotImplemented
+
+    def __lt__(self, other):
+        if self.__class__ is other.__class__:  # same-enum guard, as in __ge__
+            return self.value < other.value
+        return NotImplemented