Skip to content

Commit

Permalink
Merge pull request #6559 from cdrini/solr-availability
Browse files Browse the repository at this point in the history
Add sortable enum field to solr for ebook access
  • Loading branch information
mekarpeles authored Jun 15, 2022
2 parents fa6ff90 + aef9cf3 commit 7768c98
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 33 deletions.
10 changes: 10 additions & 0 deletions conf/solr/conf/enumsConfig.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0"?>
<!--
  Enum definitions for Solr fields.

  "ebookAccess" is used by the ebookAccessLevel fieldType in managed-schema.
  The order of the values matters: the EbookAccess class in
  openlibrary/solr/update_work.py numbers its members 0 to 4 in this same
  order, so any change here must be mirrored there.
-->
<enumsConfig>
<enum name="ebookAccess">
<value>no_ebook</value>
<value>unclassified</value>
<value>printdisabled</value>
<value>borrowable</value>
<value>public</value>
</enum>
</enumsConfig>
4 changes: 4 additions & 0 deletions conf/solr/conf/managed-schema
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@
<field name="ia_count" type="pint"/>
<field name="oclc" type="text_en_splitting" multiValued="true"/>
<field name="isbn" type="string" multiValued="true"/>
<field name="ebook_access" type="ebookAccessLevel" multiValued="false"/>

<!-- Classifications -->
<field name="lcc" type="string" multiValued="true" />
Expand Down Expand Up @@ -385,6 +386,9 @@
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
<fieldType name="binary" class="solr.BinaryField"/>

<!-- Custom field to allow storing sortable access -->
<fieldType name="ebookAccessLevel" class="solr.EnumFieldType" docValues="true" enumsConfig="enumsConfig.xml" enumName="ebookAccess"/>

<!-- solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying.
Expand Down
4 changes: 4 additions & 0 deletions openlibrary/solr/solr_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# This file is auto-generated by types_generator.py
# fmt: off
from typing import Literal, TypedDict, Optional


Expand Down Expand Up @@ -30,6 +31,7 @@ class SolrDocument(TypedDict):
ia_count: Optional[int]
oclc: Optional[list[str]]
isbn: Optional[list[str]]
ebook_access: Optional[Literal['no_ebook', 'unclassified', 'printdisabled', 'borrowable', 'public']]
lcc: Optional[list[str]]
lcc_sort: Optional[str]
ddc: Optional[list[str]]
Expand Down Expand Up @@ -67,3 +69,5 @@ class SolrDocument(TypedDict):
top_work: Optional[str]
top_subjects: Optional[list[str]]
subject_type: Optional[str]

# fmt: on
29 changes: 27 additions & 2 deletions openlibrary/solr/types_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,30 @@ def generate():

if name in OVERRIDES:
python_type = OVERRIDES[name]
else:
elif typ in type_map:
python_type = type_map[typ]
elif (
field_type := solr_schema.find(f".//fieldType[@name='{typ}']")
) is not None:
field_class = field_type.get('class')
if field_class == 'solr.EnumFieldType':
enumsConfigFile = field_type.get('enumsConfig')
enumsConfig = ET.parse(
os.path.join(root, '../../conf/solr/conf/', enumsConfigFile)
)
enum_values = [
el.text
for el in enumsConfig.findall(
f".//enum[@name='{field_type.get('enumName')}']/value"
)
]
python_type = f"Literal[{', '.join(map(repr, enum_values))}]"
else:
raise Exception(f"Unknown field type class {field_class}")
else:
raise Exception(f"Unknown field type {typ}")

if name not in OVERRIDES:
if multivalued:
python_type = f"list[{python_type}]"
if not required:
Expand All @@ -47,11 +69,14 @@ def generate():

body = '\n'.join(python_fields)
python = f"""# This file is auto-generated by types_generator.py
# fmt: off
from typing import Literal, TypedDict, Optional
class SolrDocument(TypedDict):
{body}"""
{body}
# fmt: on"""

return python

Expand Down
79 changes: 48 additions & 31 deletions openlibrary/solr/update_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
import os
import re
from enum import IntEnum
from json import JSONDecodeError
from math import ceil
from statistics import median
Expand Down Expand Up @@ -33,7 +32,7 @@
ExternalDataProvider,
)
from openlibrary.solr.solr_types import SolrDocument
from openlibrary.utils import uniq
from openlibrary.utils import uniq, OrderedEnum
from openlibrary.utils.ddc import normalize_ddc, choose_sorting_ddc
from openlibrary.utils.isbn import opposite_isbn
from openlibrary.utils.lcc import short_lcc_to_sortable_lcc, choose_sorting_lcc
Expand Down Expand Up @@ -311,6 +310,32 @@ def datetimestr_to_int(datestr):
return int(time.mktime(t.timetuple()))


class EbookAccess(OrderedEnum):
    """Ebook access levels, numbered 0 (NO_EBOOK) through 4 (PUBLIC).

    Being an OrderedEnum, members can be compared with <, <=, > and >=
    according to their numeric value.

    Keep in sync with conf/solr/conf/enumsConfig.xml ! The lowercased member
    names are the values listed there, in the same order.
    """

    NO_EBOOK = 0
    UNCLASSIFIED = 1
    PRINTDISABLED = 2
    BORROWABLE = 3
    PUBLIC = 4

    def to_solr_str(self):
        """Return this member's name in lowercase, e.g. ``'no_ebook'``."""
        member_name = self.name
        return member_name.lower()


def get_ia_access_enum(
    collections: set[str],
    access_restricted_item: bool,
) -> EbookAccess:
    """Classify an item's ebook access level.

    Checks are applied in priority order:

    1. 'inlibrary' in collections      -> BORROWABLE
    2. 'printdisabled' in collections  -> PRINTDISABLED
    3. restricted, or no collections   -> UNCLASSIFIED
    4. anything else                   -> PUBLIC

    :param collections: collection names the item belongs to
    :param access_restricted_item: whether the item is flagged as restricted
    """
    if 'inlibrary' in collections:
        return EbookAccess.BORROWABLE

    if 'printdisabled' in collections:
        return EbookAccess.PRINTDISABLED

    # Only unrestricted items that belong to at least one collection are
    # treated as public; everything else is left unclassified.
    if not access_restricted_item and collections:
        return EbookAccess.PUBLIC

    return EbookAccess.UNCLASSIFIED


class SolrProcessor:
"""Processes data to into a form suitable for adding to works solr."""

Expand Down Expand Up @@ -364,7 +389,9 @@ def process_editions(self, w, editions, ia_metadata, identifiers):
e['public_scan'] = ('lendinglibrary' not in collection) and (
'printdisabled' not in collection
)
e['access_restricted_item'] = ia_meta_fields.get('access_restricted_item', False)
e['access_restricted_item'] = ia_meta_fields.get(
'access_restricted_item', False
)

if 'identifiers' in e:
for k, id_list in e['identifiers'].items():
Expand Down Expand Up @@ -696,36 +723,18 @@ def get_ebook_info(
"""
ebook_info: dict[str, Any] = {}

class AvailabilityEnum(IntEnum):
PUBLIC = 1
BORROWABLE = 2
PRINTDISABLED = 3
UNCLASSIFIED = 4

def get_ia_availability_enum(
collections: set[str],
access_restricted_item: bool,
) -> AvailabilityEnum:
if 'inlibrary' in collections:
return AvailabilityEnum.BORROWABLE
elif 'printdisabled' in collections:
return AvailabilityEnum.PRINTDISABLED
elif access_restricted_item or not collections:
return AvailabilityEnum.UNCLASSIFIED
else:
return AvailabilityEnum.PUBLIC

def get_ia_sorting_key(ed: dict) -> tuple[AvailabilityEnum, str]:
def get_ia_sorting_key(ed: dict) -> tuple[int, str]:
ocaid = ed['ocaid'].strip()
md = ia_metadata.get(ocaid)
availability = AvailabilityEnum.UNCLASSIFIED
access = EbookAccess.UNCLASSIFIED
if md is not None:
availability = get_ia_availability_enum(
access = get_ia_access_enum(
md.get('collection', set()),
md.get('access_restricted_item') == "true",
)
return (
availability,
# -1 to sort in reverse and make public first
-1 * access.value,
# De-prioritize google scans because they are lower quality
'0: non-goog' if not ocaid.endswith('goog') else '1: goog',
)
Expand All @@ -734,16 +743,24 @@ def get_ia_sorting_key(ed: dict) -> tuple[AvailabilityEnum, str]:
ebook_info['ia'] = [e['ocaid'].strip() for e in ia_eds]
ebook_info["ebook_count_i"] = len(ia_eds)

# These should always be set, for some reason.
# Default values
ebook_info["has_fulltext"] = False
ebook_info["public_scan_b"] = False
if get_solr_next():
ebook_info["ebook_access"] = EbookAccess.NO_EBOOK.to_solr_str()

if ia_eds:
best_availability = get_ia_sorting_key(ia_eds[0])[0]
best_ed = ia_eds[0]
if best_availability < AvailabilityEnum.UNCLASSIFIED:
best_access = get_ia_access_enum(
best_ed.get('ia_collection', []),
best_ed.get('access_restricted_item') == "true",
)
if get_solr_next():
ebook_info["ebook_access"] = best_access.to_solr_str()

if best_access > EbookAccess.UNCLASSIFIED:
ebook_info["has_fulltext"] = True
if best_availability == AvailabilityEnum.PUBLIC:
if best_access == EbookAccess.PUBLIC:
ebook_info['public_scan_b'] = True

all_collection = sorted(
Expand All @@ -759,7 +776,7 @@ def get_ia_sorting_key(ed: dict) -> tuple[AvailabilityEnum, str]:
if all_collection:
ebook_info['ia_collection_s'] = ';'.join(all_collection)

if best_availability < AvailabilityEnum.PRINTDISABLED:
if best_access > EbookAccess.PRINTDISABLED:
ebook_info['lending_edition_s'] = extract_edition_olid(best_ed['key'])
ebook_info['lending_identifier_s'] = best_ed['ocaid']

Expand Down
24 changes: 24 additions & 0 deletions openlibrary/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Generic utilities"""

from enum import Enum
import re
from subprocess import PIPE, Popen, STDOUT
from typing import TypeVar, Iterable, Literal, Callable, Optional
Expand Down Expand Up @@ -182,3 +183,26 @@ def is_number(s):
def get_software_version() -> str:
    """Return the short git hash of the currently checked-out commit.

    Runs ``git rev-parse --short HEAD --`` and returns its output with
    surrounding whitespace stripped. stderr is merged into stdout, so if git
    reports an error, the error text is what gets returned.
    """
    cmd = ["git", "rev-parse", "--short", "HEAD", "--"]
    # Use Popen as a context manager so the pipe is closed and the child
    # process is waited on, instead of leaking both until garbage collection.
    with Popen(cmd, stdout=PIPE, stderr=STDOUT) as proc:
        output, _ = proc.communicate()
    return output.decode().strip()


# See https://docs.python.org/3/library/enum.html#orderedenum
class OrderedEnum(Enum):
    """Enum whose members can be ordered by their ``value``.

    Ordering is only defined between members of the exact same enum class;
    for any other operand the comparison methods return ``NotImplemented``.
    """

    def __ge__(self, other):
        """Return ``self.value >= other.value`` for members of the same class."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value >= other.value

    def __gt__(self, other):
        """Return ``self.value > other.value`` for members of the same class."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value > other.value

    def __le__(self, other):
        """Return ``self.value <= other.value`` for members of the same class."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value <= other.value

    def __lt__(self, other):
        """Return ``self.value < other.value`` for members of the same class."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        return self.value < other.value

0 comments on commit 7768c98

Please sign in to comment.