Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sorting ia solr field #6125

Merged
merged 6 commits into from
Mar 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openlibrary/solr/data_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

logger = logging.getLogger("openlibrary.solr.data_provider")

IA_METADATA_FIELDS = ('identifier', 'boxid', 'collection')
IA_METADATA_FIELDS = ('identifier', 'boxid', 'collection', 'access-restricted-item')
OCAID_PATTERN = re.compile(r'^[^\s&#?/]+$')


Expand Down
67 changes: 39 additions & 28 deletions openlibrary/solr/update_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def set_solr_next(val: bool):
class IALiteMetadata(TypedDict):
    """Subset of an Internet Archive item's metadata used when indexing.

    Instances are built by ``get_ia_collection_and_box_id`` from the raw
    metadata record of a single archive.org item.
    """

    # Values of the item's 'boxid' metadata field, de-duplicated.
    boxid: set[str]
    # Values of the item's 'collection' metadata field, de-duplicated.
    collection: set[str]
    # Raw value of the item's 'access-restricted-item' metadata field
    # (note the hyphens upstream); None when the field is absent.
    access_restricted_item: Optional[Literal['true', 'false']]


def get_ia_collection_and_box_id(ia: str) -> Optional[IALiteMetadata]:
Expand Down Expand Up @@ -131,6 +132,7 @@ def get_list(d, key):
return {
'boxid': set(get_list(metadata, 'boxid')),
'collection': set(get_list(metadata, 'collection')),
'access_restricted_item': metadata.get('access-restricted-item'),
}


Expand Down Expand Up @@ -369,6 +371,7 @@ def process_editions(self, w, editions, ia_metadata, identifiers):
e['public_scan'] = ('lendinglibrary' not in collection) and (
'printdisabled' not in collection
)
e['access_restricted_item'] = ia_meta_fields.get('access_restricted_item', False)

if 'identifiers' in e:
for k, id_list in e['identifiers'].items():
Expand Down Expand Up @@ -724,7 +727,8 @@ def get_last_modified(self, work, editions):
datetimestr_to_int(doc.get('last_modified')) for doc in [work] + editions
)

def add_ebook_info(self, doc, editions):
@staticmethod
def add_ebook_info(doc, editions):
"""
Add ebook information from the editions to the work Solr document.

Expand All @@ -739,53 +743,60 @@ def add(name, value):
def add_list(name, values):
doc[name] = list(values)

pub_goog = set() # google
pub_nongoog = set()
nonpub_goog = set()
nonpub_nongoog = set()
borrowable_editions = set()
printdisabled_editions = set()
open_editions = set()
unclassified_editions = set()

public_scan = False
printdisabled = set()
all_collection = set()
public_scan = False
lending_edition = None
in_library_edition = None
lending_ia_identifier = None
printdisabled = set()

for e in editions:
if 'ocaid' not in e:
continue

assert isinstance(e['ocaid'], str)
ocaid = e['ocaid'].strip()
collections = e.get('ia_collection', [])
all_collection.update(collections)

if 'inlibrary' in collections:
borrowable_editions.add(ocaid)
elif 'printdisabled' in collections:
printdisabled_editions.add(ocaid)
elif e.get('access_restricted_item', False) == "true" or not collections:
unclassified_editions.add(ocaid)
else:
public_scan = True
open_editions.add(ocaid)

# Legacy
if 'printdisabled' in collections:
printdisabled.add(re_edition_key.match(e['key']).group(1))
# partners may still rely on these legacy fields, leave logic unchanged
if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []):
lending_edition = re_edition_key.match(e['key']).group(1)
lending_ia_identifier = e['ocaid']
if not in_library_edition and 'inlibrary' in e.get('ia_collection', []):
in_library_edition = re_edition_key.match(e['key']).group(1)
lending_ia_identifier = e['ocaid']
if 'printdisabled' in e.get('ia_collection', []):
printdisabled.add(re_edition_key.match(e['key']).group(1))
all_collection.update(e.get('ia_collection', []))
assert isinstance(e['ocaid'], str)
i = e['ocaid'].strip()
if e.get('public_scan'):
public_scan = True
if i.endswith('goog'):
pub_goog.add(i)
else:
pub_nongoog.add(i)
else:
if i.endswith('goog'):
nonpub_goog.add(i)
else:
nonpub_nongoog.add(i)

ia_list = (
list(pub_nongoog)
+ list(pub_goog)
+ list(nonpub_nongoog)
+ list(nonpub_goog)
# deprioritize_low_quality_goog
sorted(list(open_editions), key=lambda ocaid: ocaid.endswith("goog")) +
list(borrowable_editions) +
list(printdisabled_editions) +
list(unclassified_editions)
)
add_list('ia', ia_list)
add("ebook_count_i", len(ia_list))

has_fulltext = any(e.get('ocaid', None) for e in editions)

add_list('ia', ia_list)
if has_fulltext:
add('public_scan_b', public_scan)
if all_collection:
Expand Down
48 changes: 41 additions & 7 deletions openlibrary/tests/solr/test_update_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from openlibrary.solr import update_work
from openlibrary.solr.data_provider import DataProvider
from openlibrary.solr.update_work import (
SolrProcessor,
build_data,
pick_cover_edition,
pick_number_of_pages_median,
Expand Down Expand Up @@ -252,7 +253,7 @@ async def test_with_one_lending_edition(self):
w,
key="/books/OL1M",
ocaid='foo00bar',
_ia_meta={"collection": ['lendinglibrary', 'americana']},
_ia_meta={"collection": ['inlibrary', 'americana']},
),
]
)
Expand All @@ -262,7 +263,7 @@ async def test_with_one_lending_edition(self):
assert 'printdisabled_s' not in d
assert d['lending_edition_s'] == 'OL1M'
assert d['ia'] == ['foo00bar']
assert sss(d['ia_collection_s']) == sss("americana;lendinglibrary")
assert sss(d['ia_collection_s']) == sss("americana;inlibrary")
assert d['edition_count'] == 1
assert d['ebook_count_i'] == 1

Expand All @@ -276,13 +277,13 @@ async def test_with_two_lending_editions(self):
w,
key="/books/OL1M",
ocaid='foo01bar',
_ia_meta={"collection": ['lendinglibrary', 'americana']},
_ia_meta={"collection": ['inlibrary', 'americana']},
),
make_edition(
w,
key="/books/OL2M",
ocaid='foo02bar',
_ia_meta={"collection": ['lendinglibrary', 'internetarchivebooks']},
_ia_meta={"collection": ['inlibrary', 'internetarchivebooks']},
),
]
)
Expand All @@ -293,7 +294,7 @@ async def test_with_two_lending_editions(self):
assert d['lending_edition_s'] == 'OL1M'
assert sorted(d['ia']) == ['foo01bar', 'foo02bar']
assert sss(d['ia_collection_s']) == sss(
"lendinglibrary;americana;internetarchivebooks"
"inlibrary;americana;internetarchivebooks"
)
assert d['edition_count'] == 2
assert d['ebook_count_i'] == 2
Expand Down Expand Up @@ -363,7 +364,7 @@ async def test_with_multiple_editions(self):
w,
key="/books/OL3M",
ocaid='foo01bar',
_ia_meta={"collection": ['lendinglibrary', 'americana']},
_ia_meta={"collection": ['inlibrary', 'americana']},
),
make_edition(
w,
Expand All @@ -380,7 +381,7 @@ async def test_with_multiple_editions(self):
assert d['lending_edition_s'] == 'OL3M'
assert sorted(d['ia']) == ['foo00bar', 'foo01bar', 'foo02bar']
assert sss(d['ia_collection_s']) == sss(
"americana;inlibrary;lendinglibrary;printdisabled"
"americana;inlibrary;printdisabled"
)

assert d['edition_count'] == 4
Expand Down Expand Up @@ -687,3 +688,36 @@ def test_normal_case(self):
assert pick_number_of_pages_median(eds) == 122
eds = [{}, {}] + [{'number_of_pages': n} for n in [123, 122, 1]]
assert pick_number_of_pages_median(eds) == 122

class Test_Sort_Editions_Ocaids:
    """Ordering of the ``ia`` list written by ``SolrProcessor.add_ebook_info``."""

    def test_sort(self):
        """Expect open, then borrowable, then print-disabled, then restricted."""
        restricted = {
            "key": "/books/OL789M",
            "ocaid": "ocaid_restricted",
            "access_restricted_item": "true",
            "ia_collection": [],
        }
        printdisabled = {
            "key": "/books/OL567M",
            "ocaid": "ocaid_printdisabled",
            "access_restricted_item": "true",
            "ia_collection": ["printdisabled"],
        }
        borrowable = {
            "key": "/books/OL234M",
            "ocaid": "ocaid_borrowable",
            "access_restricted_item": "true",
            "ia_collection": ["inlibrary"],
        }
        open_scan = {
            "key": "/books/OL123M",
            "ocaid": "ocaid_open",
            "access_restricted_item": "false",
            "ia_collection": ["americanlibraries"],
        }

        # The editions are passed in the exact reverse of the expected output
        # order, so merely echoing the input order would fail the assertion.
        work_doc = {}
        SolrProcessor.add_ebook_info(
            work_doc, [restricted, printdisabled, borrowable, open_scan]
        )

        expected_ia = [
            "ocaid_open",
            "ocaid_borrowable",
            "ocaid_printdisabled",
            "ocaid_restricted",
        ]
        assert work_doc['ia'] == expected_ia