From 302b25c829d3381c3a4e6da77b7217c9ee14a9e6 Mon Sep 17 00:00:00 2001 From: "Michael E. Karpeles (Mek)" Date: Tue, 8 Feb 2022 08:55:51 +0000 Subject: [PATCH 1/6] mvp for sorting ia solr field --- openlibrary/solr/update_work.py | 65 +++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index 7924c118407..b5b0f1e6daf 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -369,6 +369,7 @@ def process_editions(self, w, editions, ia_metadata, identifiers): e['public_scan'] = ('lendinglibrary' not in collection) and ( 'printdisabled' not in collection ) + e['access_restricted_item'] = ia_meta_fields.get('access-restricted-item', False) if 'identifiers' in e: for k, id_list in e['identifiers'].items(): @@ -739,53 +740,69 @@ def add(name, value): def add_list(name, values): doc[name] = list(values) + # Q: How do I change what values I have available from Archive.org metadata? + # Because I'm pretty sure we're missing e.g. lending___status etc... + # It looks like it may? Just not computed values + # 1. is in inlibrary? i.e. borrowable + # 2. is it not access-restricted-item? + + borrowable_editions = set() + printdisabled_editions = set() + open_editions = set() + unclassified_editions = set() + + language_editions = {} + pub_goog = set() # google pub_nongoog = set() nonpub_goog = set() nonpub_nongoog = set() - - public_scan = False + all_collection = set() + public_scan = False lending_edition = None in_library_edition = None lending_ia_identifier = None - printdisabled = set() + for e in editions: if 'ocaid' not in e: continue + + assert isinstance(e['ocaid'], str) + ocaid = e['ocaid'].strip() + collections = e.get('ia_collection', []) + all_collection.update(collections) + + if 'inlibrary' in collections: + borrowable_editions.add(ocaid) + elif 'printdisabled' in collections: + printdisabled_editions.add(ocaid) + printdisabled.add(re_edition_key.match(e['key']).group(1)) + elif e.get('public_scan') and not e.get('access_restricted_item', False): + public_scan = True + open_editions.add(ocaid) + else: + unclassified_editions.add(ocaid) + + # partners may still rely on these legacy fields, leave logic unchanged if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []): lending_edition = re_edition_key.match(e['key']).group(1) lending_ia_identifier = e['ocaid'] if not in_library_edition and 'inlibrary' in e.get('ia_collection', []): in_library_edition = re_edition_key.match(e['key']).group(1) lending_ia_identifier = e['ocaid'] - if 'printdisabled' in e.get('ia_collection', []): - printdisabled.add(re_edition_key.match(e['key']).group(1)) - all_collection.update(e.get('ia_collection', [])) - assert isinstance(e['ocaid'], str) - i = e['ocaid'].strip() - if e.get('public_scan'): - public_scan = True - if i.endswith('goog'): - pub_goog.add(i) - else: - pub_nongoog.add(i) - else: - if i.endswith('goog'): - nonpub_goog.add(i) - else: - nonpub_nongoog.add(i) + ia_list = ( - list(pub_nongoog) - + list(pub_goog) - + list(nonpub_nongoog) - + list(nonpub_goog) + list(open_editions) + + list(borrowable_editions) + + list(printdisabled_editions) + + list(unclassified_editions) ) + add_list('ia', ia_list) add("ebook_count_i", len(ia_list)) has_fulltext = any(e.get('ocaid', None) for e in editions) - add_list('ia', ia_list) if has_fulltext: add('public_scan_b', public_scan) if all_collection: From 2ebd3742dcff9c7548b7195924abf7a21198b92c Mon Sep 17 00:00:00 2001 From: Mek Date: Tue, 8 Feb 2022 15:08:43 -0500 Subject: [PATCH 2/6] Update openlibrary/solr/update_work.py --- openlibrary/solr/update_work.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index b5b0f1e6daf..01456df3159 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -778,7 +778,7 @@ def add_list(name, values): elif 'printdisabled' in collections: printdisabled_editions.add(ocaid) printdisabled.add(re_edition_key.match(e['key']).group(1)) - elif e.get('public_scan') and not e.get('access_restricted_item', False): + elif not e.get('access_restricted_item', False): public_scan = True open_editions.add(ocaid) else: From a1bd803d5fe9e5fd6821a568dc283f98038b5e19 Mon Sep 17 00:00:00 2001 From: Mek Date: Tue, 8 Feb 2022 15:11:00 -0500 Subject: [PATCH 3/6] Update openlibrary/solr/update_work.py --- openlibrary/solr/data_provider.py | 2 +- openlibrary/solr/update_work.py | 29 +++++++------ openlibrary/tests/solr/test_update_work.py | 47 ++++++++++++++++++---- 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/openlibrary/solr/data_provider.py b/openlibrary/solr/data_provider.py index baa160db8a7..8d719589377 100644 --- a/openlibrary/solr/data_provider.py +++ b/openlibrary/solr/data_provider.py @@ -22,7 +22,7 @@ logger = logging.getLogger("openlibrary.solr.data_provider") -IA_METADATA_FIELDS = ('identifier', 'boxid', 'collection') +IA_METADATA_FIELDS = ('identifier', 'boxid', 'collection', 'access-restricted-item') OCAID_PATTERN = re.compile(r'^[^\s&#?/]+$') diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index 01456df3159..234e52d1eb0 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -95,6 +95,7 @@ def set_solr_next(val: bool): class IALiteMetadata(TypedDict): boxid: set[str] collection: set[str] + access_restricted_item: str def get_ia_collection_and_box_id(ia: str) -> Optional[IALiteMetadata]: @@ -131,6 +132,7 @@ def get_list(d, key): return { 'boxid': set(get_list(metadata, 'boxid')), 'collection': set(get_list(metadata, 'collection')), + 'access_restricted_item': metadata.get('access-restricted-item'), } @@ -369,7 +371,7 @@ def process_editions(self, w, editions, ia_metadata, identifiers): e['public_scan'] = ('lendinglibrary' not in collection) and ( 'printdisabled' not in collection ) - e['access_restricted_item'] = ia_meta_fields.get('access-restricted-item', False) + e['access_restricted_item'] = ia_meta_fields.get('access_restricted_item', False) if 'identifiers' in e: for k, id_list in e['identifiers'].items(): @@ -725,7 +727,8 @@ def get_last_modified(self, work, editions): datetimestr_to_int(doc.get('last_modified')) for doc in [work] + editions ) - def add_ebook_info(self, doc, editions): + @staticmethod + def add_ebook_info(doc, editions): """ Add ebook information from the editions to the work Solr document. @@ -740,24 +743,18 @@ def add(name, value): def add_list(name, values): doc[name] = list(values) - # Q: How do I change what values I have available from Archive.org metadata? - # Because I'm pretty sure we're missing e.g. lending___status etc... - # It looks like it may? Just not computed values - # 1. is in inlibrary? i.e. borrowable - # 2. is it not access-restricted-item? - borrowable_editions = set() printdisabled_editions = set() open_editions = set() unclassified_editions = set() language_editions = {} - + pub_goog = set() # google pub_nongoog = set() nonpub_goog = set() nonpub_nongoog = set() - + printdisabled = set() all_collection = set() public_scan = False lending_edition = None @@ -772,18 +769,20 @@ def add_list(name, values): ocaid = e['ocaid'].strip() collections = e.get('ia_collection', []) all_collection.update(collections) - + if 'inlibrary' in collections: borrowable_editions.add(ocaid) elif 'printdisabled' in collections: printdisabled_editions.add(ocaid) - printdisabled.add(re_edition_key.match(e['key']).group(1)) - elif not e.get('access_restricted_item', False): + elif e.get('access_restricted_item', False) == "true": + unclassified_editions.add(ocaid) + else: public_scan = True open_editions.add(ocaid) - else: - unclassified_editions.add(ocaid) + # Legacy + if 'printdisabled' in collections: + printdisabled.add(re_edition_key.match(e['key']).group(1)) # partners may still rely on these legacy fields, leave logic unchanged if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []): lending_edition = re_edition_key.match(e['key']).group(1) diff --git a/openlibrary/tests/solr/test_update_work.py b/openlibrary/tests/solr/test_update_work.py index 2714fd94144..a6c00baaf01 100644 --- a/openlibrary/tests/solr/test_update_work.py +++ b/openlibrary/tests/solr/test_update_work.py @@ -3,6 +3,7 @@ from openlibrary.solr import update_work from openlibrary.solr.data_provider import DataProvider from openlibrary.solr.update_work import ( + SolrProcessor, build_data, pick_cover_edition, pick_number_of_pages_median, @@ -252,7 +253,7 @@ async def test_with_one_lending_edition(self): w, key="/books/OL1M", ocaid='foo00bar', - _ia_meta={"collection": ['lendinglibrary', 'americana']}, + _ia_meta={"collection": ['inlibrary', 'americana']}, ), ] ) @@ -262,7 +263,7 @@ async def test_with_one_lending_edition(self): assert 'printdisabled_s' not in d assert d['lending_edition_s'] == 'OL1M' assert d['ia'] == ['foo00bar'] - assert sss(d['ia_collection_s']) == sss("americana;lendinglibrary") + assert sss(d['ia_collection_s']) == sss("americana;inlibrary") assert d['edition_count'] == 1 assert d['ebook_count_i'] == 1 @@ -276,13 +277,13 @@ async def test_with_two_lending_editions(self): w, key="/books/OL1M", ocaid='foo01bar', - _ia_meta={"collection": ['lendinglibrary', 'americana']}, + _ia_meta={"collection": ['inlibrary', 'americana']}, ), make_edition( w, key="/books/OL2M", ocaid='foo02bar', - _ia_meta={"collection": ['lendinglibrary', 'internetarchivebooks']}, + _ia_meta={"collection": ['inlibrary', 'internetarchivebooks']}, ), ] ) @@ -293,7 +294,7 @@ async def test_with_two_lending_editions(self): assert d['lending_edition_s'] == 'OL1M' assert sorted(d['ia']) == ['foo01bar', 'foo02bar'] assert sss(d['ia_collection_s']) == sss( - "lendinglibrary;americana;internetarchivebooks" + "inlibrary;americana;internetarchivebooks" ) assert d['edition_count'] == 2 assert d['ebook_count_i'] == 2 @@ -363,7 +364,7 @@ async def test_with_multiple_editions(self): w, key="/books/OL3M", ocaid='foo01bar', - _ia_meta={"collection": ['lendinglibrary', 'americana']}, + _ia_meta={"collection": ['inlibrary', 'americana']}, ), make_edition( w, @@ -380,7 +381,7 @@ async def test_with_multiple_editions(self): assert d['lending_edition_s'] == 'OL3M' assert sorted(d['ia']) == ['foo00bar', 'foo01bar', 'foo02bar'] assert sss(d['ia_collection_s']) == sss( - "americana;inlibrary;lendinglibrary;printdisabled" + "americana;inlibrary;printdisabled" ) assert d['edition_count'] == 4 @@ -687,3 +688,35 @@ def test_normal_case(self): assert pick_number_of_pages_median(eds) == 122 eds = [{}, {}] + [{'number_of_pages': n} for n in [123, 122, 1]] assert pick_number_of_pages_median(eds) == 122 + +class Test_Sort_Editions_Ocaids: + + def test_sort(self): + doc = {} + editions = [{ + "key": "/books/OL789M", + "ocaid": "ocaid_restricted", + "access_restricted_item": "true", + "ia_collection": [] + }, { + "key": "/books/OL567M", + "ocaid": "ocaid_printdisabled", + "access_restricted_item": "true", + "ia_collection": ["printdisabled"] + }, { + "key": "/books/OL234M", + "ocaid": "ocaid_borrowable", + "access_restricted_item": "true", + "ia_collection": ["inlibrary"] + }, { + "key": "/books/OL123M", + "ocaid": "ocaid_open", + "access_restricted_item": "false", + "ia_collection": [] + }] + SolrProcessor.add_ebook_info(doc, editions) + assert len(doc.get('ia', [])) == 4 + assert doc['ia'][0] == "ocaid_open" + assert doc['ia'][1] == "ocaid_borrowable" + assert doc['ia'][2] == "ocaid_printdisabled" + assert doc['ia'][3] == "ocaid_restricted" From 97f7df91fe703e5eda3ba01368863685fdef820b Mon Sep 17 00:00:00 2001 From: Mek Date: Thu, 17 Mar 2022 11:59:13 -0700 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Drini Cami --- openlibrary/solr/update_work.py | 2 +- openlibrary/tests/solr/test_update_work.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index 234e52d1eb0..afd10e91262 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -95,7 +95,7 @@ def set_solr_next(val: bool): class IALiteMetadata(TypedDict): boxid: set[str] collection: set[str] - access_restricted_item: str + access_restricted_item: Optional[Literal['true', 'false']] def get_ia_collection_and_box_id(ia: str) -> Optional[IALiteMetadata]: diff --git a/openlibrary/tests/solr/test_update_work.py b/openlibrary/tests/solr/test_update_work.py index a6c00baaf01..5d00900cdf2 100644 --- a/openlibrary/tests/solr/test_update_work.py +++ b/openlibrary/tests/solr/test_update_work.py @@ -715,8 +715,9 @@ def test_sort(self): "ia_collection": [] }] SolrProcessor.add_ebook_info(doc, editions) - assert len(doc.get('ia', [])) == 4 - assert doc['ia'][0] == "ocaid_open" - assert doc['ia'][1] == "ocaid_borrowable" - assert doc['ia'][2] == "ocaid_printdisabled" - assert doc['ia'][3] == "ocaid_restricted" + assert doc['ia'] == [ + "ocaid_open", + "ocaid_borrowable", + "ocaid_printdisabled", + "ocaid_restricted" + ] From 8952b38c24b38d4f0c37b294db2ecd849252ecef Mon Sep 17 00:00:00 2001 From: "Michael E. Karpeles (Mek)" Date: Thu, 17 Mar 2022 12:11:54 -0700 Subject: [PATCH 5/6] deprioritizing low quality open google --- openlibrary/solr/update_work.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index afd10e91262..33c2e546f96 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -748,12 +748,6 @@ def add_list(name, values): open_editions = set() unclassified_editions = set() - language_editions = {} - - pub_goog = set() # google - pub_nongoog = set() - nonpub_goog = set() - nonpub_nongoog = set() printdisabled = set() all_collection = set() public_scan = False @@ -792,7 +786,8 @@ def add_list(name, values): lending_ia_identifier = e['ocaid'] ia_list = ( - list(open_editions) + + # deprioritize_low_quality_goog + sorted(list(open_editions), key=lambda ocaid: ocaid.endswith("goog")) + list(borrowable_editions) + list(printdisabled_editions) + list(unclassified_editions) From 6e693544e999c5ad54b77d9aa77629b02cce9ef1 Mon Sep 17 00:00:00 2001 From: "Michael E. Karpeles (Mek)" Date: Thu, 17 Mar 2022 12:26:58 -0700 Subject: [PATCH 6/6] deprioritize darked/broken items --- openlibrary/solr/update_work.py | 2 +- openlibrary/tests/solr/test_update_work.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index 33c2e546f96..a5b81706479 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -768,7 +768,7 @@ def add_list(name, values): borrowable_editions.add(ocaid) elif 'printdisabled' in collections: printdisabled_editions.add(ocaid) - elif e.get('access_restricted_item', False) == "true": + elif e.get('access_restricted_item', False) == "true" or not collections: unclassified_editions.add(ocaid) else: public_scan = True diff --git a/openlibrary/tests/solr/test_update_work.py b/openlibrary/tests/solr/test_update_work.py index 5d00900cdf2..1b6320b1f54 100644 --- a/openlibrary/tests/solr/test_update_work.py +++ b/openlibrary/tests/solr/test_update_work.py @@ -712,7 +712,7 @@ def test_sort(self): "key": "/books/OL123M", "ocaid": "ocaid_open", "access_restricted_item": "false", - "ia_collection": [] + "ia_collection": ["americanlibraries"] }] SolrProcessor.add_ebook_info(doc, editions) assert doc['ia'] == [