From 5b01750393b2a0bddf6366b6c553a6178ff15033 Mon Sep 17 00:00:00 2001 From: Alex McKee Date: Thu, 6 Jan 2022 20:36:57 +0000 Subject: [PATCH 1/2] Use SolrProcessor.get_pub_year to get edition years as get_pub_year already supports ISO dates. Update test as ISO date now supported. Filter out None values from pub_years set. --- openlibrary/solr/update_work.py | 3 ++- openlibrary/tests/solr/test_update_work.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index cd2a9cb687d..8955acbd824 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -595,8 +595,9 @@ def add_list(name, values): pub_dates = {e[k] for e in editions if e.get(k)} add_list(k, pub_dates) pub_years = { - m.group(1) for m in (re_year.search(date) for date in pub_dates) if m + self.get_pub_year(e) for e in editions } + pub_years = pub_years - {None,} if pub_years: add_list('publish_year', pub_years) add('first_publish_year', min(int(y) for y in pub_years)) diff --git a/openlibrary/tests/solr/test_update_work.py b/openlibrary/tests/solr/test_update_work.py index b86dfffd4f5..f6521668255 100644 --- a/openlibrary/tests/solr/test_update_work.py +++ b/openlibrary/tests/solr/test_update_work.py @@ -164,7 +164,7 @@ def test_publish_year(self): test_dates = [ "2000", "Another 2000", - "2001-01-02", # Doesn't seems to be handling this case + "2001-01-02", # ISO 8601 formatted dates now supported "01-02-2003", "Jan 2002", "Bad date 12", @@ -175,7 +175,7 @@ def test_publish_year(self): ) d = build_data(work) - assert sorted(d['publish_year']) == ["2000", "2002", "2003"] + assert sorted(d['publish_year']) == ["2000", "2001", "2002", "2003"] assert d["first_publish_year"] == 2000 def test_isbns(self): From b7214e72a2ee4c3029da8143e0f668bfe1f07de3 Mon Sep 17 00:00:00 2001 From: Drini Cami Date: Tue, 8 Mar 2022 17:19:27 -0500 Subject: [PATCH 2/2] Make solr a bit more lenient with date formats --- openlibrary/solr/update_work.py | 6 +----- openlibrary/tests/solr/test_update_work.py | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/openlibrary/solr/update_work.py b/openlibrary/solr/update_work.py index 8955acbd824..c73eedc7b38 100644 --- a/openlibrary/solr/update_work.py +++ b/openlibrary/solr/update_work.py @@ -43,9 +43,8 @@ re_author_key = re.compile(r'^/(?:a|authors)/(OL\d+A)') re_bad_char = re.compile('[\x01\x0b\x1a-\x1e]') re_edition_key = re.compile(r"/books/([^/]+)") -re_iso_date = re.compile(r'^(\d{4})-\d\d-\d\d$') re_solr_field = re.compile(r'^[-\w]+$', re.U) -re_year = re.compile(r'(\d{4})$') +re_year = re.compile(r'\b(\d{4})\b') # This will be set to a data provider; have faith, mypy! data_provider = cast(DataProvider, None) @@ -491,9 +490,6 @@ def get_pub_year(self, e): """ pub_date = e.get('publish_date', None) if pub_date: - m = re_iso_date.match(pub_date) - if m: - return m.group(1) m = re_year.search(pub_date) if m: return m.group(1) diff --git a/openlibrary/tests/solr/test_update_work.py b/openlibrary/tests/solr/test_update_work.py index f6521668255..21593748903 100644 --- a/openlibrary/tests/solr/test_update_work.py +++ b/openlibrary/tests/solr/test_update_work.py @@ -166,8 +166,10 @@ def test_publish_year(self): "Another 2000", "2001-01-02", # ISO 8601 formatted dates now supported "01-02-2003", + "2004 May 23", "Jan 2002", "Bad date 12", + "Bad date 123412314", ] work = make_work() update_work.data_provider = FakeDataProvider( @@ -175,7 +177,7 @@ def test_publish_year(self): ) d = build_data(work) - assert sorted(d['publish_year']) == ["2000", "2001", "2002", "2003"] + assert sorted(d['publish_year']) == ["2000", "2001", "2002", "2003", "2004"] assert d["first_publish_year"] == 2000 def test_isbns(self):