From 02792d44253c54be9ae4280f0aa4e7bb1a72c804 Mon Sep 17 00:00:00 2001 From: Darrel Herbst Date: Mon, 1 Feb 2021 08:01:29 -0500 Subject: [PATCH] Refactor get_ia.py to use requests instead of urllib.urlopen (#4436) * Move from urllib to requests for 2852 * Use string instead file object. * Use named headers in call for documentation. * Use the raw HTTPRequest when you need to read the response like a file. * Use bytes if reading binary. * Return bytes when needed. * Remove TODO, and mention where this is called. * Correct to use .text for string comparison. --- openlibrary/catalog/get_ia.py | 39 ++++++++++++------------ openlibrary/catalog/marc/marc_subject.py | 4 +-- openlibrary/tests/catalog/test_get_ia.py | 11 ++++++- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/openlibrary/catalog/get_ia.py b/openlibrary/catalog/get_ia.py index c6182778a60c..008a2cef1911 100644 --- a/openlibrary/catalog/get_ia.py +++ b/openlibrary/catalog/get_ia.py @@ -6,7 +6,7 @@ from deprecated import deprecated from infogami import config from lxml import etree -from six.moves import urllib +import requests from time import sleep from openlibrary.catalog.marc.marc_binary import MarcBinary @@ -26,16 +26,17 @@ class NoMARCXML(IOError): pass -def urlopen_keep_trying(url): +# This function is called in openlibrary/catalog/marc/marc_subject.py as well as this file. +def urlopen_keep_trying(url, headers=None, **kwargs): + """Tries to request the url three times, raises HTTPError if 403, 404, or 416. Returns a requests.Response""" for i in range(3): try: - f = urllib.request.urlopen(url) - return f - except urllib.error.HTTPError as error: - if error.code in (403, 404, 416): + resp = requests.get(url, headers=headers, **kwargs) + resp.raise_for_status() + return resp + except requests.HTTPError as error: + if error.response.status_code in (403, 404, 416): raise - except urllib.error.URLError: - pass sleep(2) @@ -46,7 +47,7 @@ def bad_ia_xml(identifier): # need to handle 404s: # http://www.archive.org/details/index1858mary loc = "{0}/{0}_marc.xml".format(identifier) - return '