# This function is called in openlibrary/catalog/marc/marc_subject.py as well as this file.
def urlopen_keep_trying(url, headers=None, **kwargs):
    """Fetch ``url`` with ``requests.get``, making up to three attempts.

    :param url: URL to request.
    :param headers: optional mapping of HTTP headers passed to ``requests.get``.
    :param kwargs: any further keyword arguments are forwarded unchanged to
        ``requests.get``.
    :return: the ``requests.Response`` of the first attempt whose status
        passes ``raise_for_status()``, or ``None`` when every attempt failed
        with a retryable error.
    :raises requests.HTTPError: immediately, without further attempts, when
        the server answers 403, 404 or 416.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = requests.get(url, headers=headers, **kwargs)
            resp.raise_for_status()
            return resp
        except requests.HTTPError as error:
            # Compare against None explicitly: a Response for a 4xx/5xx
            # status is falsy, so a plain truthiness test would be wrong.
            if error.response is not None and error.response.status_code in (
                403,
                404,
                416,
            ):
                # These statuses will not change on a retry.
                raise
        except (requests.ConnectionError, requests.Timeout):
            # Transport-level failures are treated as transient and retried,
            # as the earlier urllib-based version did for URLError.
            pass
        # Pause between attempts only; sleeping after the final attempt
        # would just delay reporting the failure.
        if attempt < max_attempts - 1:
            sleep(2)
    # All attempts exhausted: keep the historical contract of returning None.
    return None