Skip to content

Commit

Permalink
Only read the feed in memory if the underlying resource is not seekable.
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Jan 28, 2022
1 parent 4503fa8 commit 00b29e4
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 27 deletions.
69 changes: 47 additions & 22 deletions feedparser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,20 +106,42 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
if request_headers is supplied it is a dictionary of HTTP request headers
that will override the values generated by FeedParser.
:return: A bytes object.
:return: A seekable, readable file object.
"""

if hasattr(url_file_stream_or_string, 'read'):
return url_file_stream_or_string.read()
# Some notes on the history of the implementation of _open_resource().
#
# parse() might need to go over the feed content twice:
# if the strict parser fails, it tries again with the loose parser.
#
# In 5.2.0, this returned an open file, to be read() by parse().
# By 6.0.8, this returned bytes directly.
#
# Since #296 (>6.0.8), this once again returns an open file
# (to reduce memory usage, see convert_file_to_utf8() for details).
# However, to accommodate parse() needing the content twice,
# the returned file is guaranteed to be seekable.
# (If the underlying resource is not seekable,
# the content is read and wrapped in an io.BytesIO/StringIO.)

if isinstance(url_file_stream_or_string, str) \
and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
if hasattr(url_file_stream_or_string, 'read'):
if hasattr(url_file_stream_or_string, 'seekable'):
if url_file_stream_or_string.seekable():
return url_file_stream_or_string
return _to_in_memory_file(url_file_stream_or_string.read())

looks_like_url = (
isinstance(url_file_stream_or_string, str)
and urllib.parse.urlparse(url_file_stream_or_string)[0]
in ('http', 'https', 'ftp', 'file', 'feed')
)
if looks_like_url:
data = http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
return io.BytesIO(data)

# try to open with native open function (if url_file_stream_or_string is a filename)
try:
with open(url_file_stream_or_string, 'rb') as f:
data = f.read()
return open(url_file_stream_or_string, 'rb')
except (IOError, UnicodeEncodeError, TypeError, ValueError):
# if url_file_stream_or_string is a str object that
# cannot be converted to the encoding returned by
Expand All @@ -129,13 +151,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
# (such as an XML document encoded in UTF-32), TypeError will
# be thrown.
pass
else:
return data

# treat url_file_stream_or_string as string
if not isinstance(url_file_stream_or_string, bytes):
return url_file_stream_or_string.encode('utf-8')
return url_file_stream_or_string
# treat url_file_stream_or_string as bytes/string
return _to_in_memory_file(url_file_stream_or_string)


def _to_in_memory_file(data):
    """Wrap in-memory feed content in a seekable, readable file object.

    :param data: The feed content, either a str or a bytes-like object.
    :return: An io.StringIO if data is a str, an io.BytesIO otherwise.
    """
    # Text and binary content need different wrappers, so that read()
    # gives back the same type that was passed in.
    if isinstance(data, str):
        return io.StringIO(data)
    return io.BytesIO(data)


class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
Expand Down Expand Up @@ -221,25 +246,25 @@ def parse(
)

try:
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
file = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
except urllib.error.URLError as error:
result.update({
'bozo': True,
'bozo_exception': error,
})
return result

if not data:
# at this point, the file is guaranteed to be seekable;
# we read 1 byte/character to see if it's empty and return early
# (this preserves the behavior in 6.0.8)
initial_file_offset = file.tell()
if not file.read(1):
return result
file.seek(initial_file_offset)

# overwrite existing headers using response_headers
result['headers'].update(response_headers or {})

# TODO (lemon24): remove this once _open_resource() returns an open file
file = io.BytesIO(data)

# TODO (lemon24): handle io.UnsupportedOperation raised by seek() attempts

try:
_parse_file_inplace(
file,
Expand All @@ -257,7 +282,7 @@ def parse(


def _parse_file_inplace(
file: IO[bytes],
file: Union[IO[bytes], IO[str]],
result: dict,
*,
resolve_relative_uris: bool = None,
Expand Down
10 changes: 5 additions & 5 deletions tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ class TestOpenResource(unittest.TestCase):
"""Ensure that `_open_resource()` interprets its arguments as URIs, file-like objects, or in-memory feeds as expected"""

def test_fileobj(self):
r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {})
r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}).read()
self.assertEqual(r, b'')

def test_feed(self):
Expand All @@ -489,22 +489,22 @@ def test_feed_http(self):

def test_bytes(self):
s = b'<feed><item><title>text</title></item></feed>'
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)

def test_string(self):
s = b'<feed><item><title>text</title></item></feed>'
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)

def test_unicode_1(self):
s = b'<feed><item><title>text</title></item></feed>'
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)

def test_unicode_2(self):
s = br'<feed><item><title>t\u00e9xt</title></item></feed>'
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)

def test_http_client_ascii_unicode_encode_error(self):
Expand Down

0 comments on commit 00b29e4

Please sign in to comment.