Merge pull request #302 from lemon24/296-memory
Memory usage reduction (#296)
kurtmckee authored Jun 24, 2022
2 parents 5fcb3ae + 73dcc55 commit a48e403
Showing 7 changed files with 743 additions and 38 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.rst
@@ -14,6 +14,7 @@ bug report!
* `John Beimler <http://john.beimler.org/>`_
* `Beat Bolli <https://drbeat.li/>`_
* `François Boulogne <http://www.sciunto.org/>`_
* `Adrian Damian <https://death.andgravity.com/>`_
* `Jason Diamond <http://injektilo.org/>`_
* `Jakub Kuczys <https://github.com/jack1142>`_
* `Fazal Majid <https://majid.info/blog/>`_
10 changes: 10 additions & 0 deletions changelog.d/20220410_193326_lemon24_296_memory.rst
@@ -0,0 +1,10 @@
Changed
-------

* Use only a prefix of the feed to detect encodings,
instead of reading the whole feed into memory.
This reduces the memory usage of parse() by up to ~3x (66-73%),
but may result in the wrong encoding being detected in rare cases;
use ``feedparser.parse(optimistic_encoding_detection=False)``
to get the original behavior (reading the whole feed into memory).
(#296, #302)
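
For reference, a minimal usage sketch of the opt-out described above (the feed URL is hypothetical):

    import feedparser

    # Restore the pre-#296 behavior: read the whole feed into memory
    # before detecting its encoding.
    d = feedparser.parse(
        "https://example.com/feed.xml",
        optimistic_encoding_detection=False,
    )
    print(d.encoding, d.bozo)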
5 changes: 5 additions & 0 deletions feedparser/__init__.py
@@ -46,3 +46,8 @@
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1


# If you want feedparser to use only a prefix of the feed to detect encodings
# (uses less memory), set this to 1.
OPTIMISTIC_ENCODING_DETECTION = 1
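
The same default can also be changed globally, mirroring SANITIZE_HTML above; a short sketch (hypothetical URL):

    import feedparser

    # Affects every subsequent parse() call that does not pass
    # optimistic_encoding_detection explicitly.
    feedparser.OPTIMISTIC_ENCODING_DETECTION = 0
    d = feedparser.parse("https://example.com/feed.xml")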
165 changes: 134 additions & 31 deletions feedparser/api.py
@@ -29,13 +29,13 @@
import datetime
import io
import time
from typing import Dict, List, Union
from typing import Dict, List, Union, IO
import urllib.error
import urllib.parse
import xml.sax

from .datetimes import registerDateHandler, _parse_date
from .encodings import convert_to_utf8
from .encodings import convert_file_to_utf8, MissingEncoding
from .html import BaseHTMLProcessor
from . import http
from .mixin import XMLParserMixin
@@ -106,20 +106,42 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
if request_headers is supplied it is a dictionary of HTTP request headers
that will override the values generated by FeedParser.
:return: A bytes object.
:return: A seekable, readable file object.
"""

if hasattr(url_file_stream_or_string, 'read'):
return url_file_stream_or_string.read()

if isinstance(url_file_stream_or_string, str) \
and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
# Some notes on the history of the implementation of _open_resource().
#
# parse() might need to go over the feed content twice:
# if the strict parser fails, it tries again with the loose parser.
#
# In 5.2.0, this returned an open file, to be read() by parse().
# By 6.0.8, this returned bytes directly.
#
# Since #296 (>6.0.8), this once again returns an open file
# (to reduce memory usage, see convert_file_to_utf8() for details).
# However, to accommodate parse() needing the content twice,
# the returned file is guaranteed to be seekable.
# (If the underlying resource is not seekable,
# the content is read and wrapped in an io.BytesIO/StringIO.)

if callable(getattr(url_file_stream_or_string, 'read', None)):
if callable(getattr(url_file_stream_or_string, 'seekable', None)):
if url_file_stream_or_string.seekable():
return url_file_stream_or_string
return _to_in_memory_file(url_file_stream_or_string.read())

looks_like_url = (
isinstance(url_file_stream_or_string, str)
and urllib.parse.urlparse(url_file_stream_or_string)[0]
in ('http', 'https', 'ftp', 'file', 'feed')
)
if looks_like_url:
data = http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
return io.BytesIO(data)

# try to open with native open function (if url_file_stream_or_string is a filename)
try:
with open(url_file_stream_or_string, 'rb') as f:
data = f.read()
return open(url_file_stream_or_string, 'rb')
except (IOError, UnicodeEncodeError, TypeError, ValueError):
# if url_file_stream_or_string is a str object that
# cannot be converted to the encoding returned by
@@ -129,13 +151,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
# (such as an XML document encoded in UTF-32), TypeError will
# be thrown.
pass
else:
return data

# treat url_file_stream_or_string as string
if not isinstance(url_file_stream_or_string, bytes):
return url_file_stream_or_string.encode('utf-8')
return url_file_stream_or_string
# treat url_file_stream_or_string as bytes/string
return _to_in_memory_file(url_file_stream_or_string)


def _to_in_memory_file(data):
if isinstance(data, str):
return io.StringIO(data)
else:
return io.BytesIO(data)


class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
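
The seekability guarantee documented above can be pictured with a small sketch; the file name and the OneShotReader helper are hypothetical, not part of the change:

    import feedparser

    # A seekable file object is used by _open_resource() as-is ...
    with open("feed.xml", "rb") as f:
        d = feedparser.parse(f)

    # ... while an object exposing only read() is read once and buffered
    # in memory via _to_in_memory_file().
    class OneShotReader:
        def __init__(self, data):
            self._data = data

        def read(self, size=-1):
            data, self._data = self._data, b""
            return data

    d = feedparser.parse(OneShotReader(b"<rss version='2.0'><channel/></rss>"))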
@@ -156,6 +181,7 @@ def parse(
response_headers: Dict[str, str] = None,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
optimistic_encoding_detection: bool = None,
) -> FeedParserDict:
"""Parse a feed from a URL, file, stream, or string.
@@ -199,19 +225,18 @@ def parse(
Should feedparser skip HTML sanitization? Only disable this if you know
what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
:param optimistic_encoding_detection:
Should feedparser use only a prefix of the feed to detect encodings
(uses less memory, but the wrong encoding may be detected in rare cases).
Defaults to the value of
:data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.
"""

# Avoid a cyclic import.
if not agent:
import feedparser
agent = feedparser.USER_AGENT
if sanitize_html is None:
import feedparser
sanitize_html = bool(feedparser.SANITIZE_HTML)
if resolve_relative_uris is None:
import feedparser
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)

result = FeedParserDict(
bozo=False,
@@ -221,26 +246,80 @@
)

try:
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
file = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
except urllib.error.URLError as error:
result.update({
'bozo': True,
'bozo_exception': error,
})
return result

if not data:
# at this point, the file is guaranteed to be seekable;
# we read 1 byte/character to see if it's empty and return early
# (this preserves the behavior in 6.0.8)
initial_file_offset = file.tell()
if not file.read(1):
return result
file.seek(initial_file_offset)

# overwrite existing headers using response_headers
result['headers'].update(response_headers or {})

data = convert_to_utf8(result['headers'], data, result)
try:
_parse_file_inplace(
file,
result,
resolve_relative_uris=resolve_relative_uris,
sanitize_html=sanitize_html,
optimistic_encoding_detection=optimistic_encoding_detection,
)
finally:
if not hasattr(url_file_stream_or_string, 'read'):
# the file does not come from the user, close it
file.close()

return result


def _parse_file_inplace(
file: Union[IO[bytes], IO[str]],
result: dict,
*,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
optimistic_encoding_detection: bool = None,
) -> None:

# Avoid a cyclic import.
import feedparser
if sanitize_html is None:
sanitize_html = bool(feedparser.SANITIZE_HTML)
if resolve_relative_uris is None:
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
if optimistic_encoding_detection is None:
optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION)

stream_factory = convert_file_to_utf8(
result['headers'], file, result, optimistic_encoding_detection
)
# We're done with file, all access must happen through stream_factory.
del file

# Some notes about the stream_factory.get_{text,binary}_file() methods:
#
# Calling them a second time will raise io.UnsupportedOperation
# if the underlying file was not seekable.
#
# Calling close() on the returned file is ignored
# (that is, the underlying file is *not* closed),
# because the SAX parser closes the file when done;
# we don't want that, since we might try again with the loose parser.

use_json_parser = result['content-type'] == 'application/json'
use_strict_parser = result['encoding'] and True or False

if not use_json_parser:
result['version'], data, entities = replace_doctype(data)
result['version'], stream_factory.prefix, entities = replace_doctype(stream_factory.prefix)

# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result['headers'].get('content-location', '')
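
The close-is-ignored behavior described in the stream_factory notes above amounts to a proxy along these lines (a sketch only; the actual wrapper lives in convert_file_to_utf8() in encodings.py and may differ):

    class _NonClosingFile:
        """Forward everything to the wrapped file except close()."""

        def __init__(self, file):
            self._file = file

        def __getattr__(self, name):
            return getattr(self._file, name)

        def close(self):
            # Deliberately a no-op: the SAX parser closes its input when
            # done, but the loose parser may still need a second pass.
            pass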
@@ -253,15 +332,18 @@

if not _XML_AVAILABLE:
use_strict_parser = False

feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]

if use_json_parser:
result['version'] = None
feed_parser = JSONParser(baseuri, baselang, 'utf-8')
try:
feed_parser.feed(data)
feed_parser.feed(stream_factory.get_file())
except Exception as e:
result['bozo'] = 1
result['bozo_exception'] = e

elif use_strict_parser:
# Initialize the SAX parser.
feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
@@ -277,7 +359,14 @@
saxparser.setContentHandler(feed_parser)
saxparser.setErrorHandler(feed_parser)
source = xml.sax.xmlreader.InputSource()
source.setByteStream(io.BytesIO(data))

# If an encoding was detected, decode the file on the fly;
# otherwise, pass it as-is and let the SAX parser deal with it.
try:
source.setCharacterStream(stream_factory.get_text_file())
except MissingEncoding:
source.setByteStream(stream_factory.get_binary_file())

try:
saxparser.parse(source)
except xml.sax.SAXException as e:
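
The character-stream-first, byte-stream-fallback pattern above uses the standard xml.sax InputSource API; a self-contained sketch with a trivial handler (not feedparser's):

    import io
    import xml.sax

    class ElementPrinter(xml.sax.ContentHandler):
        def startElement(self, name, attrs):
            print("element:", name)

    source = xml.sax.xmlreader.InputSource()
    # Prefer already-decoded text; fall back to bytes when no encoding is known.
    source.setCharacterStream(io.StringIO("<feed><title>example</title></feed>"))

    parser = xml.sax.make_parser()
    parser.setContentHandler(ElementPrinter())
    parser.parse(source)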
@@ -291,7 +380,22 @@
feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html
feed_parser.feed(data.decode('utf-8', 'replace'))

# If an encoding was detected, use it; otherwise, assume utf-8 and do your best.
# Will raise io.UnsupportedOperation if the underlying file is not seekable.
data = stream_factory.get_text_file('utf-8', 'replace').read()

# As of 6.0.8, LooseFeedParser.feed() can be called exactly once
# with the entire data (it does some re.sub() and str.replace() on it).
#
# SGMLParser (of which LooseFeedParser is a subclass)
# *can* be fed in a streaming fashion,
# by calling feed() repeatedly with chunks of text.
#
# If LooseFeedParser ever supports being fed chunks,
# replace the read() call above with read(size)/feed() calls in a loop.

feed_parser.feed(data)

result['feed'] = feed_parser.feeddata
result['entries'] = feed_parser.entries
@@ -300,4 +404,3 @@
result['namespaces'] = {}
else:
result['namespaces'] = feed_parser.namespaces_in_use
return result
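
One consequence of the close() handling in parse() above: files supplied by the caller are left open, so the caller keeps control of their lifetime. A short usage sketch (hypothetical file name):

    import feedparser

    with open("feed.xml", "rb") as f:
        d = feedparser.parse(f)
        # parse() closes only the files it opened itself; f stays open here.
        assert not f.closed

    print(d.version, len(d.entries))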