Merge pull request #302 from lemon24/296-memory
Memory usage reduction (#296)
kurtmckee authored Jun 24, 2022
2 parents 5fcb3ae + 73dcc55 commit a48e403
Showing 7 changed files with 743 additions and 38 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.rst
@@ -14,6 +14,7 @@ bug report!
* `John Beimler <http://john.beimler.org/>`_
* `Beat Bolli <https://drbeat.li/>`_
* `François Boulogne <http://www.sciunto.org/>`_
* `Adrian Damian <https://death.andgravity.com/>`_
* `Jason Diamond <http://injektilo.org/>`_
* `Jakub Kuczys <https://github.com/jack1142>`_
* `Fazal Majid <https://majid.info/blog/>`_
10 changes: 10 additions & 0 deletions changelog.d/20220410_193326_lemon24_296_memory.rst
@@ -0,0 +1,10 @@
Changed
-------

* Use only a prefix of the feed to detect encodings,
instead of reading the whole feed into memory.
This reduces the memory usage of parse() by up to ~3x (66-73%),
but may result in the wrong encoding being detected in rare cases;
use ``feedparser.parse(optimistic_encoding_detection=False)``
to get the original behavior (reading the whole feed into memory).
(#296, #302)
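
For reference, a minimal usage sketch of the opt-out described above (the feed URL is hypothetical):

    import feedparser

    # Restore the pre-#296 behavior: read the whole feed into memory
    # before detecting its encoding.
    d = feedparser.parse(
        "https://example.com/feed.xml",
        optimistic_encoding_detection=False,
    )
    print(d.encoding, d.bozo)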
5 changes: 5 additions & 0 deletions feedparser/__init__.py
@@ -46,3 +46,8 @@
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1


# If you want feedparser to use only a prefix of the feed to detect encodings
# (uses less memory), set this to 1.
OPTIMISTIC_ENCODING_DETECTION = 1
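
The same default can also be changed globally, mirroring SANITIZE_HTML above; a short sketch (hypothetical URL):

    import feedparser

    # Affects every subsequent parse() call that does not pass
    # optimistic_encoding_detection explicitly.
    feedparser.OPTIMISTIC_ENCODING_DETECTION = 0
    d = feedparser.parse("https://example.com/feed.xml")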
165 changes: 134 additions & 31 deletions feedparser/api.py
@@ -29,13 +29,13 @@
import datetime
import io
import time
from typing import Dict, List, Union
from typing import Dict, List, Union, IO
import urllib.error
import urllib.parse
import xml.sax

from .datetimes import registerDateHandler, _parse_date
from .encodings import convert_to_utf8
from .encodings import convert_file_to_utf8, MissingEncoding
from .html import BaseHTMLProcessor
from . import http
from .mixin import XMLParserMixin
@@ -106,20 +106,42 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
if request_headers is supplied it is a dictionary of HTTP request headers
that will override the values generated by FeedParser.
:return: A bytes object.
:return: A seekable, readable file object.
"""

if hasattr(url_file_stream_or_string, 'read'):
return url_file_stream_or_string.read()

if isinstance(url_file_stream_or_string, str) \
and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
# Some notes on the history of the implementation of _open_resource().
#
# parse() might need to go over the feed content twice:
# if the strict parser fails, it tries again with the loose parser.
#
# In 5.2.0, this returned an open file, to be read() by parse().
# By 6.0.8, this returned bytes directly.
#
# Since #296 (>6.0.8), this once again returns an open file
# (to reduce memory usage, see convert_file_to_utf8() for details).
# However, to accommodate parse() needing the content twice,
# the returned file is guaranteed to be seekable.
# (If the underlying resource is not seekable,
# the content is read and wrapped in an io.BytesIO/StringIO.)

if callable(getattr(url_file_stream_or_string, 'read', None)):
if callable(getattr(url_file_stream_or_string, 'seekable', None)):
if url_file_stream_or_string.seekable():
return url_file_stream_or_string
return _to_in_memory_file(url_file_stream_or_string.read())

looks_like_url = (
isinstance(url_file_stream_or_string, str)
and urllib.parse.urlparse(url_file_stream_or_string)[0]
in ('http', 'https', 'ftp', 'file', 'feed')
)
if looks_like_url:
data = http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
return io.BytesIO(data)

# try to open with native open function (if url_file_stream_or_string is a filename)
try:
with open(url_file_stream_or_string, 'rb') as f:
data = f.read()
return open(url_file_stream_or_string, 'rb')
except (IOError, UnicodeEncodeError, TypeError, ValueError):
# if url_file_stream_or_string is a str object that
# cannot be converted to the encoding returned by
@@ -129,13 +151,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
# (such as an XML document encoded in UTF-32), TypeError will
# be thrown.
pass
else:
return data

# treat url_file_stream_or_string as string
if not isinstance(url_file_stream_or_string, bytes):
return url_file_stream_or_string.encode('utf-8')
return url_file_stream_or_string
# treat url_file_stream_or_string as bytes/string
return _to_in_memory_file(url_file_stream_or_string)


def _to_in_memory_file(data):
if isinstance(data, str):
return io.StringIO(data)
else:
return io.BytesIO(data)


class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
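
The seekability guarantee documented above can be pictured with a small sketch; the file name and the OneShotReader helper are hypothetical, not part of the change:

    import feedparser

    # A seekable file object is used by _open_resource() as-is ...
    with open("feed.xml", "rb") as f:
        d = feedparser.parse(f)

    # ... while an object exposing only read() is read once and buffered
    # in memory via _to_in_memory_file().
    class OneShotReader:
        def __init__(self, data):
            self._data = data

        def read(self, size=-1):
            data, self._data = self._data, b""
            return data

    d = feedparser.parse(OneShotReader(b"<rss version='2.0'><channel/></rss>"))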
@@ -156,6 +181,7 @@ def parse(
response_headers: Dict[str, str] = None,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
optimistic_encoding_detection: bool = None,
) -> FeedParserDict:
"""Parse a feed from a URL, file, stream, or string.
@@ -199,19 +225,18 @@ def parse(
Should feedparser skip HTML sanitization? Only disable this if you know
what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
:param optimistic_encoding_detection:
Should feedparser use only a prefix of the feed to detect encodings
(uses less memory, but the wrong encoding may be detected in rare cases).
Defaults to the value of
:data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.
"""

# Avoid a cyclic import.
if not agent:
import feedparser
agent = feedparser.USER_AGENT
if sanitize_html is None:
import feedparser
sanitize_html = bool(feedparser.SANITIZE_HTML)
if resolve_relative_uris is None:
import feedparser
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)

result = FeedParserDict(
bozo=False,
@@ -221,26 +246,80 @@
)

try:
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
file = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
except urllib.error.URLError as error:
result.update({
'bozo': True,
'bozo_exception': error,
})
return result

if not data:
# at this point, the file is guaranteed to be seekable;
# we read 1 byte/character to see if it's empty and return early
# (this preserves the behavior in 6.0.8)
initial_file_offset = file.tell()
if not file.read(1):
return result
file.seek(initial_file_offset)

# overwrite existing headers using response_headers
result['headers'].update(response_headers or {})

data = convert_to_utf8(result['headers'], data, result)
try:
_parse_file_inplace(
file,
result,
resolve_relative_uris=resolve_relative_uris,
sanitize_html=sanitize_html,
optimistic_encoding_detection=optimistic_encoding_detection,
)
finally:
if not hasattr(url_file_stream_or_string, 'read'):
# the file does not come from the user, close it
file.close()

return result


def _parse_file_inplace(
file: Union[IO[bytes], IO[str]],
result: dict,
*,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
optimistic_encoding_detection: bool = None,
) -> None:

# Avoid a cyclic import.
import feedparser
if sanitize_html is None:
sanitize_html = bool(feedparser.SANITIZE_HTML)
if resolve_relative_uris is None:
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
if optimistic_encoding_detection is None:
optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION)

stream_factory = convert_file_to_utf8(
result['headers'], file, result, optimistic_encoding_detection
)
# We're done with file, all access must happen through stream_factory.
del file

# Some notes about the stream_factory.get_{text,binary}_file() methods:
#
# Calling them a second time will raise io.UnsupportedOperation
# if the underlying file was not seekable.
#
# Calling close() on the returned file is ignored
# (that is, the underlying file is *not* closed),
# because the SAX parser closes the file when done;
# we don't want that, since we might try again with the loose parser.

use_json_parser = result['content-type'] == 'application/json'
use_strict_parser = result['encoding'] and True or False

if not use_json_parser:
result['version'], data, entities = replace_doctype(data)
result['version'], stream_factory.prefix, entities = replace_doctype(stream_factory.prefix)

# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result['headers'].get('content-location', '')
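
The close-is-ignored behavior described in the stream_factory notes above amounts to a proxy along these lines (a sketch only; the actual wrapper lives in convert_file_to_utf8() in encodings.py and may differ):

    class _NonClosingFile:
        """Forward everything to the wrapped file except close()."""

        def __init__(self, file):
            self._file = file

        def __getattr__(self, name):
            return getattr(self._file, name)

        def close(self):
            # Deliberately a no-op: the SAX parser closes its input when
            # done, but the loose parser may still need a second pass.
            pass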
@@ -253,15 +332,18 @@

if not _XML_AVAILABLE:
use_strict_parser = False

feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]

if use_json_parser:
result['version'] = None
feed_parser = JSONParser(baseuri, baselang, 'utf-8')
try:
feed_parser.feed(data)
feed_parser.feed(stream_factory.get_file())
except Exception as e:
result['bozo'] = 1
result['bozo_exception'] = e

elif use_strict_parser:
# Initialize the SAX parser.
feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
@@ -277,7 +359,14 @@
saxparser.setContentHandler(feed_parser)
saxparser.setErrorHandler(feed_parser)
source = xml.sax.xmlreader.InputSource()
source.setByteStream(io.BytesIO(data))

# If an encoding was detected, decode the file on the fly;
# otherwise, pass it as-is and let the SAX parser deal with it.
try:
source.setCharacterStream(stream_factory.get_text_file())
except MissingEncoding:
source.setByteStream(stream_factory.get_binary_file())

try:
saxparser.parse(source)
except xml.sax.SAXException as e:
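
The character-stream-first, byte-stream-fallback pattern above uses the standard xml.sax InputSource API; a self-contained sketch with a trivial handler (not feedparser's):

    import io
    import xml.sax

    class ElementPrinter(xml.sax.ContentHandler):
        def startElement(self, name, attrs):
            print("element:", name)

    source = xml.sax.xmlreader.InputSource()
    # Prefer already-decoded text; fall back to bytes when no encoding is known.
    source.setCharacterStream(io.StringIO("<feed><title>example</title></feed>"))

    parser = xml.sax.make_parser()
    parser.setContentHandler(ElementPrinter())
    parser.parse(source)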
@@ -291,7 +380,22 @@
feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html
feed_parser.feed(data.decode('utf-8', 'replace'))

# If an encoding was detected, use it; otherwise, assume utf-8 and do your best.
# Will raise io.UnsupportedOperation if the underlying file is not seekable.
data = stream_factory.get_text_file('utf-8', 'replace').read()

# As of 6.0.8, LooseFeedParser.feed() can be called exactly once
# with the entire data (it does some re.sub() and str.replace() on it).
#
# SGMLParser (of which LooseFeedParser is a subclass)
# *can* be fed in a streaming fashion,
# by calling feed() repeatedly with chunks of text.
#
# If LooseFeedParser ever supports being fed chunks,
# replace the read() call above with read(size)/feed() calls in a loop.

feed_parser.feed(data)

result['feed'] = feed_parser.feeddata
result['entries'] = feed_parser.entries
@@ -300,4 +404,3 @@
result['namespaces'] = {}
else:
result['namespaces'] = feed_parser.namespaces_in_use
return result
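
One consequence of the close() handling in parse() above: files supplied by the caller are left open, so the caller keeps control of their lifetime. A short usage sketch (hypothetical file name):

    import feedparser

    with open("feed.xml", "rb") as f:
        d = feedparser.parse(f)
        # parse() closes only the files it opened itself; f stays open here.
        assert not f.closed

    print(d.version, len(d.entries))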