From a9fd6a640ccf93e5ff09b243cee53b15707c1f8c Mon Sep 17 00:00:00 2001
From: lemon24
Date: Wed, 26 Jan 2022 00:15:23 +0200
Subject: [PATCH] Update _parse_file_inplace() to use convert_file_to_utf8().

For #296.
---
 feedparser/__init__.py |  5 +++
 feedparser/api.py      | 69 ++++++++++++++++++++++++++++++++++++------
 tests/runtests.py      | 31 +++++++++++++++++++
 3 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/feedparser/__init__.py b/feedparser/__init__.py
index 68c2a424..3f0d3905 100644
--- a/feedparser/__init__.py
+++ b/feedparser/__init__.py
@@ -46,3 +46,8 @@
 # If you want feedparser to automatically sanitize all potentially unsafe
 # HTML content, set this to 1.
 SANITIZE_HTML = 1
+
+
+# If you want feedparser to use only a prefix of the feed to detect encodings
+# (uses less memory), set this to 1.
+OPTIMISTIC_ENCODING_DETECTION = 1
diff --git a/feedparser/api.py b/feedparser/api.py
index d00dc323..c4e6630a 100644
--- a/feedparser/api.py
+++ b/feedparser/api.py
@@ -35,7 +35,7 @@
 import xml.sax
 
 from .datetimes import registerDateHandler, _parse_date
-from .encodings import convert_to_utf8
+from .encodings import convert_file_to_utf8, MissingEncoding
 from .html import BaseHTMLProcessor
 from . import http
 from .mixin import XMLParserMixin
@@ -156,6 +156,7 @@ def parse(
     response_headers: Dict[str, str] = None,
     resolve_relative_uris: bool = None,
     sanitize_html: bool = None,
+    optimistic_encoding_detection: bool = None,
 ) -> FeedParserDict:
     """Parse a feed from a URL, file, stream, or string.
 
@@ -199,6 +200,11 @@
         Should feedparser skip HTML sanitization? Only disable this if you know
         what you are doing! Defaults to the value of
         :data:`feedparser.SANITIZE_HTML`, which is ``True``.
+    :param optimistic_encoding_detection:
+        Should feedparser use only a prefix of the feed to detect encodings
+        (uses less memory, but the wrong encoding may be detected in rare cases).
+        Defaults to the value of
+        :data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.
 
     """
 
@@ -230,7 +236,9 @@
     result['headers'].update(response_headers or {})
 
     # TODO (lemon24): remove this once _open_resource() returns an open file
-    file = io.BytesIO(data)
+    file = io.BytesIO(data) if isinstance(data, bytes) else io.StringIO(data)
+
+    # TODO (lemon24): handle io.UnsupportedOperation raised by seek() attempts
 
     try:
         _parse_file_inplace(
@@ -238,6 +246,7 @@
             result,
             resolve_relative_uris=resolve_relative_uris,
             sanitize_html=sanitize_html,
+            optimistic_encoding_detection=optimistic_encoding_detection,
         )
     finally:
         if not hasattr(url_file_stream_or_string, 'read'):
@@ -253,24 +262,39 @@ def _parse_file_inplace(
     *,
     resolve_relative_uris: bool = None,
     sanitize_html: bool = None,
+    optimistic_encoding_detection: bool = None,
 ) -> None:
-    # TODO (lemon24): remove this once we start using convert_file_to_utf8()
-    data = file.read()
-
     # Avoid a cyclic import.
     import feedparser
 
     if sanitize_html is None:
         sanitize_html = bool(feedparser.SANITIZE_HTML)
     if resolve_relative_uris is None:
         resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
+    if optimistic_encoding_detection is None:
+        optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION)
+
+    stream_factory = convert_file_to_utf8(
+        result['headers'], file, result, optimistic_encoding_detection
+    )
+    # We're done with file, all access must happen through stream_factory.
+    del file
+
+    # Some notes about the stream_factory.get_{text,binary}_file() methods:
+    #
+    # Calling them a second time will raise io.UnsupportedOperation
+    # if the underlying file is not seekable.
+    #
+    # Calling close() on the returned file is ignored
+    # (that is, the underlying file is *not* closed),
+    # because the SAX parser closes the file when done;
+    # we don't want that, since we might try again with the loose parser.
 
-    data = convert_to_utf8(result['headers'], data, result)
     use_json_parser = result['content-type'] == 'application/json'
     use_strict_parser = result['encoding'] and True or False
 
     if not use_json_parser:
-        result['version'], data, entities = replace_doctype(data)
+        result['version'], stream_factory.prefix, entities = replace_doctype(stream_factory.prefix)
 
     # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
     contentloc = result['headers'].get('content-location', '')
@@ -283,15 +307,18 @@
     if not _XML_AVAILABLE:
         use_strict_parser = False
 
+    feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]
+
     if use_json_parser:
         result['version'] = None
         feed_parser = JSONParser(baseuri, baselang, 'utf-8')
         try:
-            feed_parser.feed(io.BytesIO(data))
+            feed_parser.feed(stream_factory.get_file())
         except Exception as e:
             result['bozo'] = 1
             result['bozo_exception'] = e
+
     elif use_strict_parser:
         # Initialize the SAX parser.
         feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
@@ -307,7 +334,14 @@
         saxparser.setContentHandler(feed_parser)
         saxparser.setErrorHandler(feed_parser)
         source = xml.sax.xmlreader.InputSource()
-        source.setByteStream(io.BytesIO(data))
+
+        # If an encoding was detected, decode the file on the fly;
+        # otherwise, pass it as-is and let the SAX parser deal with it.
+        try:
+            source.setCharacterStream(stream_factory.get_text_file())
+        except MissingEncoding:
+            source.setByteStream(stream_factory.get_binary_file())
+
         try:
             saxparser.parse(source)
         except xml.sax.SAXException as e:
@@ -321,7 +355,22 @@
         feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
         feed_parser.resolve_relative_uris = resolve_relative_uris
         feed_parser.sanitize_html = sanitize_html
-        feed_parser.feed(data.decode('utf-8', 'replace'))
+
+        # If an encoding was detected, use it; otherwise, assume utf-8 and do your best.
+        # Will raise io.UnsupportedOperation if the underlying file is not seekable.
+        data = stream_factory.get_text_file('utf-8', 'replace').read()
+
+        # As of 6.0.8, LooseFeedParser.feed() can be called exactly once
+        # with the entire data (it does some re.sub() and str.replace() on it).
+        #
+        # SGMLParser (of which LooseFeedParser is a subclass)
+        # *can* be fed in a streaming fashion,
+        # by calling feed() repeatedly with chunks of text.
+        #
+        # When/if LooseFeedParser supports being fed chunks,
+        # replace the read() call above with read(size)/feed() calls in a loop.
+
+        feed_parser.feed(data)
 
     result['feed'] = feed_parser.feeddata
     result['entries'] = feed_parser.entries
diff --git a/tests/runtests.py b/tests/runtests.py
index 61281b26..7c3cec0b 100644
--- a/tests/runtests.py
+++ b/tests/runtests.py
@@ -1046,6 +1046,37 @@ def test_resolve_relative_uris_off(self):
                              resolve_relative_uris=False)
         self.assertEqual(u'boo', d.entries[1].content[0].value)
 
+    def test_optimistic_encoding_detection(self):
+        length = feedparser.encodings.CONVERT_FILE_PREFIX_LEN
+        digits = '0123456789abcdef😀'
+        description = digits * int(length / len(digits) * 1.5)
+
+        feed_xml = f"""
+            <rss version="2.0">
+                <channel>
+                    <item>
+                        <guid>id</guid>
+                        <description>{description}</description>
+                    </item>
+                </channel>
+            </rss>
+            """
+
+        kwargs_params = {
+            'default': dict(),
+            'on': dict(optimistic_encoding_detection=True),
+            'off': dict(optimistic_encoding_detection=False),
+        }
+        input_params = {
+            'binary_file': lambda: io.BytesIO(feed_xml.encode('utf-8')),
+            'text_file': lambda: io.StringIO(feed_xml),
+        }
+
+        for kwargs_name, kwargs in kwargs_params.items():
+            for input_name, make_input in input_params.items():
+                with self.subTest(f"{kwargs_name} {input_name}"):
+                    d = feedparser.parse(make_input(), **kwargs)
+                    self.assertEqual(d.entries[0].description, description)
 
 class TestSanitizer(unittest.TestCase):
     def test_style_attr_is_enabled(self):
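
Usage sketch (not part of the patch): once this change is applied, prefix-based
encoding detection can be controlled per call through the new parse() keyword
argument, or globally through the module-level flag added in feedparser/__init__.py.
The feed bytes below are a made-up stand-in, not from the test suite.

    import io
    import feedparser

    feed = io.BytesIO(
        b"<rss version='2.0'><channel>"
        b"<item><guid>1</guid><description>hello</description></item>"
        b"</channel></rss>"
    )

    # Per call: read the whole feed before detecting the encoding instead of
    # only a prefix (uses more memory, avoids the rare mis-detection noted
    # in the parse() docstring).
    result = feedparser.parse(feed, optimistic_encoding_detection=False)
    print(result.entries[0].description)

    # Globally: change the default for all subsequent parse() calls.
    feedparser.OPTIMISTIC_ENCODING_DETECTION = 0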