diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 7c9077f3..75fc9a9c 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -14,6 +14,7 @@ bug report! * `John Beimler `_ * `Beat Bolli `_ * `Franรงois Boulogne `_ +* `Adrian Damian `_ * `Jason Diamond `_ * `Jakub Kuczys `_ * `Fazal Majid `_ diff --git a/changelog.d/20220410_193326_lemon24_296_memory.rst b/changelog.d/20220410_193326_lemon24_296_memory.rst new file mode 100644 index 00000000..708c38e0 --- /dev/null +++ b/changelog.d/20220410_193326_lemon24_296_memory.rst @@ -0,0 +1,10 @@ +Changed +------- + +* Use only a prefix of the feed to detect encodings, + instead of reading the whole feed in memory. + This reduces the memory usage of parse() by up to ~3x (66-73%), + but may result in the wrong encoding being detected in rare cases; + use ``feedparser.parse(optimistic_encoding_detection=False)`` + to get the original behavior (read the whole feed in memory). + (#296, #302) diff --git a/feedparser/__init__.py b/feedparser/__init__.py index 1e8877c0..0f9b0f71 100644 --- a/feedparser/__init__.py +++ b/feedparser/__init__.py @@ -46,3 +46,8 @@ # If you want feedparser to automatically sanitize all potentially unsafe # HTML content, set this to 1. SANITIZE_HTML = 1 + + +# If you want feedparser to use only a prefix of the feed to detect encodings +# (uses less memory), set this to 1. +OPTIMISTIC_ENCODING_DETECTION = 1 diff --git a/feedparser/api.py b/feedparser/api.py index 69c293da..c0260670 100644 --- a/feedparser/api.py +++ b/feedparser/api.py @@ -29,13 +29,13 @@ import datetime import io import time -from typing import Dict, List, Union +from typing import Dict, List, Union, IO import urllib.error import urllib.parse import xml.sax from .datetimes import registerDateHandler, _parse_date -from .encodings import convert_to_utf8 +from .encodings import convert_file_to_utf8, MissingEncoding from .html import BaseHTMLProcessor from . import http from .mixin import XMLParserMixin @@ -106,20 +106,42 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h if request_headers is supplied it is a dictionary of HTTP request headers that will override the values generated by FeedParser. - :return: A bytes object. + :return: A seekable, readable file object. """ - if hasattr(url_file_stream_or_string, 'read'): - return url_file_stream_or_string.read() - - if isinstance(url_file_stream_or_string, str) \ - and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): - return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + # Some notes on the history of the implementation of _open_resource(). + # + # parse() might need to go over the feed content twice: + # if the strict parser fails, it tries again with the loose parser. + # + # In 5.2.0, this returned an open file, to be read() by parse(). + # By 6.0.8, this returned bytes directly. + # + # Since #296 (>6.0.8), this once again returns an open file + # (to reduce memory usage, see convert_file_to_utf8() for details). + # However, to accommodate parse() needing the content twice, + # the returned file is guaranteed to be seekable. + # (If the underlying resource is not seekable, + # the content is read and wrapped in a io.BytesIO/StringIO.) 
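For callers, nothing changes in what parse() accepts: URLs, strings, bytes, and open files all still work, and a stream that cannot be rewound is simply buffered as described in the comment above. A rough sketch (the feed string is made up for illustration):

```python
import io
import feedparser

feed = "<rss version='2.0'><channel><title>t</title></channel></rss>"

feedparser.parse(feed)                               # str in memory
feedparser.parse(feed.encode('utf-8'))               # bytes in memory
feedparser.parse(io.BytesIO(feed.encode('utf-8')))   # seekable stream, used as-is
# A stream that cannot be rewound (e.g. a socket-like object) is read once
# and wrapped in io.BytesIO/io.StringIO before parsing.
```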
+ + if callable(getattr(url_file_stream_or_string, 'read', None)): + if callable(getattr(url_file_stream_or_string, 'seekable', None)): + if url_file_stream_or_string.seekable(): + return url_file_stream_or_string + return _to_in_memory_file(url_file_stream_or_string.read()) + + looks_like_url = ( + isinstance(url_file_stream_or_string, str) + and urllib.parse.urlparse(url_file_stream_or_string)[0] + in ('http', 'https', 'ftp', 'file', 'feed') + ) + if looks_like_url: + data = http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + return io.BytesIO(data) # try to open with native open function (if url_file_stream_or_string is a filename) try: - with open(url_file_stream_or_string, 'rb') as f: - data = f.read() + return open(url_file_stream_or_string, 'rb') except (IOError, UnicodeEncodeError, TypeError, ValueError): # if url_file_stream_or_string is a str object that # cannot be converted to the encoding returned by @@ -129,13 +151,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h # (such as an XML document encoded in UTF-32), TypeError will # be thrown. pass - else: - return data - # treat url_file_stream_or_string as string - if not isinstance(url_file_stream_or_string, bytes): - return url_file_stream_or_string.encode('utf-8') - return url_file_stream_or_string + # treat url_file_stream_or_string as bytes/string + return _to_in_memory_file(url_file_stream_or_string) + + +def _to_in_memory_file(data): + if isinstance(data, str): + return io.StringIO(data) + else: + return io.BytesIO(data) class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor): @@ -156,6 +181,7 @@ def parse( response_headers: Dict[str, str] = None, resolve_relative_uris: bool = None, sanitize_html: bool = None, + optimistic_encoding_detection: bool = None, ) -> FeedParserDict: """Parse a feed from a URL, file, stream, or string. @@ -199,6 +225,11 @@ def parse( Should feedparser skip HTML sanitization? Only disable this if you know what you are doing! Defaults to the value of :data:`feedparser.SANITIZE_HTML`, which is ``True``. + :param optimistic_encoding_detection: + Should feedparser use only a prefix of the feed to detect encodings + (uses less memory, but the wrong encoding may be detected in rare cases). + Defaults to the value of + :data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``. 
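A short usage sketch of the new keyword argument (the feed URL is only illustrative):

```python
import feedparser

# Default: detect the encoding from a prefix of the feed (lower memory use).
d = feedparser.parse('http://example.org/feed.xml')

# Opt out for a single call and read the whole feed into memory for
# encoding detection, as before this change:
d = feedparser.parse('http://example.org/feed.xml',
                     optimistic_encoding_detection=False)

# Or change the module-level default added in feedparser/__init__.py:
feedparser.OPTIMISTIC_ENCODING_DETECTION = 0
```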
""" @@ -206,12 +237,6 @@ def parse( if not agent: import feedparser agent = feedparser.USER_AGENT - if sanitize_html is None: - import feedparser - sanitize_html = bool(feedparser.SANITIZE_HTML) - if resolve_relative_uris is None: - import feedparser - resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS) result = FeedParserDict( bozo=False, @@ -221,7 +246,7 @@ def parse( ) try: - data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + file = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) except urllib.error.URLError as error: result.update({ 'bozo': True, @@ -229,18 +254,72 @@ def parse( }) return result - if not data: + # at this point, the file is guaranteed to be seekable; + # we read 1 byte/character to see if it's empty and return early + # (this preserves the behavior in 6.0.8) + initial_file_offset = file.tell() + if not file.read(1): return result + file.seek(initial_file_offset) # overwrite existing headers using response_headers result['headers'].update(response_headers or {}) - data = convert_to_utf8(result['headers'], data, result) + try: + _parse_file_inplace( + file, + result, + resolve_relative_uris=resolve_relative_uris, + sanitize_html=sanitize_html, + optimistic_encoding_detection=optimistic_encoding_detection, + ) + finally: + if not hasattr(url_file_stream_or_string, 'read'): + # the file does not come from the user, close it + file.close() + + return result + + +def _parse_file_inplace( + file: Union[IO[bytes], IO[str]], + result: dict, + *, + resolve_relative_uris: bool = None, + sanitize_html: bool = None, + optimistic_encoding_detection: bool = None, +) -> None: + + # Avoid a cyclic import. + import feedparser + if sanitize_html is None: + sanitize_html = bool(feedparser.SANITIZE_HTML) + if resolve_relative_uris is None: + resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS) + if optimistic_encoding_detection is None: + optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION) + + stream_factory = convert_file_to_utf8( + result['headers'], file, result, optimistic_encoding_detection + ) + # We're done with file, all access must happen through stream_factory. + del file + + # Some notes about the stream_factory.get_{text,binary}_file() methods: + # + # Calling them a second time will raise io.UnsupportedOperation + # if the underlying file was not seekable. + # + # Calling close() on the returned file is ignored + # (that is, the underlying file is *not* closed), + # because the SAX parser closes the file when done; + # we don't want that, since we might try again with the loose parser. + use_json_parser = result['content-type'] == 'application/json' use_strict_parser = result['encoding'] and True or False if not use_json_parser: - result['version'], data, entities = replace_doctype(data) + result['version'], stream_factory.prefix, entities = replace_doctype(stream_factory.prefix) # Ensure that baseuri is an absolute URI using an acceptable URI scheme. 
contentloc = result['headers'].get('content-location', '') @@ -253,15 +332,18 @@ def parse( if not _XML_AVAILABLE: use_strict_parser = False + feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser] + if use_json_parser: result['version'] = None feed_parser = JSONParser(baseuri, baselang, 'utf-8') try: - feed_parser.feed(data) + feed_parser.feed(stream_factory.get_file()) except Exception as e: result['bozo'] = 1 result['bozo_exception'] = e + elif use_strict_parser: # Initialize the SAX parser. feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8') @@ -277,7 +359,14 @@ def parse( saxparser.setContentHandler(feed_parser) saxparser.setErrorHandler(feed_parser) source = xml.sax.xmlreader.InputSource() - source.setByteStream(io.BytesIO(data)) + + # If an encoding was detected, decode the file on the fly; + # otherwise, pass it as-is and let the SAX parser deal with it. + try: + source.setCharacterStream(stream_factory.get_text_file()) + except MissingEncoding: + source.setByteStream(stream_factory.get_binary_file()) + try: saxparser.parse(source) except xml.sax.SAXException as e: @@ -291,7 +380,22 @@ def parse( feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) feed_parser.resolve_relative_uris = resolve_relative_uris feed_parser.sanitize_html = sanitize_html - feed_parser.feed(data.decode('utf-8', 'replace')) + + # If an encoding was detected, use it; otherwise, assume utf-8 and do your best. + # Will raise io.UnsupportedOperation if the underlying file is not seekable. + data = stream_factory.get_text_file('utf-8', 'replace').read() + + # As of 6.0.8, LooseFeedParser.feed() can be called exactly once + # with the entire data (it does some re.sub() and str.replace() on it). + # + # SGMLParser (of which LooseFeedParser is a subclass) + # *can* be fed in a streaming fashion, + # by calling feed() repeatedly with chunks of text. + # + # When/if LooseFeedParser will support being fed chunks, + # replace the read() call above with read(size)/feed() calls in a loop. + + feed_parser.feed(data) result['feed'] = feed_parser.feeddata result['entries'] = feed_parser.entries @@ -300,4 +404,3 @@ def parse( result['namespaces'] = {} else: result['namespaces'] = feed_parser.namespaces_in_use - return result diff --git a/feedparser/encodings.py b/feedparser/encodings.py index 73251fc1..9497877b 100644 --- a/feedparser/encodings.py +++ b/feedparser/encodings.py @@ -27,6 +27,7 @@ # POSSIBILITY OF SUCH DAMAGE. import codecs +import io import re import typing as t @@ -312,3 +313,312 @@ def convert_to_utf8(http_headers, data, result): result['bozo'] = True result['bozo_exception'] = error return data + + +# How much to read from a binary file in order to detect encoding. +# In inital tests, 4k was enough for ~160 mostly-English feeds; +# 64k seems like a safe margin. +CONVERT_FILE_PREFIX_LEN = 2 ** 16 + +# How much to read from a text file, and use as an utf-8 bytes prefix. +# Note that no encoding detection is needed in this case. +CONVERT_FILE_STR_PREFIX_LEN = 2 ** 13 + +CONVERT_FILE_TEST_CHUNK_LEN = 2 ** 16 + + +def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True): + """Like convert_to_utf8(), but for a stream. + + Unlike convert_to_utf8(), do not read the the entire file in memory; + instead, return a text stream that decodes it on the fly. + This should consume significantly less memory, + because it avoids (repeatedly) converting the entire file contents + from bytes to str and back. 
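The on-the-fly decoding mentioned here is essentially what codecs.getreader() provides (see StreamFactory.get_text_file() below); a minimal standalone illustration, unrelated to any specific feed:

```python
import codecs
import io

raw = io.BytesIO('<title>Café</title>'.encode('iso-8859-1'))
reader = codecs.getreader('iso-8859-1')(raw, 'strict')

# Bytes are decoded as they are consumed, without materializing a second
# full copy of the document as str.
first = reader.read(7)   # decodes only what has been read so far
rest = reader.read()     # '<title>Café</title>' == first + rest
```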
+ + To detect the encoding, only a prefix of the file contents is used. + In rare cases, the wrong encoding may be detected for this prefix; + use optimistic_encoding_detection=False to use the entire file contents + (equivalent to a plain convert_to_utf8() call). + + Args: + http_headers (dict): The response headers. + file (IO[bytes] or IO[str]): A read()-able (binary) stream. + result (dict): The result dictionary. + optimistic_encoding_detection (bool): + If true, use only a prefix of the file content to detect encoding. + + Returns: + StreamFactory: a stream factory, with the detected encoding set, if any + + """ + # Currently, this wraps convert_to_utf8(), because the logic is simply + # too complicated to ensure it's re-implemented correctly for a stream. + # That said, it should be possible to change the implementation + # transparently (not sure it's worth it, though). + + # If file is a text stream, we don't need to detect encoding; + # we still need a bytes prefix to run functions on for side effects: + # convert_to_utf8() to sniff / set result['content-type'], and + # replace_doctype() to extract safe_entities. + + if isinstance(file.read(0), str): + prefix = file.read(CONVERT_FILE_STR_PREFIX_LEN).encode('utf-8') + prefix = convert_to_utf8(http_headers, prefix, result) + result['encoding'] = 'utf-8' + return StreamFactory(prefix, file, 'utf-8') + + if optimistic_encoding_detection: + prefix = convert_file_prefix_to_utf8(http_headers, file, result) + factory = StreamFactory(prefix, file, result.get('encoding')) + + # Before returning factory, ensure the entire file can be decoded; + # if it cannot, fall back to convert_to_utf8(). + # + # Not doing this means feedparser.parse() may raise UnicodeDecodeError + # instead of setting bozo_exception to CharacterEncodingOverride, + # breaking the 6.x API. + + try: + text_file = factory.get_text_file() + except MissingEncoding: + return factory + try: + # read in chunks to limit memory usage + while text_file.read(CONVERT_FILE_TEST_CHUNK_LEN): + pass + except UnicodeDecodeError: + # fall back to convert_to_utf8() + file = factory.get_binary_file() + else: + return factory + + # this shouldn't increase memory usage if file is BytesIO, + # since BytesIO does copy-on-write; https://bugs.python.org/issue22003 + data = convert_to_utf8(http_headers, file.read(), result) + + # note that data *is* the prefix + return StreamFactory(data, io.BytesIO(b''), result.get('encoding')) + + +def convert_file_prefix_to_utf8( + http_headers, file: t.IO[bytes], result, + *, + prefix_len: int = CONVERT_FILE_PREFIX_LEN, + read_to_ascii_len: int = 2**8, +) -> bytes: + """Like convert_to_utf8(), but only use the prefix of a binary file. + + Set result like convert_to_utf8() would. + + Return the updated prefix, as bytes. 
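For reference, a rough caller-side sketch of convert_file_to_utf8() defined above (inside feedparser it is only called from _parse_file_inplace(); the tiny feed is illustrative):

```python
import io
from feedparser.encodings import convert_file_to_utf8

raw = io.BytesIO(b"<?xml version='1.0' encoding='utf-8'?><rss></rss>")
result = {}
factory = convert_file_to_utf8({}, raw, result)

# The encoding is detected from the prefix and recorded in result
# (here it should be 'utf-8', taken from the XML declaration);
# the returned StreamFactory hands out decoding file objects on demand.
text = factory.get_text_file().read()
```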
+ + """ + # This is complicated by convert_to_utf8() detecting the wrong encoding + # if we have only part of the bytes that make a code-point: + # + # '๐Ÿ˜€'.encode('utf-8') -> utf-8 + # '๐Ÿ˜€'.encode('utf-8')[:-1] -> windows-1252 + bozo + + prefix = file.read(prefix_len - 1) + + # reading up to after an ASCII byte increases + # the likelihood of being on a code point boundary + prefix += read_to_after_ascii_byte(file, read_to_ascii_len) + + # call convert_to_utf8() up to 4 times, + # to make sure we eventually land on a code point boundary + candidates = [] + for attempt in range(4): + byte = file.read(1) + + # we're at the end of the file, and the loop already ran once + if not byte and attempt != 0: + break + + prefix += byte + + fake_result: t.Any = {} + converted_prefix = convert_to_utf8(http_headers, prefix, fake_result) + + # an encoding was detected successfully, keep it + if not fake_result.get('bozo'): + break + + candidates.append((file.tell(), converted_prefix, fake_result)) + + # no encoding was detected successfully, pick the "best" one + else: + + def key(candidate): + *_, result = candidate + + exc = result.get('bozo_exception') + exc_score = 0 + if isinstance(exc, NonXMLContentType): + exc_score = 20 + elif isinstance(exc, CharacterEncodingOverride): + exc_score = 10 + + return ( + exc_score, + # prefer utf- encodings to anything else + result.get('encoding').startswith('utf-') + ) + + candidates.sort(key=key) + offset, converted_prefix, fake_result = candidates[-1] + + file.seek(offset) + + result.update(fake_result) + return converted_prefix + + +def read_to_after_ascii_byte(file: t.IO[bytes], max_len: int) -> bytes: + offset = file.tell() + buffer = b'' + + for _ in range(max_len): + byte = file.read(1) + + # end of file, nothing to do + if not byte: + break + + buffer += byte + + # we stop after a ASCII character + if byte < b'\x80': + break + + # couldn't find an ASCII character, reset the file to the original offset + else: + file.seek(offset) + return b'' + + return buffer + + +class MissingEncoding(io.UnsupportedOperation): + pass + + +class StreamFactory: + + """Decode on the fly a binary stream that *may* have a known encoding. + + If the underlying stream is seekable, it is possible to call + the get_{text,binary}_file() methods more than once. 
+ + """ + + def __init__(self, prefix: bytes, file, encoding=None): + self.prefix = prefix + self.file = ResetFileWrapper(file) + self.encoding = encoding + self.should_reset = False + + def get_text_file(self, fallback_encoding=None, errors='strict'): + encoding = self.encoding or fallback_encoding + if encoding is None: + raise MissingEncoding("cannot create text stream without encoding") + + if isinstance(self.file.read(0), str): + file = PrefixFileWrapper(self.prefix.decode(encoding), self.file) + else: + file = PrefixFileWrapper( + self.prefix.decode('utf-8', errors), + codecs.getreader(encoding)(self.file, errors) + ) + + self.reset() + return file + + def get_binary_file(self): + if isinstance(self.file.read(0), str): + raise io.UnsupportedOperation("underlying stream is text, not binary") from None + + file = PrefixFileWrapper(self.prefix, self.file) + + self.reset() + return file + + def get_file(self): + try: + return self.get_text_file() + except MissingEncoding: + return self.get_binary_file() + + def reset(self): + if self.should_reset: + self.file.reset() + self.should_reset = True + + +class ResetFileWrapper: + """Given a seekable file, allow reading its content again + (from the current position) by calling reset(). + + """ + def __init__(self, file): + self.file = file + try: + self.file_initial_offset = file.tell() + except OSError: + self.file_initial_offset = None + + def read(self, size=-1): + return self.file.read(size) + + def reset(self): + # raises io.UnsupportedOperation if the underlying stream is not seekable + self.file.seek(self.file_initial_offset) + + +class PrefixFileWrapper: + """Stitch a (possibly modified) prefix and a file into a new file object. + + >>> file = io.StringIO('abcdef') + >>> file.read(2) + 'ab' + >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file) + >>> wrapped.read() + 'CDef' + + """ + def __init__(self, prefix, file): + self.prefix = prefix + self.file = file + self.offset = 0 + + def read(self, size=-1): + buffer = self.file.read(0) + + if self.offset < len(self.prefix): + if size < 0: + chunk = self.prefix + else: + chunk = self.prefix[self.offset : self.offset+size] + size -= len(chunk) + buffer += chunk + self.offset += len(chunk) + + while True: + chunk = self.file.read(size) + if not chunk: + break + buffer += chunk + self.offset += len(chunk) + + if size <= 0: + break + + size -= len(chunk) + + return buffer + + def close(self): + # do not touch the underlying stream + pass + diff --git a/feedparser/parsers/json.py b/feedparser/parsers/json.py index ae43163c..ccfd065d 100644 --- a/feedparser/parsers/json.py +++ b/feedparser/parsers/json.py @@ -61,8 +61,8 @@ def __init__(self, baseuri=None, baselang=None, encoding=None): self.namespacesInUse = [] self.entries = [] - def feed(self, data): - data = json.loads(data) + def feed(self, file): + data = json.load(file) v = data.get('version', '') try: diff --git a/tests/runtests.py b/tests/runtests.py index 4dd70f3e..33cacad0 100644 --- a/tests/runtests.py +++ b/tests/runtests.py @@ -49,6 +49,7 @@ import feedparser import feedparser.api import feedparser.datetimes +import feedparser.encodings import feedparser.http import feedparser.mixin import feedparser.sanitizer @@ -295,6 +296,235 @@ def test_gb2312_converted_to_gb18030_in_xml_encoding(self): self.assertEqual(result.encoding, 'gb18030') +class TestEncodingsHelpers(BaseTestCase): + + def test_reset_file_wrapper(self): + f = feedparser.encodings.ResetFileWrapper(io.BytesIO(b'abcdef')) + self.assertEqual(f.read(2) , b'ab') + 
f.reset() + self.assertEqual(f.read() , b'abcdef') + + f = io.BytesIO(b'abcdef') + f.read(2) + f = feedparser.encodings.ResetFileWrapper(f) + self.assertEqual(f.read(2) , b'cd') + f.reset() + self.assertEqual(f.read() , b'cdef') + + f = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef')) + self.assertEqual(f.read() , b'abcdef') + self.assertEqual(f.read() , b'') + with self.assertRaises(io.UnsupportedOperation): + f.reset() + self.assertEqual(f.read() , b'') + + f = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef')) + self.assertEqual(f.read(3) , b'abc') + with self.assertRaises(io.UnsupportedOperation): + f.reset() + self.assertEqual(f.read() , b'def') + + def test_prefix_file_wrapper_no_prefix(self): + f = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc')) + self.assertEqual(f.read() , b'abc') + + f = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc')) + self.assertEqual(f.read(1) , b'a') + + def test_convert_file_to_utf8_decode_error_fallback(self): + from feedparser.encodings import convert_to_utf8, convert_file_to_utf8 + + input = ( + "abcd๐Ÿ˜€".encode('utf-8') * feedparser.encodings.CONVERT_FILE_PREFIX_LEN + + "abcd๐Ÿ˜€".encode('utf-32') + ) + headers = {} + + expected_result = {} + expected_output = convert_to_utf8(headers, input, expected_result) + actual_result = {} + factory = convert_file_to_utf8(headers, io.BytesIO(input), actual_result) + + self.assertEqual(factory.get_binary_file().read(), expected_output) + self.assertEqual(actual_result['encoding'], expected_result['encoding']) + self.assertEqual( + type(actual_result['bozo_exception']), + type(expected_result['bozo_exception']) + ) + + +def make_prefix_file_wrapper_test(make_file): + + def test(self): + f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def')) + self.assertEqual(f.read() , b'abcdef') + self.assertEqual(f.read() , b'') + + f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def')) + self.assertEqual(f.read(2) , b'ab') + self.assertEqual(f.read(2) , b'cd') + self.assertEqual(f.read(2) , b'ef') + self.assertEqual(f.read(2) , b'') + self.assertEqual(f.read() , b'') + + f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def')) + self.assertEqual(f.read(3) , b'abc') + self.assertEqual(f.read(3) , b'def') + self.assertEqual(f.read(3) , b'') + self.assertEqual(f.read() , b'') + + f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def')) + self.assertEqual(f.read(0) , b'') + self.assertEqual(f.read() , b'abcdef') + + return test + + +def _make_file(data): + return io.BytesIO(data) + +def _make_file_in_the_middle(data): + prefix = b'zzzzz' + rv = io.BytesIO(prefix + data) + rv.seek(len(prefix)) + return rv + +class _make_file_one_by_one(io.BytesIO): + def read(self, size=-1): + if size <= 0: + return super().read(size) + return super().read(1) + +class _make_file_not_seekable(io.BytesIO): + def tell(self): + raise io.UnsupportedOperation + def seek(self, *args): + raise io.UnsupportedOperation + +PREFIX_FILE_WRAPPER_FACTORIES = [ + _make_file, + _make_file_in_the_middle, + _make_file_one_by_one, +] + +for factory in PREFIX_FILE_WRAPPER_FACTORIES: + func = make_prefix_file_wrapper_test(factory) + setattr( + TestEncodingsHelpers, + f"test_prefix_file_wrapper_{factory.__name__.lstrip('_')}", + func + ) +del factory, func + + +def make_convert_file_prefix_to_utf8_test(headers): + from feedparser.encodings import convert_to_utf8, convert_file_prefix_to_utf8 + + def test(self): + + def call(data, **kwargs): 
+ expected_result = {} + expected_output = convert_to_utf8( + headers, data.encode('utf-8'), expected_result + ) + file = io.BytesIO(data.encode('utf-8')) + + actual_result = {} + prefix = convert_file_prefix_to_utf8( + headers, file, actual_result, **kwargs + ) + rest = file.read() + + self.assertEqual(prefix + rest, expected_output) + self.assertEqual( + prefix.decode('utf-8') + rest.decode('utf-8'), + expected_output.decode('utf-8') + ) + + expected_result.pop('bozo_exception', None) + actual_result.pop('bozo_exception', None) + self.assertEqual(actual_result, expected_result) + + # these should be parametrized, but it's too complicated to do + + # each of the emojis is 4 bytes long when encoded as utf-8 + data = '๐Ÿ˜€๐Ÿ˜›๐Ÿคฏ๐Ÿ˜ฑ' + call(data, prefix_len=3) + call(data, prefix_len=4) + call(data, prefix_len=5) + call(data, prefix_len=8) + call(data, prefix_len=40) + call(data * 8, prefix_len=2, read_to_ascii_len=4) + call(data * 8, prefix_len=4, read_to_ascii_len=4) + + data = '๐Ÿ˜€a๐Ÿ˜›b๐Ÿคฏc๐Ÿ˜ฑ' + call(data, prefix_len=3) + call(data, prefix_len=4) + call(data, prefix_len=5) + call(data * 8, prefix_len=2, read_to_ascii_len=4) + call(data * 8, prefix_len=4, read_to_ascii_len=4) + + return test + + +def make_convert_file_to_utf8_test(headers, length): + from feedparser.encodings import convert_file_to_utf8, convert_to_utf8 + + digits = b'0123456789abcdef' + input = convert_to_utf8({}, b'', {}) + digits * int(length / len(digits) + 2) + + def test(self): + expected_result = {} + expected_output = convert_to_utf8(headers, input, expected_result) + expected_result.pop('bozo_exception', None) + + actual_result = {} + factory = convert_file_to_utf8(headers, io.BytesIO(input), actual_result) + + self.assertEqual(factory.get_text_file().read(), expected_output.decode('utf-8')) + self.assertEqual(factory.get_binary_file().read(), expected_output) + + actual_result.pop('bozo_exception', None) + self.assertEqual(actual_result, expected_result) + + actual_result = {} + factory = convert_file_to_utf8( + headers, io.StringIO(input.decode('utf-8')), actual_result + ) + + self.assertEqual(factory.get_text_file().read(), expected_output.decode('utf-8')) + + actual_result.pop('bozo_exception', None) + self.assertEqual(actual_result, expected_result) + + return test + + +CONVERT_TO_UTF8_HEADERS = { + 'simple': {}, + 'bad_content_type': {'content-type': 'not-a-valid-content-type'}, +} +CONVERT_TO_UTF8_LENGTHS = [ + feedparser.encodings.CONVERT_FILE_PREFIX_LEN, + feedparser.encodings.CONVERT_FILE_STR_PREFIX_LEN, +] + +for name, headers in CONVERT_TO_UTF8_HEADERS.items(): + setattr( + TestEncodingsHelpers, + f'test_convert_file_prefix_to_utf8_{name}', + make_convert_file_prefix_to_utf8_test(headers) + ) + for length in CONVERT_TO_UTF8_LENGTHS: + setattr( + TestEncodingsHelpers, + f'test_convert_file_to_utf8_{name}', + make_convert_file_to_utf8_test(headers, length) + ) + +del name, headers, length + + class TestFeedParserDict(unittest.TestCase): """Ensure that FeedParserDict returns values as expected and won't crash""" @@ -379,7 +609,7 @@ class TestOpenResource(unittest.TestCase): """Ensure that `_open_resource()` interprets its arguments as URIs, file-like objects, or in-memory feeds as expected""" def test_fileobj(self): - r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}).read() self.assertEqual(r, b'') def test_feed(self): @@ -392,22 +622,22 @@ def test_feed_http(self): def test_bytes(self): s 
= b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_string(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_unicode_1(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_unicode_2(self): s = br't\u00e9xt' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_http_client_ascii_unicode_encode_error(self): @@ -837,6 +1067,51 @@ def test_resolve_relative_uris_off(self): resolve_relative_uris=False) self.assertEqual(u'boo', d.entries[1].content[0].value) + def test_optimistic_encoding_detection(self): + length = feedparser.encodings.CONVERT_FILE_PREFIX_LEN + digits = '0123456789abcdef๐Ÿ˜€' + description = digits * int(length / len(digits) * 1.5) + + feed_xml = f""" + + + + id + {description} + + + + """ + + class NonSeekableFileWrapper: + def __init__(self, file): + self.file = file + def read(self, *args, **kwargs): + return self.file.read(*args, **kwargs) + def close(self): + pass + + kwargs_params = { + 'default': dict(), + 'on': dict(optimistic_encoding_detection=True), + 'off': dict(optimistic_encoding_detection=False), + } + input_params = { + 'binary_file': lambda: io.BytesIO(feed_xml.encode('utf-8')), + 'nonseekable_binary_file': + lambda: NonSeekableFileWrapper(io.BytesIO(feed_xml.encode('utf-8'))), + 'bytes': lambda: feed_xml.encode('utf-8'), + 'text_file': lambda: io.StringIO(feed_xml), + 'nonseekable_text_file': + lambda: NonSeekableFileWrapper(io.StringIO(feed_xml)), + 'string': lambda: feed_xml, + } + + for kwargs_name, kwargs in kwargs_params.items(): + for input_name, make_input in input_params.items(): + with self.subTest(f"{kwargs_name} {input_name}"): + d = feedparser.parse(make_input(), **kwargs) + self.assertEqual(d.entries[0].description, description) class TestSanitizer(unittest.TestCase): def test_style_attr_is_enabled(self): @@ -989,6 +1264,7 @@ def runtests(): testsuite.addTest(testloader.loadTestsFromTestCase(TestStrictParser)) testsuite.addTest(testloader.loadTestsFromTestCase(TestLooseParser)) testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodings)) + testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodingsHelpers)) testsuite.addTest(testloader.loadTestsFromTestCase(TestDateParsers)) testsuite.addTest(testloader.loadTestsFromTestCase(TestHTMLGuessing)) testsuite.addTest(testloader.loadTestsFromTestCase(TestHTTPStatus))
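Finally, the memory effect described in the changelog entry can be eyeballed with something along these lines (not part of the test suite; the synthetic feed and figures are illustrative only):

```python
import io
import tracemalloc
import feedparser

feed = ("<rss version='2.0'><channel>"
        + "<item><title>item</title></item>" * 50000
        + "</channel></rss>").encode('utf-8')

for optimistic in (True, False):
    tracemalloc.start()
    feedparser.parse(io.BytesIO(feed), optimistic_encoding_detection=optimistic)
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f"optimistic={optimistic}: peak ~{peak / 2**20:.1f} MiB")
```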