diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst
index 7c9077f3..75fc9a9c 100644
--- a/CONTRIBUTORS.rst
+++ b/CONTRIBUTORS.rst
@@ -14,6 +14,7 @@ bug report!
* `John Beimler `_
* `Beat Bolli `_
* `François Boulogne `_
+* `Adrian Damian `_
* `Jason Diamond `_
* `Jakub Kuczys `_
* `Fazal Majid `_
diff --git a/changelog.d/20220410_193326_lemon24_296_memory.rst b/changelog.d/20220410_193326_lemon24_296_memory.rst
new file mode 100644
index 00000000..708c38e0
--- /dev/null
+++ b/changelog.d/20220410_193326_lemon24_296_memory.rst
@@ -0,0 +1,10 @@
+Changed
+-------
+
+* Use only a prefix of the feed to detect encodings,
+  instead of reading the whole feed into memory.
+ This reduces the memory usage of parse() by up to ~3x (66-73%),
+ but may result in the wrong encoding being detected in rare cases;
+ use ``feedparser.parse(optimistic_encoding_detection=False)``
+  to get the original behavior (read the whole feed into memory).
+ (#296, #302)
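For reference, a quick sketch of the new knob from the caller's side (the URL is
just a placeholder)::

    import feedparser

    # Default: detect the encoding from a prefix of the feed (lower memory use).
    d = feedparser.parse("https://example.com/feed.xml")

    # Opt out per call: read the whole feed into memory first, as before.
    d = feedparser.parse(
        "https://example.com/feed.xml",
        optimistic_encoding_detection=False,
    )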
diff --git a/feedparser/__init__.py b/feedparser/__init__.py
index 1e8877c0..0f9b0f71 100644
--- a/feedparser/__init__.py
+++ b/feedparser/__init__.py
@@ -46,3 +46,8 @@
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
+
+
+# If you want feedparser to use only a prefix of the feed to detect encodings
+# (uses less memory), set this to 1.
+OPTIMISTIC_ENCODING_DETECTION = 1
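Like SANITIZE_HTML and RESOLVE_RELATIVE_URIS above, this module-level flag only
supplies the default; a sketch of flipping it globally (the per-call keyword
argument still takes precedence)::

    import feedparser

    # Disable prefix-based encoding detection for every parse() call.
    feedparser.OPTIMISTIC_ENCODING_DETECTION = 0

    d = feedparser.parse("https://example.com/feed.xml")  # placeholder URL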
diff --git a/feedparser/api.py b/feedparser/api.py
index 69c293da..c0260670 100644
--- a/feedparser/api.py
+++ b/feedparser/api.py
@@ -29,13 +29,13 @@
import datetime
import io
import time
-from typing import Dict, List, Union
+from typing import Dict, List, Union, IO
import urllib.error
import urllib.parse
import xml.sax
from .datetimes import registerDateHandler, _parse_date
-from .encodings import convert_to_utf8
+from .encodings import convert_file_to_utf8, MissingEncoding
from .html import BaseHTMLProcessor
from . import http
from .mixin import XMLParserMixin
@@ -106,20 +106,42 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
if request_headers is supplied it is a dictionary of HTTP request headers
that will override the values generated by FeedParser.
- :return: A bytes object.
+ :return: A seekable, readable file object.
"""
- if hasattr(url_file_stream_or_string, 'read'):
- return url_file_stream_or_string.read()
-
- if isinstance(url_file_stream_or_string, str) \
- and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
- return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+ # Some notes on the history of the implementation of _open_resource().
+ #
+ # parse() might need to go over the feed content twice:
+ # if the strict parser fails, it tries again with the loose parser.
+ #
+ # In 5.2.0, this returned an open file, to be read() by parse().
+ # By 6.0.8, this returned bytes directly.
+ #
+ # Since #296 (>6.0.8), this once again returns an open file
+ # (to reduce memory usage, see convert_file_to_utf8() for details).
+ # However, to accommodate parse() needing the content twice,
+ # the returned file is guaranteed to be seekable.
+ # (If the underlying resource is not seekable,
+    # the content is read and wrapped in an io.BytesIO/StringIO.)
+
+ if callable(getattr(url_file_stream_or_string, 'read', None)):
+ if callable(getattr(url_file_stream_or_string, 'seekable', None)):
+ if url_file_stream_or_string.seekable():
+ return url_file_stream_or_string
+ return _to_in_memory_file(url_file_stream_or_string.read())
+
+ looks_like_url = (
+ isinstance(url_file_stream_or_string, str)
+ and urllib.parse.urlparse(url_file_stream_or_string)[0]
+ in ('http', 'https', 'ftp', 'file', 'feed')
+ )
+ if looks_like_url:
+ data = http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+ return io.BytesIO(data)
# try to open with native open function (if url_file_stream_or_string is a filename)
try:
- with open(url_file_stream_or_string, 'rb') as f:
- data = f.read()
+ return open(url_file_stream_or_string, 'rb')
except (IOError, UnicodeEncodeError, TypeError, ValueError):
# if url_file_stream_or_string is a str object that
# cannot be converted to the encoding returned by
@@ -129,13 +151,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
# (such as an XML document encoded in UTF-32), TypeError will
# be thrown.
pass
- else:
- return data
- # treat url_file_stream_or_string as string
- if not isinstance(url_file_stream_or_string, bytes):
- return url_file_stream_or_string.encode('utf-8')
- return url_file_stream_or_string
+ # treat url_file_stream_or_string as bytes/string
+ return _to_in_memory_file(url_file_stream_or_string)
+
+
+def _to_in_memory_file(data):
+ if isinstance(data, str):
+ return io.StringIO(data)
+ else:
+ return io.BytesIO(data)
class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
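A minimal sketch of what this guarantee buys the caller; NonSeekableStream is a
hypothetical stand-in for something like a socket or pipe wrapper::

    import io
    import feedparser

    class NonSeekableStream:
        """A read()-only object: no seek(), no seekable()."""
        def __init__(self, data):
            self._buf = io.BytesIO(data)
        def read(self, size=-1):
            return self._buf.read(size)

    # _open_resource() sees that the stream cannot seek, reads it once into an
    # io.BytesIO, and parse() can still retry with the loose parser later.
    d = feedparser.parse(NonSeekableStream(b"<rss version='2.0'><channel/></rss>"))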
@@ -156,6 +181,7 @@ def parse(
response_headers: Dict[str, str] = None,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
+ optimistic_encoding_detection: bool = None,
) -> FeedParserDict:
"""Parse a feed from a URL, file, stream, or string.
@@ -199,6 +225,11 @@ def parse(
Should feedparser skip HTML sanitization? Only disable this if you know
what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
+ :param optimistic_encoding_detection:
+        Should feedparser use only a prefix of the feed to detect encodings?
+        (Uses less memory, but the wrong encoding may be detected in rare cases.)
+ Defaults to the value of
+ :data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.
"""
@@ -206,12 +237,6 @@ def parse(
if not agent:
import feedparser
agent = feedparser.USER_AGENT
- if sanitize_html is None:
- import feedparser
- sanitize_html = bool(feedparser.SANITIZE_HTML)
- if resolve_relative_uris is None:
- import feedparser
- resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
result = FeedParserDict(
bozo=False,
@@ -221,7 +246,7 @@ def parse(
)
try:
- data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+ file = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
except urllib.error.URLError as error:
result.update({
'bozo': True,
@@ -229,18 +254,72 @@ def parse(
})
return result
- if not data:
+ # at this point, the file is guaranteed to be seekable;
+ # we read 1 byte/character to see if it's empty and return early
+ # (this preserves the behavior in 6.0.8)
+ initial_file_offset = file.tell()
+ if not file.read(1):
return result
+ file.seek(initial_file_offset)
# overwrite existing headers using response_headers
result['headers'].update(response_headers or {})
- data = convert_to_utf8(result['headers'], data, result)
+ try:
+ _parse_file_inplace(
+ file,
+ result,
+ resolve_relative_uris=resolve_relative_uris,
+ sanitize_html=sanitize_html,
+ optimistic_encoding_detection=optimistic_encoding_detection,
+ )
+ finally:
+ if not hasattr(url_file_stream_or_string, 'read'):
+ # the file does not come from the user, close it
+ file.close()
+
+ return result
+
+
+def _parse_file_inplace(
+ file: Union[IO[bytes], IO[str]],
+ result: dict,
+ *,
+ resolve_relative_uris: bool = None,
+ sanitize_html: bool = None,
+ optimistic_encoding_detection: bool = None,
+) -> None:
+
+ # Avoid a cyclic import.
+ import feedparser
+ if sanitize_html is None:
+ sanitize_html = bool(feedparser.SANITIZE_HTML)
+ if resolve_relative_uris is None:
+ resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
+ if optimistic_encoding_detection is None:
+ optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION)
+
+ stream_factory = convert_file_to_utf8(
+ result['headers'], file, result, optimistic_encoding_detection
+ )
+ # We're done with file, all access must happen through stream_factory.
+ del file
+
+ # Some notes about the stream_factory.get_{text,binary}_file() methods:
+ #
+ # Calling them a second time will raise io.UnsupportedOperation
+ # if the underlying file was not seekable.
+ #
+ # Calling close() on the returned file is ignored
+ # (that is, the underlying file is *not* closed),
+ # because the SAX parser closes the file when done;
+ # we don't want that, since we might try again with the loose parser.
+
use_json_parser = result['content-type'] == 'application/json'
use_strict_parser = result['encoding'] and True or False
if not use_json_parser:
- result['version'], data, entities = replace_doctype(data)
+ result['version'], stream_factory.prefix, entities = replace_doctype(stream_factory.prefix)
# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result['headers'].get('content-location', '')
@@ -253,15 +332,18 @@ def parse(
if not _XML_AVAILABLE:
use_strict_parser = False
+
feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]
+
if use_json_parser:
result['version'] = None
feed_parser = JSONParser(baseuri, baselang, 'utf-8')
try:
- feed_parser.feed(data)
+ feed_parser.feed(stream_factory.get_file())
except Exception as e:
result['bozo'] = 1
result['bozo_exception'] = e
+
elif use_strict_parser:
# Initialize the SAX parser.
feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
@@ -277,7 +359,14 @@ def parse(
saxparser.setContentHandler(feed_parser)
saxparser.setErrorHandler(feed_parser)
source = xml.sax.xmlreader.InputSource()
- source.setByteStream(io.BytesIO(data))
+
+ # If an encoding was detected, decode the file on the fly;
+ # otherwise, pass it as-is and let the SAX parser deal with it.
+ try:
+ source.setCharacterStream(stream_factory.get_text_file())
+ except MissingEncoding:
+ source.setByteStream(stream_factory.get_binary_file())
+
try:
saxparser.parse(source)
except xml.sax.SAXException as e:
@@ -291,7 +380,22 @@ def parse(
feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html
- feed_parser.feed(data.decode('utf-8', 'replace'))
+
+ # If an encoding was detected, use it; otherwise, assume utf-8 and do your best.
+ # Will raise io.UnsupportedOperation if the underlying file is not seekable.
+ data = stream_factory.get_text_file('utf-8', 'replace').read()
+
+ # As of 6.0.8, LooseFeedParser.feed() can be called exactly once
+ # with the entire data (it does some re.sub() and str.replace() on it).
+ #
+ # SGMLParser (of which LooseFeedParser is a subclass)
+ # *can* be fed in a streaming fashion,
+ # by calling feed() repeatedly with chunks of text.
+ #
+    # When/if LooseFeedParser supports being fed chunks,
+ # replace the read() call above with read(size)/feed() calls in a loop.
+
+ feed_parser.feed(data)
result['feed'] = feed_parser.feeddata
result['entries'] = feed_parser.entries
@@ -300,4 +404,3 @@ def parse(
result['namespaces'] = {}
else:
result['namespaces'] = feed_parser.namespaces_in_use
- return result
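The strict-then-loose retry above leans entirely on the stream factory returned by
convert_file_to_utf8(). A standalone sketch of that contract (seekable input
assumed; on a non-seekable stream the second get_text_file() call raises
io.UnsupportedOperation)::

    import io
    from feedparser.encodings import convert_file_to_utf8, MissingEncoding

    result = {}
    factory = convert_file_to_utf8(
        {}, io.BytesIO(b"<rss version='2.0'><channel/></rss>"), result
    )

    # First pass: what the strict (SAX) parser would consume.
    try:
        first = factory.get_text_file()        # decoded on the fly
    except MissingEncoding:
        first = factory.get_binary_file()      # no encoding detected
    first.read()

    # Second pass: what the loose parser would consume if the strict parse fails.
    second = factory.get_text_file('utf-8', 'replace')
    print(result.get('encoding'), len(second.read()))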
diff --git a/feedparser/encodings.py b/feedparser/encodings.py
index 73251fc1..9497877b 100644
--- a/feedparser/encodings.py
+++ b/feedparser/encodings.py
@@ -27,6 +27,7 @@
# POSSIBILITY OF SUCH DAMAGE.
import codecs
+import io
import re
import typing as t
@@ -312,3 +313,312 @@ def convert_to_utf8(http_headers, data, result):
result['bozo'] = True
result['bozo_exception'] = error
return data
+
+
+# How much to read from a binary file in order to detect encoding.
+# In initial tests, 4k was enough for ~160 mostly-English feeds;
+# 64k seems like a safe margin.
+CONVERT_FILE_PREFIX_LEN = 2 ** 16
+
+# How much to read from a text file, and use as a UTF-8 bytes prefix.
+# Note that no encoding detection is needed in this case.
+CONVERT_FILE_STR_PREFIX_LEN = 2 ** 13
+
+CONVERT_FILE_TEST_CHUNK_LEN = 2 ** 16
+
+
+def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
+ """Like convert_to_utf8(), but for a stream.
+
+    Unlike convert_to_utf8(), do not read the entire file into memory;
+ instead, return a text stream that decodes it on the fly.
+ This should consume significantly less memory,
+ because it avoids (repeatedly) converting the entire file contents
+ from bytes to str and back.
+
+ To detect the encoding, only a prefix of the file contents is used.
+ In rare cases, the wrong encoding may be detected for this prefix;
+ use optimistic_encoding_detection=False to use the entire file contents
+ (equivalent to a plain convert_to_utf8() call).
+
+ Args:
+ http_headers (dict): The response headers.
+        file (IO[bytes] or IO[str]): A read()-able binary or text stream.
+ result (dict): The result dictionary.
+ optimistic_encoding_detection (bool):
+ If true, use only a prefix of the file content to detect encoding.
+
+ Returns:
+ StreamFactory: a stream factory, with the detected encoding set, if any
+
+ """
+ # Currently, this wraps convert_to_utf8(), because the logic is simply
+ # too complicated to ensure it's re-implemented correctly for a stream.
+ # That said, it should be possible to change the implementation
+ # transparently (not sure it's worth it, though).
+
+ # If file is a text stream, we don't need to detect encoding;
+ # we still need a bytes prefix to run functions on for side effects:
+ # convert_to_utf8() to sniff / set result['content-type'], and
+ # replace_doctype() to extract safe_entities.
+
+ if isinstance(file.read(0), str):
+ prefix = file.read(CONVERT_FILE_STR_PREFIX_LEN).encode('utf-8')
+ prefix = convert_to_utf8(http_headers, prefix, result)
+ result['encoding'] = 'utf-8'
+ return StreamFactory(prefix, file, 'utf-8')
+
+ if optimistic_encoding_detection:
+ prefix = convert_file_prefix_to_utf8(http_headers, file, result)
+ factory = StreamFactory(prefix, file, result.get('encoding'))
+
+ # Before returning factory, ensure the entire file can be decoded;
+ # if it cannot, fall back to convert_to_utf8().
+ #
+ # Not doing this means feedparser.parse() may raise UnicodeDecodeError
+ # instead of setting bozo_exception to CharacterEncodingOverride,
+ # breaking the 6.x API.
+
+ try:
+ text_file = factory.get_text_file()
+ except MissingEncoding:
+ return factory
+ try:
+ # read in chunks to limit memory usage
+ while text_file.read(CONVERT_FILE_TEST_CHUNK_LEN):
+ pass
+ except UnicodeDecodeError:
+ # fall back to convert_to_utf8()
+ file = factory.get_binary_file()
+ else:
+ return factory
+
+ # this shouldn't increase memory usage if file is BytesIO,
+ # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
+ data = convert_to_utf8(http_headers, file.read(), result)
+
+ # note that data *is* the prefix
+ return StreamFactory(data, io.BytesIO(b''), result.get('encoding'))
+
+
+def convert_file_prefix_to_utf8(
+ http_headers, file: t.IO[bytes], result,
+ *,
+ prefix_len: int = CONVERT_FILE_PREFIX_LEN,
+ read_to_ascii_len: int = 2**8,
+) -> bytes:
+ """Like convert_to_utf8(), but only use the prefix of a binary file.
+
+ Set result like convert_to_utf8() would.
+
+ Return the updated prefix, as bytes.
+
+ """
+    # This is complicated by convert_to_utf8() detecting the wrong encoding
+    # if we have only part of the bytes that make up a code point:
+    #
+    #   '😀'.encode('utf-8')      -> utf-8
+    #   '😀'.encode('utf-8')[:-1] -> windows-1252 + bozo
+
+ prefix = file.read(prefix_len - 1)
+
+ # reading up to after an ASCII byte increases
+ # the likelihood of being on a code point boundary
+ prefix += read_to_after_ascii_byte(file, read_to_ascii_len)
+
+ # call convert_to_utf8() up to 4 times,
+ # to make sure we eventually land on a code point boundary
+ candidates = []
+ for attempt in range(4):
+ byte = file.read(1)
+
+ # we're at the end of the file, and the loop already ran once
+ if not byte and attempt != 0:
+ break
+
+ prefix += byte
+
+ fake_result: t.Any = {}
+ converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)
+
+ # an encoding was detected successfully, keep it
+ if not fake_result.get('bozo'):
+ break
+
+ candidates.append((file.tell(), converted_prefix, fake_result))
+
+ # no encoding was detected successfully, pick the "best" one
+ else:
+
+ def key(candidate):
+ *_, result = candidate
+
+ exc = result.get('bozo_exception')
+ exc_score = 0
+ if isinstance(exc, NonXMLContentType):
+ exc_score = 20
+ elif isinstance(exc, CharacterEncodingOverride):
+ exc_score = 10
+
+ return (
+ exc_score,
+ # prefer utf- encodings to anything else
+                result.get('encoding', '').startswith('utf-'),
+ )
+
+ candidates.sort(key=key)
+ offset, converted_prefix, fake_result = candidates[-1]
+
+ file.seek(offset)
+
+ result.update(fake_result)
+ return converted_prefix
+
+
+def read_to_after_ascii_byte(file: t.IO[bytes], max_len: int) -> bytes:
+ offset = file.tell()
+ buffer = b''
+
+ for _ in range(max_len):
+ byte = file.read(1)
+
+ # end of file, nothing to do
+ if not byte:
+ break
+
+ buffer += byte
+
+        # we stop after an ASCII character
+ if byte < b'\x80':
+ break
+
+ # couldn't find an ASCII character, reset the file to the original offset
+ else:
+ file.seek(offset)
+ return b''
+
+ return buffer
+
+
+class MissingEncoding(io.UnsupportedOperation):
+ pass
+
+
+class StreamFactory:
+
+ """Decode on the fly a binary stream that *may* have a known encoding.
+
+ If the underlying stream is seekable, it is possible to call
+ the get_{text,binary}_file() methods more than once.
+
+ """
+
+ def __init__(self, prefix: bytes, file, encoding=None):
+ self.prefix = prefix
+ self.file = ResetFileWrapper(file)
+ self.encoding = encoding
+ self.should_reset = False
+
+ def get_text_file(self, fallback_encoding=None, errors='strict'):
+ encoding = self.encoding or fallback_encoding
+ if encoding is None:
+ raise MissingEncoding("cannot create text stream without encoding")
+
+ if isinstance(self.file.read(0), str):
+ file = PrefixFileWrapper(self.prefix.decode(encoding), self.file)
+ else:
+ file = PrefixFileWrapper(
+ self.prefix.decode('utf-8', errors),
+ codecs.getreader(encoding)(self.file, errors)
+ )
+
+ self.reset()
+ return file
+
+ def get_binary_file(self):
+ if isinstance(self.file.read(0), str):
+ raise io.UnsupportedOperation("underlying stream is text, not binary") from None
+
+ file = PrefixFileWrapper(self.prefix, self.file)
+
+ self.reset()
+ return file
+
+ def get_file(self):
+ try:
+ return self.get_text_file()
+ except MissingEncoding:
+ return self.get_binary_file()
+
+ def reset(self):
+ if self.should_reset:
+ self.file.reset()
+ self.should_reset = True
+
+
+class ResetFileWrapper:
+ """Given a seekable file, allow reading its content again
+ (from the current position) by calling reset().
+
+ """
+ def __init__(self, file):
+ self.file = file
+ try:
+ self.file_initial_offset = file.tell()
+ except OSError:
+ self.file_initial_offset = None
+
+ def read(self, size=-1):
+ return self.file.read(size)
+
+ def reset(self):
+ # raises io.UnsupportedOperation if the underlying stream is not seekable
+ self.file.seek(self.file_initial_offset)
+
+
+class PrefixFileWrapper:
+ """Stitch a (possibly modified) prefix and a file into a new file object.
+
+ >>> file = io.StringIO('abcdef')
+ >>> file.read(2)
+ 'ab'
+ >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
+ >>> wrapped.read()
+ 'CDef'
+
+ """
+ def __init__(self, prefix, file):
+ self.prefix = prefix
+ self.file = file
+ self.offset = 0
+
+ def read(self, size=-1):
+ buffer = self.file.read(0)
+
+ if self.offset < len(self.prefix):
+ if size < 0:
+                chunk = self.prefix[self.offset:]
+ else:
+ chunk = self.prefix[self.offset : self.offset+size]
+ size -= len(chunk)
+ buffer += chunk
+ self.offset += len(chunk)
+
+ while True:
+ chunk = self.file.read(size)
+ if not chunk:
+ break
+ buffer += chunk
+ self.offset += len(chunk)
+
+ if size <= 0:
+ break
+
+ size -= len(chunk)
+
+ return buffer
+
+ def close(self):
+ # do not touch the underlying stream
+ pass
+
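To make the decode-on-the-fly behaviour concrete, a small hand-built illustration
(windows-1252 chosen arbitrarily; in real use the prefix and encoding come from
convert_file_prefix_to_utf8())::

    import io
    from feedparser.encodings import StreamFactory

    raw = 'café, naïve'.encode('windows-1252')

    # Pretend the first 3 bytes were already converted to UTF-8 as the prefix.
    prefix = raw[:3].decode('windows-1252').encode('utf-8')
    factory = StreamFactory(prefix, io.BytesIO(raw[3:]), 'windows-1252')

    print(factory.get_text_file().read())   # 'café, naïve', decoded lazily
    print(factory.get_text_file().read())   # works again, since BytesIO is seekable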
diff --git a/feedparser/parsers/json.py b/feedparser/parsers/json.py
index ae43163c..ccfd065d 100644
--- a/feedparser/parsers/json.py
+++ b/feedparser/parsers/json.py
@@ -61,8 +61,8 @@ def __init__(self, baseuri=None, baselang=None, encoding=None):
self.namespacesInUse = []
self.entries = []
- def feed(self, data):
- data = json.loads(data)
+ def feed(self, file):
+ data = json.load(file)
v = data.get('version', '')
try:
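The JSON Feed path gets the same treatment: json.load() consumes the open file
directly instead of requiring the whole body as one str first. A trivial sketch
of the difference::

    import io
    import json

    body = '{"version": "https://jsonfeed.org/version/1.1", "items": []}'

    data = json.loads(body)              # old shape: full string in memory
    data = json.load(io.StringIO(body))  # new shape: reads from the file object
    print(data["version"])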
diff --git a/tests/runtests.py b/tests/runtests.py
index 4dd70f3e..33cacad0 100644
--- a/tests/runtests.py
+++ b/tests/runtests.py
@@ -49,6 +49,7 @@
import feedparser
import feedparser.api
import feedparser.datetimes
+import feedparser.encodings
import feedparser.http
import feedparser.mixin
import feedparser.sanitizer
@@ -295,6 +296,235 @@ def test_gb2312_converted_to_gb18030_in_xml_encoding(self):
self.assertEqual(result.encoding, 'gb18030')
+class TestEncodingsHelpers(BaseTestCase):
+
+ def test_reset_file_wrapper(self):
+ f = feedparser.encodings.ResetFileWrapper(io.BytesIO(b'abcdef'))
+ self.assertEqual(f.read(2) , b'ab')
+ f.reset()
+ self.assertEqual(f.read() , b'abcdef')
+
+ f = io.BytesIO(b'abcdef')
+ f.read(2)
+ f = feedparser.encodings.ResetFileWrapper(f)
+ self.assertEqual(f.read(2) , b'cd')
+ f.reset()
+ self.assertEqual(f.read() , b'cdef')
+
+ f = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef'))
+ self.assertEqual(f.read() , b'abcdef')
+ self.assertEqual(f.read() , b'')
+ with self.assertRaises(io.UnsupportedOperation):
+ f.reset()
+ self.assertEqual(f.read() , b'')
+
+ f = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef'))
+ self.assertEqual(f.read(3) , b'abc')
+ with self.assertRaises(io.UnsupportedOperation):
+ f.reset()
+ self.assertEqual(f.read() , b'def')
+
+ def test_prefix_file_wrapper_no_prefix(self):
+ f = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
+ self.assertEqual(f.read() , b'abc')
+
+ f = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
+ self.assertEqual(f.read(1) , b'a')
+
+ def test_convert_file_to_utf8_decode_error_fallback(self):
+ from feedparser.encodings import convert_to_utf8, convert_file_to_utf8
+
+ input = (
+            "abcd😀".encode('utf-8') * feedparser.encodings.CONVERT_FILE_PREFIX_LEN
+            + "abcd😀".encode('utf-32')
+ )
+ headers = {}
+
+ expected_result = {}
+ expected_output = convert_to_utf8(headers, input, expected_result)
+ actual_result = {}
+ factory = convert_file_to_utf8(headers, io.BytesIO(input), actual_result)
+
+ self.assertEqual(factory.get_binary_file().read(), expected_output)
+ self.assertEqual(actual_result['encoding'], expected_result['encoding'])
+ self.assertEqual(
+ type(actual_result['bozo_exception']),
+ type(expected_result['bozo_exception'])
+ )
+
+
+def make_prefix_file_wrapper_test(make_file):
+
+ def test(self):
+ f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
+ self.assertEqual(f.read() , b'abcdef')
+ self.assertEqual(f.read() , b'')
+
+ f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
+ self.assertEqual(f.read(2) , b'ab')
+ self.assertEqual(f.read(2) , b'cd')
+ self.assertEqual(f.read(2) , b'ef')
+ self.assertEqual(f.read(2) , b'')
+ self.assertEqual(f.read() , b'')
+
+ f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
+ self.assertEqual(f.read(3) , b'abc')
+ self.assertEqual(f.read(3) , b'def')
+ self.assertEqual(f.read(3) , b'')
+ self.assertEqual(f.read() , b'')
+
+ f = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
+ self.assertEqual(f.read(0) , b'')
+ self.assertEqual(f.read() , b'abcdef')
+
+ return test
+
+
+def _make_file(data):
+ return io.BytesIO(data)
+
+def _make_file_in_the_middle(data):
+ prefix = b'zzzzz'
+ rv = io.BytesIO(prefix + data)
+ rv.seek(len(prefix))
+ return rv
+
+class _make_file_one_by_one(io.BytesIO):
+ def read(self, size=-1):
+ if size <= 0:
+ return super().read(size)
+ return super().read(1)
+
+class _make_file_not_seekable(io.BytesIO):
+ def tell(self):
+ raise io.UnsupportedOperation
+ def seek(self, *args):
+ raise io.UnsupportedOperation
+
+PREFIX_FILE_WRAPPER_FACTORIES = [
+ _make_file,
+ _make_file_in_the_middle,
+ _make_file_one_by_one,
+]
+
+for factory in PREFIX_FILE_WRAPPER_FACTORIES:
+ func = make_prefix_file_wrapper_test(factory)
+ setattr(
+ TestEncodingsHelpers,
+ f"test_prefix_file_wrapper_{factory.__name__.lstrip('_')}",
+ func
+ )
+del factory, func
+
+
+def make_convert_file_prefix_to_utf8_test(headers):
+ from feedparser.encodings import convert_to_utf8, convert_file_prefix_to_utf8
+
+ def test(self):
+
+ def call(data, **kwargs):
+ expected_result = {}
+ expected_output = convert_to_utf8(
+ headers, data.encode('utf-8'), expected_result
+ )
+ file = io.BytesIO(data.encode('utf-8'))
+
+ actual_result = {}
+ prefix = convert_file_prefix_to_utf8(
+ headers, file, actual_result, **kwargs
+ )
+ rest = file.read()
+
+ self.assertEqual(prefix + rest, expected_output)
+ self.assertEqual(
+ prefix.decode('utf-8') + rest.decode('utf-8'),
+ expected_output.decode('utf-8')
+ )
+
+ expected_result.pop('bozo_exception', None)
+ actual_result.pop('bozo_exception', None)
+ self.assertEqual(actual_result, expected_result)
+
+ # these should be parametrized, but it's too complicated to do
+
+ # each of the emojis is 4 bytes long when encoded as utf-8
+        data = '😀😛🤯😱'
+ call(data, prefix_len=3)
+ call(data, prefix_len=4)
+ call(data, prefix_len=5)
+ call(data, prefix_len=8)
+ call(data, prefix_len=40)
+ call(data * 8, prefix_len=2, read_to_ascii_len=4)
+ call(data * 8, prefix_len=4, read_to_ascii_len=4)
+
+        data = '😀a😛b🤯c😱'
+ call(data, prefix_len=3)
+ call(data, prefix_len=4)
+ call(data, prefix_len=5)
+ call(data * 8, prefix_len=2, read_to_ascii_len=4)
+ call(data * 8, prefix_len=4, read_to_ascii_len=4)
+
+ return test
+
+
+def make_convert_file_to_utf8_test(headers, length):
+ from feedparser.encodings import convert_file_to_utf8, convert_to_utf8
+
+ digits = b'0123456789abcdef'
+ input = convert_to_utf8({}, b'', {}) + digits * int(length / len(digits) + 2)
+
+ def test(self):
+ expected_result = {}
+ expected_output = convert_to_utf8(headers, input, expected_result)
+ expected_result.pop('bozo_exception', None)
+
+ actual_result = {}
+ factory = convert_file_to_utf8(headers, io.BytesIO(input), actual_result)
+
+ self.assertEqual(factory.get_text_file().read(), expected_output.decode('utf-8'))
+ self.assertEqual(factory.get_binary_file().read(), expected_output)
+
+ actual_result.pop('bozo_exception', None)
+ self.assertEqual(actual_result, expected_result)
+
+ actual_result = {}
+ factory = convert_file_to_utf8(
+ headers, io.StringIO(input.decode('utf-8')), actual_result
+ )
+
+ self.assertEqual(factory.get_text_file().read(), expected_output.decode('utf-8'))
+
+ actual_result.pop('bozo_exception', None)
+ self.assertEqual(actual_result, expected_result)
+
+ return test
+
+
+CONVERT_TO_UTF8_HEADERS = {
+ 'simple': {},
+ 'bad_content_type': {'content-type': 'not-a-valid-content-type'},
+}
+CONVERT_TO_UTF8_LENGTHS = [
+ feedparser.encodings.CONVERT_FILE_PREFIX_LEN,
+ feedparser.encodings.CONVERT_FILE_STR_PREFIX_LEN,
+]
+
+for name, headers in CONVERT_TO_UTF8_HEADERS.items():
+ setattr(
+ TestEncodingsHelpers,
+ f'test_convert_file_prefix_to_utf8_{name}',
+ make_convert_file_prefix_to_utf8_test(headers)
+ )
+ for length in CONVERT_TO_UTF8_LENGTHS:
+ setattr(
+ TestEncodingsHelpers,
+            f'test_convert_file_to_utf8_{name}_{length}',
+ make_convert_file_to_utf8_test(headers, length)
+ )
+
+del name, headers, length
+
+
class TestFeedParserDict(unittest.TestCase):
"""Ensure that FeedParserDict returns values as expected and won't crash"""
@@ -379,7 +609,7 @@ class TestOpenResource(unittest.TestCase):
"""Ensure that `_open_resource()` interprets its arguments as URIs, file-like objects, or in-memory feeds as expected"""
def test_fileobj(self):
- r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {})
+ r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}).read()
self.assertEqual(r, b'')
def test_feed(self):
@@ -392,22 +622,22 @@ def test_feed_http(self):
def test_bytes(self):
        s = b'<feed><item><title>text</title></item></feed>'
- r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
+ r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)
def test_string(self):
        s = b'<feed><item><title>text</title></item></feed>'
- r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
+ r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)
def test_unicode_1(self):
        s = b'<feed><item><title>text</title></item></feed>'
- r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
+ r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)
def test_unicode_2(self):
        s = br'<feed><item><title>t\u00e9xt</title></item></feed>'
- r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {})
+ r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read()
self.assertEqual(s, r)
def test_http_client_ascii_unicode_encode_error(self):
@@ -837,6 +1067,51 @@ def test_resolve_relative_uris_off(self):
resolve_relative_uris=False)
self.assertEqual(u'boo', d.entries[1].content[0].value)
+ def test_optimistic_encoding_detection(self):
+ length = feedparser.encodings.CONVERT_FILE_PREFIX_LEN
+        digits = '0123456789abcdef😀'
+ description = digits * int(length / len(digits) * 1.5)
+
+        feed_xml = f"""
+        <rss version="2.0">
+            <channel>
+                <item>
+                    <guid>id</guid>
+                    <description>{description}</description>
+                </item>
+            </channel>
+        </rss>
+        """
+
+ class NonSeekableFileWrapper:
+ def __init__(self, file):
+ self.file = file
+ def read(self, *args, **kwargs):
+ return self.file.read(*args, **kwargs)
+ def close(self):
+ pass
+
+ kwargs_params = {
+ 'default': dict(),
+ 'on': dict(optimistic_encoding_detection=True),
+ 'off': dict(optimistic_encoding_detection=False),
+ }
+ input_params = {
+ 'binary_file': lambda: io.BytesIO(feed_xml.encode('utf-8')),
+ 'nonseekable_binary_file':
+ lambda: NonSeekableFileWrapper(io.BytesIO(feed_xml.encode('utf-8'))),
+ 'bytes': lambda: feed_xml.encode('utf-8'),
+ 'text_file': lambda: io.StringIO(feed_xml),
+ 'nonseekable_text_file':
+ lambda: NonSeekableFileWrapper(io.StringIO(feed_xml)),
+ 'string': lambda: feed_xml,
+ }
+
+ for kwargs_name, kwargs in kwargs_params.items():
+ for input_name, make_input in input_params.items():
+ with self.subTest(f"{kwargs_name} {input_name}"):
+ d = feedparser.parse(make_input(), **kwargs)
+ self.assertEqual(d.entries[0].description, description)
class TestSanitizer(unittest.TestCase):
def test_style_attr_is_enabled(self):
@@ -989,6 +1264,7 @@ def runtests():
testsuite.addTest(testloader.loadTestsFromTestCase(TestStrictParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestLooseParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodings))
+ testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodingsHelpers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestDateParsers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTMLGuessing))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTTPStatus))