Skip to content

Commit

Permalink
Add stream-oriented version of convert_to_utf8().
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Jan 25, 2022
1 parent 3a806ae commit be3eab6
Show file tree
Hide file tree
Showing 2 changed files with 281 additions and 0 deletions.
195 changes: 195 additions & 0 deletions feedparser/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import cgi
import codecs
import io
import re

try:
Expand Down Expand Up @@ -291,3 +292,197 @@ def convert_to_utf8(http_headers, data, result):
result['bozo'] = True
result['bozo_exception'] = error
return data


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
    """Like convert_to_utf8(), but for a binary stream.

    Unlike convert_to_utf8(), do not read the entire file into memory;
    instead, return a text stream that decodes it on the fly.
    This should consume significantly less memory,
    because it avoids (repeatedly) converting the entire file contents
    from bytes to str and back.

    To detect the encoding, only a prefix of the file contents is used.
    In rare cases, the wrong encoding may be detected for this prefix;
    use optimistic_encoding_detection=False to use the entire file contents
    (equivalent to a plain convert_to_utf8() call).

    Args:
        http_headers (dict): The response headers.
        file (typing.IO[bytes]): A read()-able binary stream.
        result (dict): The result dictionary.
        optimistic_encoding_detection (bool):
            If true, use only a prefix of the file content to detect encoding.

    Returns:
        StreamFactory: a stream factory, with the detected encoding set, if any

    """
    # Currently, this wraps convert_to_utf8(), because the logic is simply
    # too complicated to ensure it's re-implemented correctly for a stream.
    # That said, it should be possible to change the implementation
    # transparently (not sure it's worth it, though).

    if not optimistic_encoding_detection:
        # This shouldn't increase memory usage if file is BytesIO,
        # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)
        # Still need to be able to reset() to the "beginning",
        # and to access the prefix (note that data *is* the prefix).
        wrapped = PrefixFileWrapper(data, io.BytesIO(b''))
    else:
        detected_prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        wrapped = PrefixFileWrapper(detected_prefix, file)

    return StreamFactory(wrapped, result.get('encoding'))


# Size of the prefix used by convert_file_prefix_to_utf8() to detect encoding.
# In initial tests, 4k was enough for ~160 mostly-English feeds;
# 64k seems like a safe margin.
CONVERT_FILE_PREFIX_LEN = 2 ** 16

def convert_file_prefix_to_utf8(http_headers, file, result, prefix_len=CONVERT_FILE_PREFIX_LEN):
    """Like convert_to_utf8(), but only use the prefix of a binary file.

    Set result like convert_to_utf8() would.
    Return the updated prefix, as bytes.

    Args:
        http_headers (dict): The response headers.
        file (typing.IO[bytes]): A read()-able binary stream; up to
            prefix_len (+3) bytes are consumed from it.
        result (dict): The result dictionary, updated in place.
        prefix_len (int): How many bytes of the file to use.
    """
    prefix = file.read(prefix_len)

    # we call convert_to_utf8() up to 4 times,
    # to make sure we eventually land on a code point boundary
    # (a UTF-8 code point is at most 4 bytes long, so a prefix cut
    # mid-character needs at most 3 extra bytes to become decodable)
    for _ in range(4):
        fake_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)
        # a clean (non-bozo) conversion means the cut landed on a boundary
        if not fake_result.get('bozo'):
            break

        # check if the prefix we have is actually the whole thing
        if len(prefix) < prefix_len:
            break

        byte = file.read(1)
        if not byte:
            break

        # grow the prefix by one byte and retry; prefix_len is bumped so the
        # "whole thing" check above keeps working on the next iteration
        prefix += byte
        prefix_len += 1

    # NOTE: if all 4 attempts were bozo (e.g. the data is genuinely broken,
    # not just cut mid-character), the last attempt's result is kept,
    # matching what convert_to_utf8() would report.
    result.update(fake_result)
    return converted_prefix


class PrefixFileWrapper:
    """Stitch a (possibly modified) prefix and a file into a new file object.

    If the underlying file is seekable, it is possible to read()
    the same content again by calling reset().

    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'
    >>> wrapped.reset()
    >>> wrapped.read()
    'CDef'

    """

    def __init__(self, prefix, file):
        # prefix: bytes or str already consumed (and possibly transformed)
        # from the original stream; file: the stream positioned right after it.
        self.prefix = prefix
        self.file = file

        try:
            self.file_initial_offset = file.tell()
        except OSError:
            # Stream does not support tell(); reset() will not be possible.
            self.file_initial_offset = None

        # Logical offset into the stitched (prefix + file) stream.
        self.offset = 0

    def reset(self):
        """Rewind to the beginning of the stitched stream.

        Raises io.UnsupportedOperation if the underlying stream
        is not seekable.
        """
        if self.file_initial_offset is None:
            # tell() failed in __init__; calling seek(None) here could raise
            # a confusing TypeError on some streams, so fail explicitly.
            raise io.UnsupportedOperation("underlying stream is not seekable")
        self.file.seek(self.file_initial_offset)
        self.offset = 0

    def read(self, size=-1):
        """Read up to size bytes/chars (all remaining if size is negative)."""
        # read(0) yields an empty object of the right type (bytes or str),
        # so the wrapper works for both binary and text streams.
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                # Bug fix: only serve the *unread* part of the prefix.
                # The original used the whole prefix here, so a read()
                # following a partial read duplicated already-read data.
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
            # For size < 0 this keeps size negative, which is what we want.
            size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)

            # size < 0 means "read everything"; size == 0 means we're done.
            if size <= 0:
                break

            size -= len(chunk)

        return buffer

    def close(self):
        # Do not close the underlying stream; the wrapper does not own it.
        pass


class MissingEncoding(io.UnsupportedOperation):
    """Raised by StreamFactory.get_text_file() when no encoding is available."""
    pass


class StreamFactory:

    """Decode on the fly a binary stream that *may* have a known encoding.

    If the underlying stream has a reset() method,
    it is possible to call the get_{text,binary}_file() methods more than once.

    """

    # Exposing two typed getters (one IO[str], one IO[bytes]) is easier to
    # type-check than a single file-like object whose read() would return
    # bytes or str depending on .encoding; it also shows intent better.

    def __init__(self, file, encoding=None):
        self.file = file
        self.encoding = encoding
        # The very first get_*_file() call reads from the start already;
        # only subsequent calls need to rewind the stream.
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        """Return a text stream decoding self.file on the fly.

        Raises MissingEncoding if neither the detected encoding
        nor fallback_encoding is available.
        """
        chosen = self.encoding if self.encoding else fallback_encoding
        if chosen is None:
            raise MissingEncoding("cannot create text stream without encoding")
        reader = codecs.getreader(chosen)(self.file, errors)
        self.reset()
        return reader

    def get_binary_file(self):
        """Return the underlying binary stream."""
        self.reset()
        return self.file

    def reset(self):
        """Rewind the underlying stream on every call after the first."""
        if self.should_reset:
            try:
                self.file.reset()
            except AttributeError:
                raise io.UnsupportedOperation("underlying stream cannot be reset") from None
        self.should_reset = True

86 changes: 86 additions & 0 deletions tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import feedparser
import feedparser.api
import feedparser.datetimes
import feedparser.encodings
import feedparser.http
import feedparser.mixin
import feedparser.sanitizer
Expand Down Expand Up @@ -294,6 +295,91 @@ def test_gb2312_converted_to_gb18030_in_xml_encoding(self):
})
self.assertEqual(result.encoding, 'gb18030')

def test_prefix_file_wrapper_not_seekable(self):
    """PrefixFileWrapper over a non-seekable file: reads work, reset() fails."""
    wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', _make_file_not_seekable(b'def'))

    self.assertEqual(wrapper.read(), b'abcdef')
    self.assertEqual(wrapper.read(), b'')
    with self.assertRaises(io.UnsupportedOperation):
        wrapper.reset()
    self.assertEqual(wrapper.read(), b'')

    # Same, but attempt reset() partway through the stream.
    wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', _make_file_not_seekable(b'def'))

    self.assertEqual(wrapper.read(3), b'abc')
    with self.assertRaises(io.UnsupportedOperation):
        wrapper.reset()
    self.assertEqual(wrapper.read(), b'def')

def test_prefix_file_wrapper_no_prefix(self):
    """PrefixFileWrapper with an empty prefix behaves like the plain file."""
    wrapper = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))

    self.assertEqual(wrapper.read(1), b'a')
    self.assertEqual(wrapper.read(), b'bc')

    wrapper.reset()
    self.assertEqual(wrapper.read(), b'abc')


def make_prefix_file_wrapper_test(make_file):
    """Build a test method exercising PrefixFileWrapper over a file made by make_file."""

    def test(self):
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))

        # Read everything at once, then confirm EOF.
        self.assertEqual(wrapper.read(), b'abcdef')
        self.assertEqual(wrapper.read(), b'')

        # Fixed-size reads straddling the prefix/file boundary.
        wrapper.reset()
        for expected in (b'ab', b'cd', b'ef', b''):
            self.assertEqual(wrapper.read(2), expected)
        self.assertEqual(wrapper.read(), b'')

        # Fixed-size reads aligned with the prefix/file boundary.
        wrapper.reset()
        for expected in (b'abc', b'def', b''):
            self.assertEqual(wrapper.read(3), expected)
        self.assertEqual(wrapper.read(), b'')

        # A zero-size read returns nothing and consumes nothing.
        wrapper.reset()
        self.assertEqual(wrapper.read(0), b'')
        self.assertEqual(wrapper.read(), b'abcdef')

        # reset() is idempotent.
        wrapper.reset()
        wrapper.reset()
        self.assertEqual(wrapper.read(), b'abcdef')

    return test


def _make_file_in_the_middle(data):
prefix = b'zzzzz'
rv = io.BytesIO(prefix + data)
rv.seek(len(prefix))
return rv

class _make_file_one_by_one(io.BytesIO):
def read(self, size=-1):
if size <= 0:
return super().read(size)
return super().read(1)

class _make_file_not_seekable(io.BytesIO):
def tell(self):
raise io.UnsupportedOperation
def seek(self, *args):
raise io.UnsupportedOperation

prefix_file_wrapper_file_factories = [
    io.BytesIO,
    _make_file_in_the_middle,
    _make_file_one_by_one,
]

# Register one PrefixFileWrapper test per file factory on TestEncodings.
# Bug fix: use factory.__name__, not func.__name__ -- the function returned
# by make_prefix_file_wrapper_test() is always named 'test', so the original
# code registered every generated test under the same attribute name
# ('test_prefix_file_wrapper_test'), and only the last factory was tested.
for factory in prefix_file_wrapper_file_factories:
    func = make_prefix_file_wrapper_test(factory)
    name = 'test_prefix_file_wrapper_%s' % factory.__name__.lstrip('_')
    func.__name__ = name  # so unittest output shows the distinct name
    setattr(TestEncodings, name, func)


class TestFeedParserDict(unittest.TestCase):
"""Ensure that FeedParserDict returns values as expected and won't crash"""
Expand Down

0 comments on commit be3eab6

Please sign in to comment.