Skip to content

Commit

Permalink
Apply decode_errors to encoding and fix _Attributes crashes (#41)
Browse files Browse the repository at this point in the history
* Apply `decode_errors` to encoding as well, fixes #40

* Fix __repr__() segfault and buggy items() on _Attributes

When attrs is accessed on a temporary instance, the tree may already be
deallocated by the time the tag name is retrieved, which causes a null pointer
dereference. This fix does not solve the underlying problem, but it at least
prevents the crash.

Also fixes a bug in items() that made the method useless.

Fixes #39

* Add test for unencodable strings
  • Loading branch information
phoerious authored Jul 10, 2021
1 parent 386392c commit a19369d
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 3 deletions.
4 changes: 2 additions & 2 deletions selectolax/node.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ cdef class _Attributes:

def items(self):
for key in self.__iter__():
yield (key, self(key))
yield key, self[key]

def values(self):
for key in self.__iter__():
Expand All @@ -111,7 +111,7 @@ cdef class _Attributes:
def __repr__(self):
cdef const char *c_text
c_text = myhtml_tag_name_by_id(self.node.tree, self.node.tag_id, NULL)
tag_name = c_text.decode(_ENCODING, 'ignore')
tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
return "<%s attributes, %s items>" % (tag_name, len(self))


Expand Down
2 changes: 1 addition & 1 deletion selectolax/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ cdef class HTMLParser:
self.decode_errors = decode_errors

if isinstance(html, (str, unicode)):
bytes_html = html.encode('UTF-8')
bytes_html = html.encode('UTF-8', errors=decode_errors)
detect_encoding = False
elif isinstance(html, bytes):
bytes_html = html
Expand Down
15 changes: 15 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@ def test_encoding():
html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-8')
assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'WINDOWS-1251'

# UTF-16 not ASCII-readable
html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-16le')
assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'UTF-16LE'

# Unencodable characters in string, should not throw an exception by default
html_unencodable = b'<div>Roboto+Condensed</div>'.decode('utf-7', errors='ignore')
assert HTMLParser(html_unencodable).input_encoding == 'UTF-8'

# decode_errors='strict' should error out
try:
HTMLParser(html_unencodable, decode_errors='strict')
assert False
except Exception as e:
assert type(e) is UnicodeEncodeError


def test_parser():
html = HTMLParser("")
Expand Down

0 comments on commit a19369d

Please sign in to comment.