Apply decode_errors to encoding and fix _Attributes crashes (#41)

* Apply `decode_errors` to encoding as well, fixes #40
* Fix `__repr__()` segfault and buggy `items()` on `_Attributes`

  When calling `attrs` on a temporary instance, the tree may already be deallocated by the time the tag name is retrieved, which causes a null pointer dereference. This fix doesn't really solve the underlying problem, but at least it prevents the crash. Also fixes a bug in `items()` that made the method useless. Fixes #39
* Add test for unencodable strings
rushter · Jul 10, 2021 · a19369d · a19369d
1 parent 386392c
commit a19369d
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 3 deletions.
diff --git a/selectolax/node.pxi b/selectolax/node.pxi
@@ -88,7 +88,7 @@ cdef class _Attributes:
 
     def items(self):
         for key in self.__iter__():
-            yield (key, self(key))
+            yield key, self[key]
 
     def values(self):
         for key in self.__iter__():
@@ -111,7 +111,7 @@ cdef class _Attributes:
     def __repr__(self):
         cdef const char *c_text
         c_text = myhtml_tag_name_by_id(self.node.tree, self.node.tag_id, NULL)
-        tag_name = c_text.decode(_ENCODING, 'ignore')
+        tag_name = c_text.decode(_ENCODING, 'ignore') if c_text != NULL else 'unknown'
         return "<%s attributes, %s items>" % (tag_name, len(self))
 
 

diff --git a/selectolax/parser.pyx b/selectolax/parser.pyx
@@ -34,7 +34,7 @@ cdef class HTMLParser:
         self.decode_errors = decode_errors
 
         if isinstance(html, (str, unicode)):
-            bytes_html = html.encode('UTF-8')
+            bytes_html = html.encode('UTF-8', errors=decode_errors)
             detect_encoding = False
         elif isinstance(html, bytes):
             bytes_html = html

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -25,6 +25,21 @@ def test_encoding():
     html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-8')
     assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'WINDOWS-1251'
 
+    # UTF-16 not ASCII-readable
+    html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-16le')
+    assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'UTF-16LE'
+
+    # Unencodable characters in string, should not throw an exception by default
+    html_unencodable = b'<div>Roboto+Condensed</div>'.decode('utf-7', errors='ignore')
+    assert HTMLParser(html_unencodable).input_encoding == 'UTF-8'
+
+    # decode_errors='strict' should error out
+    try:
+        HTMLParser(html_unencodable, decode_errors='strict')
+        assert False
+    except Exception as e:
+        assert type(e) is UnicodeEncodeError
+
 
 def test_parser():
     html = HTMLParser("")