Warnings for narrow Python builds

If chars > 0xFFFF are actually encountered, a UnicodeWarning is issued. On import, an ImportWarning is issued. These are ignored by default, but can be enabled by invoking Python with `-W all`, as any good developer should do ^^. Closes RDFLib#453
joernhees · Feb 18, 2015 · b8845b5 · b8845b5
1 parent 30f0f8c
commit b8845b5
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 2 deletions.
diff --git a/rdflib/__init__.py b/rdflib/__init__.py
@@ -66,6 +66,21 @@
 logger.info("RDFLib Version: %s" % __version__)
 
 
+try:
+    unichr(0x10FFFF)
+except ValueError:
+    import warnings
+    warnings.warn(
+        'You are using a narrow Python build!\n'
+        'This means that your Python does not properly support chars > 16bit.\n'
+        'On your system chars like c=u"\\U0010FFFF" will have a len(c)==2.\n'
+        'As this can cause hard to debug problems with string processing\n'
+        '(slicing, regexp, ...) later on, we strongly advise to use a wide\n'
+        'Python build in production systems.',
+        ImportWarning)
+    del warnings
+
+
 NORMALIZE_LITERALS = True
 """
 If True - Literals lexical forms are normalized when created.

diff --git a/rdflib/plugins/parsers/notation3.py b/rdflib/plugins/parsers/notation3.py
@@ -34,6 +34,7 @@
 import os
 import re
 import codecs
+import warnings
 
 from decimal import Decimal
 
@@ -303,10 +304,21 @@ def becauseSubexpression(*args, **kargs):
 
 def unicodeExpand(m):
     try:
-        return codecs.decode(m.group(0), 'unicode_escape')
+        return unichr(int(m.group(1), 16))
     except:
         raise Exception("Invalid unicode code point: " + m.group(1))
 
+if py3compat.narrow_build:
+    def unicodeExpand(m):
+        try:
+            return unichr(int(m.group(1), 16))
+        except ValueError:
+            warnings.warn(
+                'Encountered a unicode char > 0xFFFF in a narrow python build. '
+                'Trying to degrade gracefully, but this can cause problems '
+                'later when working with the string:\n%s' % m.group(0))
+            return codecs.decode(m.group(0), 'unicode_escape')
+
 unicodeEscape4 = re.compile(
     r'\\u([0-9a-fA-F]{4})')
 unicodeEscape8 = re.compile(

diff --git a/rdflib/py3compat.py b/rdflib/py3compat.py
@@ -4,6 +4,7 @@
 import sys
 import re
 import codecs
+import warnings
 
 try:
     from functools import wraps
@@ -138,7 +139,27 @@ def sign(n):
 r_unicodeEscape = re.compile(r'(\\u[0-9A-Fa-f]{4}|\\U[0-9A-Fa-f]{8})')
 
 def _unicodeExpand(s):
-    return r_unicodeEscape.sub(lambda m: codecs.decode(m.group(0), 'unicode_escape'), s)
+    return r_unicodeEscape.sub(lambda m: unichr(int(m.group(0)[2:], 16)), s)
+
+narrow_build = False
+try:
+    unichr(0x10FFFF)
+except ValueError:
+    narrow_build = True
+
+if narrow_build:
+    def _unicodeExpand(s):
+        try:
+            return r_unicodeEscape.sub(
+                lambda m: unichr(int(m.group(0)[2:], 16)), s)
+        except ValueError:
+            warnings.warn(
+                'Encountered a unicode char > 0xFFFF in a narrow python build. '
+                'Trying to degrade gracefully, but this can cause problems '
+                'later when working with the string:\n%s' % s)
+            return r_unicodeEscape.sub(
+                lambda m: codecs.decode(m.group(0), 'unicode_escape'), s)
+
 
 def decodeStringEscape(s):