Skip to content

Commit

Permalink
warnings for narrow python builds
Browse files Browse the repository at this point in the history
If chars > 0xFFFF are actually encountered, a UnicodeWarning is
issued.

On import, an ImportWarning is issued. ImportWarnings are ignored by
default, but can be enabled by invoking python with `-W all`,
as any good developer should do ^^.

closes RDFLib#453
  • Loading branch information
joernhees committed Feb 18, 2015
1 parent 30f0f8c commit b8845b5
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
15 changes: 15 additions & 0 deletions rdflib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@
logger.info("RDFLib Version: %s" % __version__)


# Probe for a narrow Python build: asking for the highest code point makes
# unichr() raise ValueError when the interpreter cannot represent it.
try:
    unichr(0x10FFFF)
except ValueError:
    # ``warnings`` is imported only on this path and removed again right
    # after use, so the name does not stay bound in this module.
    import warnings
    warnings.warn(
        'You are using a narrow Python build!\n'
        'This means that your Python does not properly support chars > 16bit.\n'
        'On your system chars like c=u"\\U0010FFFF" will have a len(c)==2.\n'
        'As this can cause hard to debug problems with string processing\n'
        '(slicing, regexp, ...) later on, we strongly advise to use a wide\n'
        'Python build in production systems.',
        category=ImportWarning)
    del warnings


NORMALIZE_LITERALS = True
"""
If True - Literals lexical forms are normalized when created.
Expand Down
14 changes: 13 additions & 1 deletion rdflib/plugins/parsers/notation3.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import os
import re
import codecs
import warnings

from decimal import Decimal

Expand Down Expand Up @@ -303,10 +304,21 @@ def becauseSubexpression(*args, **kargs):

def unicodeExpand(m):
    """Expand one matched unicode escape into the character it denotes.

    :param m: regex match object whose group 1 holds the hexadecimal
        digits of the code point.
    :returns: the unicode character for that code point.
    :raises Exception: if the digits do not denote a code point that
        ``unichr`` can produce.
    """
    try:
        return unichr(int(m.group(1), 16))
    except (ValueError, OverflowError):
        # ValueError: malformed digits or a code point unichr() rejects;
        # OverflowError: a value too large for unichr() to even accept.
        # Only these are translated, so KeyboardInterrupt / SystemExit and
        # unrelated programming errors are no longer masked.
        raise Exception("Invalid unicode code point: " + m.group(1))

if py3compat.narrow_build:
    def unicodeExpand(m):
        """Narrow-build replacement for ``unicodeExpand``.

        First tries ``unichr`` on the hexadecimal digits in group 1 of the
        match *m*. If ``unichr`` raises ``ValueError``, a warning is issued
        and the whole matched escape is decoded with the ``unicode_escape``
        codec instead.
        """
        escape = m.group(0)
        try:
            expanded = unichr(int(m.group(1), 16))
        except ValueError:
            warnings.warn(
                'Encountered a unicode char > 0xFFFF in a narrow python build. '
                'Trying to degrade gracefully, but this can cause problems '
                'later when working with the string:\n%s' % escape)
            expanded = codecs.decode(escape, 'unicode_escape')
        return expanded

# Short escape form: a backslash, "u", then exactly four hex digits (group 1).
unicodeEscape4 = re.compile(r'\\u([0-9a-fA-F]{4})')
unicodeEscape8 = re.compile(
Expand Down
23 changes: 22 additions & 1 deletion rdflib/py3compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import re
import codecs
import warnings

try:
from functools import wraps
Expand Down Expand Up @@ -138,7 +139,27 @@ def sign(n):
# Matches either escape form in full: a backslash followed by "u" and four
# hex digits, or by "U" and eight hex digits.
r_unicodeEscape = re.compile(r'(\\u[0-9A-Fa-f]{4}|\\U[0-9A-Fa-f]{8})')

def _unicodeExpand(s):
    r"""Replace every \uXXXX / \UXXXXXXXX escape in *s* by its character.

    :param s: string that may contain unicode escape sequences.
    :returns: *s* with each escape matched by ``r_unicodeEscape`` replaced
        by ``unichr`` of its hexadecimal value; other text is unchanged.
    """
    def _expand(match):
        # group(0) is the whole escape; [2:] drops the leading backslash
        # and the "u"/"U" marker, leaving only the hex digits.
        return unichr(int(match.group(0)[2:], 16))

    return r_unicodeEscape.sub(_expand, s)

# narrow_build records whether unichr() rejects the highest code point
# (0x10FFFF) with a ValueError on this interpreter.
try:
    unichr(0x10FFFF)
except ValueError:
    narrow_build = True
else:
    narrow_build = False

if narrow_build:
    def _unicodeExpand(s):
        """Narrow-build replacement for ``_unicodeExpand``.

        Expands every escape matched by ``r_unicodeEscape`` in *s* with
        ``unichr``. If that raises ``ValueError``, a warning containing *s*
        is issued and the substitution is redone over the whole string with
        the ``unicode_escape`` codec instead.
        """
        def _via_unichr(match):
            # Strip the two-character "\\u" / "\\U" prefix, keep the digits.
            return unichr(int(match.group(0)[2:], 16))

        def _via_codec(match):
            return codecs.decode(match.group(0), 'unicode_escape')

        try:
            return r_unicodeEscape.sub(_via_unichr, s)
        except ValueError:
            warnings.warn(
                'Encountered a unicode char > 0xFFFF in a narrow python build. '
                'Trying to degrade gracefully, but this can cause problems '
                'later when working with the string:\n%s' % s)
        return r_unicodeEscape.sub(_via_codec, s)


def decodeStringEscape(s):

Expand Down

0 comments on commit b8845b5

Please sign in to comment.