Merge with #532, fix unicode filenames with surrogateescape

gitpython-developers · Oct 16, 2016 · ec731f4 · ec731f4
2 parents b2efa1b + 9e4a454
commit ec731f4
Show file tree

Hide file tree

Showing 7 changed files with 209 additions and 18 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.0.9dev0
+2.0.10dev0
diff --git a/git/compat.py b/git/compat.py
@@ -10,6 +10,8 @@
 import locale
 import os
 import sys
+import codecs
+
 
 from gitdb.utils.compat import (
     xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
     if isinstance(s, unicode):
         return s
     elif isinstance(s, bytes):
-        return s.decode(defenc, 'replace')
+        return s.decode(defenc, 'surrogateescape')
     elif s is not None:
         raise TypeError('Expected bytes or text, but got %r' % (s,))
 
@@ -121,3 +123,191 @@ def __str__(self):
     else:  # Python 2
         def __str__(self):
             return self.__unicode__().encode(defenc)
+
+
+"""
+This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
+handler of Python 3.
+Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
+"""
+
+# This code is released under the Python license and the BSD 2-clause license
+
+
+FS_ERRORS = 'surrogateescape'
+
+#     # -- Python 2/3 compatibility -------------------------------------
+#     FS_ERRORS = 'my_surrogateescape'
+
+def u(text):
+    """
+    Return ``text`` as a unicode string: unchanged on Python 3, decoded with
+    the ``unicode_escape`` codec on Python 2.
+    """
+    return text if PY3 else text.decode('unicode_escape')
+
+def b(data):
+    """
+    Return ``data`` as a byte string: encoded as latin1 on Python 3, unchanged
+    on Python 2.
+    """
+    return data.encode('latin1') if PY3 else data
+
+# Interpreter-specific constructors, chosen once at import time:
+#   _unichr   -- code point -> unicode character
+#   bytes_chr -- integer    -> byte string of length one
+# The unused alternative of each conditional expression is never evaluated,
+# so ``unichr`` is only looked up on Python 2.
+_unichr = chr if PY3 else unichr
+bytes_chr = (lambda code: bytes((code,))) if PY3 else chr
+
+def surrogateescape_handler(exc):
+    """
+    Codec error handler: a pure Python implementation of PEP 383, the
+    "surrogateescape" error handler of Python 3.
+
+    On decoding, undecodable bytes are replaced by Unicode characters U+DCxx;
+    on encoding, these characters are translated back into the original bytes.
+
+    Returns the tuple (replacement, exc.end). Re-raises ``exc`` if it is
+    neither a UnicodeDecodeError nor a UnicodeEncodeError, or if the offending
+    slice cannot be converted.
+    """
+    offending = exc.object[exc.start:exc.end]
+
+    if isinstance(exc, UnicodeDecodeError):
+        # the offending slice is a byte-string in this case
+        convert = replace_surrogate_decode
+    elif isinstance(exc, UnicodeEncodeError):
+        # Note from the original author: for u'\udcc3'.encode('ascii', <this
+        # handler>) both Python 2.x and 3.x raise an exception anyway after
+        # this function is called. It seems that the strict encoder is called
+        # to encode the unicode string that this function returns.
+        convert = replace_surrogate_encode
+    else:
+        raise exc
+
+    try:
+        replacement = convert(offending)
+    except NotASurrogateError:
+        raise exc
+    return (replacement, exc.end)
+
+
+class NotASurrogateError(Exception):
+    """Raised when a byte or character cannot be treated as an escaped surrogate."""
+
+
+def replace_surrogate_encode(mystring):
+    """
+    Translate escaped surrogates back into the characters they stand for.
+
+    Every character of ``mystring`` has to be a code point in U+DC00..U+DCFF;
+    it is replaced by the character whose ordinal is (code - 0xDC00).
+
+    Returns a (unicode) string, not the more logical bytes, because the codecs
+    register_error functionality expects this.
+
+    Raises NotASurrogateError for any other character, so that the calling
+    error handler can fail with the original exception.
+    """
+    decoded = []
+    for ch in mystring:
+        code = ord(ch)
+        # Anything outside U+DC00..U+DCFF cannot be mapped back. This includes
+        # the high surrogates U+D800..U+DBFF, for which (code - 0xDC00) would
+        # be negative and make _unichr() fail with a ValueError.
+        if not 0xDC00 <= code <= 0xDCFF:
+            raise NotASurrogateError
+        decoded.append(_unichr(code - 0xDC00))
+    return str().join(decoded)
+
+
+def replace_surrogate_decode(mybytes):
+    """
+    Map every item of ``mybytes`` to a unicode character and return the joined
+    (unicode) string. Values up to 0x7F are kept as they are, values in
+    0x80..0xFF become U+DC80..U+DCFF, anything larger raises
+    NotASurrogateError.
+    """
+    chars = []
+    for item in mybytes:
+        # Items may already be ints (newbytes) or length-one native strings
+        # on Py2.
+        code = item if isinstance(item, int) else ord(item)
+        if code <= 0x7F:
+            chars.append(_unichr(code))
+        elif code <= 0xFF:
+            chars.append(_unichr(0xDC00 + code))
+        else:
+            raise NotASurrogateError
+    return str().join(chars)
+
+
+def encodefilename(fn):
+    """
+    Encode the unicode file name ``fn`` to bytes according to FS_ENCODING.
+
+    For 'ascii' and 'utf-8' the name is encoded character by character and
+    escaped surrogates U+DC80..U+DCFF are turned back into the bytes
+    0x80..0xFF. Any other encoding is delegated to
+    fn.encode(FS_ENCODING, FS_ERRORS).
+
+    Raises UnicodeEncodeError in the 'ascii' and 'utf-8' cases for a character
+    that is neither encodable nor an escaped surrogate.
+    """
+    if FS_ENCODING == 'ascii':
+        # ASCII encoder of Python 2 expects that the error handler returns a
+        # Unicode string encodable to ASCII, whereas our surrogateescape error
+        # handler has to return bytes in 0x80-0xFF range. Hence the manual loop.
+        pieces = []
+        for pos, char in enumerate(fn):
+            point = ord(char)
+            if 0xDC80 <= point <= 0xDCFF:
+                pieces.append(bytes_chr(point - 0xDC00))
+            elif point < 128:
+                pieces.append(bytes_chr(point))
+            else:
+                raise UnicodeEncodeError(
+                    FS_ENCODING, fn, pos, pos + 1,
+                    'ordinal not in range(128)')
+        return bytes().join(pieces)
+
+    if FS_ENCODING == 'utf-8':
+        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
+        # doesn't go through our error handler. Hence the manual loop.
+        pieces = []
+        for pos, char in enumerate(fn):
+            point = ord(char)
+            if 0xDC80 <= point <= 0xDCFF:
+                pieces.append(bytes_chr(point - 0xDC00))
+            elif 0xD800 <= point <= 0xDFFF:
+                raise UnicodeEncodeError(
+                    FS_ENCODING, fn, pos, pos + 1,
+                    'surrogates not allowed')
+            else:
+                pieces.append(char.encode('utf-8'))
+        return bytes().join(pieces)
+
+    return fn.encode(FS_ENCODING, FS_ERRORS)
+
+def decodefilename(fn):
+    """
+    Decode the byte string ``fn`` using FS_ENCODING together with the
+    FS_ERRORS error handler and return the result.
+    """
+    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
+    return decoded_name
+
+# Sample byte name and its escaped unicode form for the 'ascii' setting.
+# Alternative settings kept for reference:
+#   FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
+#   FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
+fn = b('[abc\xff]')
+encoded = u('[abc\udcff]')
+
+# Store the filesystem encoding under the normalized name reported by
+# codecs.lookup(); for example, we expect "utf-8", not "UTF8".
+FS_ENCODING = codecs.lookup('ascii').name
+
+
+def register_surrogateescape():
+    """
+    Install surrogateescape_handler under the FS_ERRORS name on Python 2,
+    unless codecs already knows an error handler of that name.
+    Does nothing on Python 3.
+    """
+    if not PY3:
+        try:
+            codecs.lookup_error(FS_ERRORS)
+        except LookupError:
+            # no handler of that name yet, install ours
+            codecs.register_error(FS_ERRORS, surrogateescape_handler)
+
+
+# Import-time probe: decode a byte sequence with the "surrogateescape" error
+# handler. Where that handler is unknown, codecs reports it with a
+# LookupError and the pure-Python implementation is registered instead.
+# Only LookupError is caught, so KeyboardInterrupt, SystemExit and unrelated
+# failures are no longer swallowed.
+# NOTE(review): assumes b"\x9f" is undecodable in defenc, so that the error
+# handler is actually looked up -- confirm where defenc comes from.
+try:
+    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
+except LookupError:
+    register_surrogateescape()
diff --git a/git/ext/gitdb b/git/ext/gitdb
diff --git a/git/objects/fun.py b/git/objects/fun.py
@@ -2,6 +2,7 @@
 from stat import S_ISDIR
 from git.compat import (
     byte_ord,
+    safe_decode,
     defenc,
     xrange,
     text_type,
@@ -76,11 +77,7 @@ def tree_entries_from_data(data):
         # default encoding for strings in git is utf8
         # Only use the respective unicode object if the byte stream was encoded
         name = data[ns:i]
-        try:
-            name = name.decode(defenc)
-        except UnicodeDecodeError:
-            pass
-        # END handle encoding
+        name = safe_decode(name)
 
         # byte is NULL, get next 20
         i += 1

diff --git a/git/test/performance/test_commit.py b/git/test/performance/test_commit.py
@@ -52,7 +52,7 @@ def test_iteration(self):
             # END for each object
         # END for each commit
         elapsed_time = time() - st
-        print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
+        print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
               % (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr)
 
     def test_commit_traversal(self):

diff --git a/git/test/test_fun.py b/git/test/test_fun.py
@@ -1,10 +1,8 @@
 from io import BytesIO
-from stat import (
-    S_IFDIR,
-    S_IFREG,
-    S_IFLNK
-)
+from stat import S_IFDIR, S_IFREG, S_IFLNK
+from unittest.case import skipIf
 
+from git.compat import PY3
 from git.index import IndexFile
 from git.index.fun import (
     aggressive_tree_merge
@@ -253,6 +251,12 @@ def test_tree_traversal_single(self):
             assert entries
         # END for each commit
 
-    def test_tree_entries_from_data_with_failing_name_decode(self):
+    @skipIf(PY3, 'odd types returned ... maybe figure it out one day')
+    def test_tree_entries_from_data_with_failing_name_decode_py2(self):
+        r = tree_entries_from_data(b'100644 \x9f\0aaa')
+        assert r == [('aaa', 33188, u'\udc9f')], r
+
+    @skipIf(not PY3, 'odd types returned ... maybe figure it out one day')
+    def test_tree_entries_from_data_with_failing_name_decode_py3(self):
         r = tree_entries_from_data(b'100644 \x9f\0aaa')
-        assert r == [(b'aaa', 33188, b'\x9f')], r
+        assert r == [(b'aaa', 33188, '\udc9f')], r
diff --git a/setup.py b/setup.py
@@ -64,7 +64,7 @@ def _stamp_version(filename):
     else:
         print("WARNING: Couldn't find version line in file %s" % filename, file=sys.stderr)
 
-install_requires = ['gitdb >= 0.6.4']
+install_requires = ['gitdb2 >= 2.0.0']
 extras_require = {
     ':python_version == "2.6"': ['ordereddict'],
 }
@@ -100,7 +100,7 @@ def _stamp_version(filename):
     package_data={'git.test': ['fixtures/*']},
     package_dir={'git': 'git'},
     license="BSD License",
-    requires=['gitdb (>=0.6.4)'],
+    requires=['gitdb2 (>=2.0.0)'],
     install_requires=install_requires,
     test_requirements=test_requires + install_requires,
     zip_safe=False,
+7 −0		README.rst
+1 −1		gitdb/__init__.py
+0 −1,154		gitdb/_delta_apply.c
+0 −6		gitdb/_delta_apply.h
+0 −107		gitdb/_fun.c
+1 −1		gitdb/ext/smmap
+1 −1		gitdb/fun.py
+1 −1		gitdb/pack.py
+1 −1		gitdb/stream.py
+38 −118		setup.py