Merge pull request #1 from Cybjit/master

Python 3 text conversion issues
pdfminer · Sep 9, 2014 · 7b620b3
2 parents 28c2a4e + a6f31a7
commit 7b620b3

Showing 6 changed files with 31 additions and 26 deletions.
diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py
@@ -37,3 +37,4 @@ def process(self, data):
 
     encrypt = decrypt = process
 
+new = Arcfour
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -180,11 +180,11 @@ def add_cid2unichr(self, cid, code):
         if isinstance(code, PSLiteral):
             # Interpret as an Adobe glyph name.
             self.cid2unichr[cid] = name2unicode(code.name)
-        elif isinstance(code, str):
+        elif isinstance(code, bytes):
             # Interpret as UTF-16BE.
-            self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
+            self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
         elif isinstance(code, int):
-            self.cid2unichr[cid] = unichr(code)
+            self.cid2unichr[cid] = six.unichr(code)
         else:
             raise TypeError(code)
         return
@@ -358,7 +358,7 @@ def do_keyword(self, pos, token):
                 e1 = nunpack(evar)
                 vlen = len(svar)
                 #assert s1 <= e1
-                for i in xrange(e1-s1+1):
+                for i in range(e1-s1+1):
                     x = sprefix+struct.pack('>L', s1+i)[-vlen:]
                     self.cmap.add_code2cid(x, cid+i)
             return
@@ -379,21 +379,21 @@ def do_keyword(self, pos, token):
         if token is self.KEYWORD_ENDBFRANGE:
             objs = [obj for (__, obj) in self.popall()]
             for (s, e, code) in choplist(3, objs):
-                if (not isinstance(s, str) or not isinstance(e, str) or
+                if (not isinstance(s, bytes) or not isinstance(e, bytes) or
                    len(s) != len(e)):
                         continue
                 s1 = nunpack(s)
                 e1 = nunpack(e)
                 #assert s1 <= e1
                 if isinstance(code, list):
-                    for i in xrange(e1-s1+1):
+                    for i in range(e1-s1+1):
                         self.cmap.add_cid2unichr(s1+i, code[i])
                 else:
                     var = code[-4:]
                     base = nunpack(var)
                     prefix = code[:-4]
                     vlen = len(var)
-                    for i in xrange(e1-s1+1):
+                    for i in range(e1-s1+1):
                         x = prefix+struct.pack('>L', base+i)[-vlen:]
                         self.cmap.add_cid2unichr(s1+i, x)
             return
@@ -404,7 +404,7 @@ def do_keyword(self, pos, token):
         if token is self.KEYWORD_ENDBFCHAR:
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
-                if isinstance(cid, str) and isinstance(code, str):
+                if isinstance(cid, bytes) and isinstance(code, bytes):
                     self.cmap.add_cid2unichr(nunpack(cid), code)
             return
 

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -99,7 +99,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
         # other shapes
         pts = []
         for p in path:
-            for i in xrange(1, len(p), 2):
+            for i in range(1, len(p), 2):
                 pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
         self.cur_item.add(LTCurve(gstate.linewidth, pts))
         return
@@ -164,7 +164,9 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
         return
 
     def write_text(self, text):
-        self.outfp.write(text.encode(self.codec, 'ignore'))
+        if self.codec:
+            text = text.encode(self.codec, 'ignore')
+        self.outfp.write(text)
         return
 
     def receive_layout(self, ltpage):
@@ -252,7 +254,7 @@ def write_header(self):
 
     def write_footer(self):
         self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
-                   ', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno)))
+                   ', '.join('<a href="#%s">%s</a>' % (i, i) for i in range(1, self.pageno)))
         self.write('</body></html>\n')
         return
 

diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
@@ -342,7 +342,7 @@ def compute_u(self, key):
             hash.update(self.docid[0])  # 3
             result = ARC4.new(key).encrypt(hash.digest())  # 4
             for i in range(1, 20):  # 5
-                k = b''.join(chr(ord(c) ^ i) for c in key)
+                k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key))
                 result = ARC4.new(k).encrypt(result)
             result += result  # 6
             return result

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -45,7 +45,7 @@ def get_widths(seq):
             r.append(v)
             if len(r) == 3:
                 (char1, char2, w) = r
-                for i in xrange(char1, char2+1):
+                for i in range(char1, char2+1):
                     widths[i] = w
                 r = []
     return widths
@@ -68,7 +68,7 @@ def get_widths2(seq):
             r.append(v)
             if len(r) == 5:
                 (char1, char2, w, vx, vy) = r
-                for i in xrange(char1, char2+1):
+                for i in range(char1, char2+1):
                     widths[i] = (w, (vx, vy))
                 r = []
     return widths
@@ -266,7 +266,7 @@ def __init__(self, fp):
             self.fp = fp
             self.offsets = []
             (count, offsize) = struct.unpack('>HB', self.fp.read(3))
-            for i in xrange(count+1):
+            for i in range(count+1):
                 self.offsets.append(nunpack(self.fp.read(offsize)))
             self.base = self.fp.tell()-1
             self.fp.seek(self.base+self.offsets[-1])
@@ -283,7 +283,7 @@ def __getitem__(self, i):
             return self.fp.read(self.offsets[i+1]-self.offsets[i])
 
         def __iter__(self):
-            return iter(self[i] for i in xrange(len(self)))
+            return iter(self[i] for i in range(len(self)))
 
     def __init__(self, name, fp):
         self.name = name
@@ -323,9 +323,9 @@ def __init__(self, name, fp):
             # Format 1
             (n,) = struct.unpack('B', self.fp.read(1))
             code = 0
-            for i in xrange(n):
+            for i in range(n):
                 (first, nleft) = struct.unpack('BB', self.fp.read(2))
-                for gid in xrange(first, first+nleft+1):
+                for gid in range(first, first+nleft+1):
                     self.code2gid[code] = gid
                     self.gid2code[gid] = code
                     code += 1
@@ -348,9 +348,9 @@ def __init__(self, name, fp):
             # Format 1
             (n,) = struct.unpack('B', self.fp.read(1))
             sid = 0
-            for i in xrange(n):
+            for i in range(n):
                 (first, nleft) = struct.unpack('BB', self.fp.read(2))
-                for gid in xrange(first, first+nleft+1):
+                for gid in range(first, first+nleft+1):
                     name = self.getstr(sid)
                     self.name2gid[name] = gid
                     self.gid2name[gid] = name
@@ -384,7 +384,7 @@ def __init__(self, name, fp):
         self.tables = {}
         self.fonttype = fp.read(4)
         (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
-        for _ in xrange(ntables):
+        for _ in range(ntables):
             (name, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16))
             self.tables[name] = (offset, length)
         return
@@ -397,7 +397,7 @@ def create_unicode_map(self):
         fp.seek(base_offset)
         (version, nsubtables) = struct.unpack('>HH', fp.read(4))
         subtables = []
-        for i in xrange(nsubtables):
+        for i in range(nsubtables):
             subtables.append(struct.unpack('>HHL', fp.read(8)))
         char2gid = {}
         # Only supports subtable type 0, 2 and 4.
@@ -413,15 +413,15 @@ def create_unicode_map(self):
                     firstbytes[k//8] = i
                 nhdrs = max(subheaderkeys)//8 + 1
                 hdrs = []
-                for i in xrange(nhdrs):
+                for i in range(nhdrs):
                     (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
                     hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
                 for (i, firstcode, entcount, delta, pos) in hdrs:
                     if not entcount:
                         continue
                     first = firstcode + (firstbytes[i] << 8)
                     fp.seek(pos)
-                    for c in xrange(entcount):
+                    for c in range(entcount):
                         gid = struct.unpack('>H', fp.read(2))
                         if gid:
                             gid += delta
@@ -438,10 +438,10 @@ def create_unicode_map(self):
                 for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
                     if idr:
                         fp.seek(pos+idr)
-                        for c in xrange(sc, ec+1):
+                        for c in range(sc, ec+1):
                             char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff
                     else:
-                        for c in xrange(sc, ec+1):
+                        for c in range(sc, ec+1):
                             char2gid[c] = (c + idd) & 0xffff
             else:
                 assert 0

diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
@@ -85,6 +85,8 @@ def usage():
         outfp = open(outfile, 'wb')
     else:
         outfp = sys.stdout
+        if outfp.encoding is not None:
+            codec = None
     if outtype == 'text':
         device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                imagewriter=imagewriter)