Python 3 compatibility #2

Merged · 1 commit · Oct 4, 2016
7 changes: 4 additions & 3 deletions chemdataextractor/biblio/bibtex.py
@@ -66,7 +66,7 @@ def __init__(self, data, **kwargs):

     def _next_token(self, skipws=True):
         """Increment _token to the next token and return it."""
-        self._token = self._tokens.next().group(0)
+        self._token = next(self._tokens).group(0)
         return self._next_token() if skipws and self._token.isspace() else self._token

     def parse(self):
@@ -184,13 +184,14 @@ def size(self):
     @property
     def records_list(self):
         """Return the records as a list of dictionaries."""
-        return self.records.values()
+        return list(self.records.values())

     @property
     def metadata(self):
         """Return metadata for the parsed collection of records."""
         auto = {u'records': self.size}
-        return dict(auto.items() + self.meta.items())
+        auto.update(self.meta)
+        return auto

     @property
     def json(self):
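Note on the pattern: in Python 3, iterators no longer have a `.next()` method (the `next()` built-in replaces it), `dict.values()` and `dict.items()` return views rather than lists, and views cannot be concatenated with `+`, so merging is done with `update()`. A minimal standalone sketch of the dict handling, with toy data rather than the project's records:

```python
# Illustrative sketch of the Python 3 dict changes, with toy data (not project code).
records = {'a': {'title': 'First'}, 'b': {'title': 'Second'}}
meta = {u'source': 'example.bib'}

# dict.values() is a view in Python 3; wrap it in list() where a real list is needed.
records_list = list(records.values())

# dict(a.items() + b.items()) fails in Python 3 because views do not support '+';
# merge with update() instead.
auto = {u'records': len(records)}
auto.update(meta)

print(records_list)
print(auto)
```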
4 changes: 2 additions & 2 deletions chemdataextractor/doc/text.py
@@ -374,7 +374,7 @@ def abbreviation_definitions(self):
     @memoized_property
     def ner_tagged_tokens(self):
         """"""
-        return zip(self.raw_tokens, self.ner_tags)
+        return list(zip(self.raw_tokens, self.ner_tags))

     @memoized_property
     def ner_tags(self):
@@ -486,7 +486,7 @@ def tags(self):

     @property
     def tagged_tokens(self):
-        return zip(self.raw_tokens, self.tags)
+        return list(zip(self.raw_tokens, self.tags))

     @property
     def records(self):
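`zip()` returns a one-shot lazy iterator on Python 3, so properties that callers index, re-iterate, or measure with `len()` need an explicit `list()`. A small illustration with made-up tokens and tags (not the project's token objects):

```python
# Made-up tokens and tags; Sentence objects are not needed for the illustration.
raw_tokens = ['mp', '120', '°C']
tags = ['NN', 'CD', 'NN']

lazy = zip(raw_tokens, tags)          # one-shot iterator on Python 3
eager = list(zip(raw_tokens, tags))   # materialised list, safe to index and reuse

print(eager[0])      # ('mp', 'NN')
print(list(lazy))    # consumes the iterator
print(list(lazy))    # [] on Python 3: already exhausted, hence the list() wrapping
```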
2 changes: 1 addition & 1 deletion chemdataextractor/nlp/tag.py
@@ -446,5 +446,5 @@ def tag(self, tokens):
             if not tags[start_token] == 'I-%s' % self.entity:
                 tags[start_token] = 'B-%s' % self.entity
             tags[start_token+1:end_token+1] = ['I-%s' % self.entity] * (end_token - start_token)
-        tokentags = zip(tokens, tags)
+        tokentags = list(zip(tokens, tags))
         return tokentags
2 changes: 1 addition & 1 deletion chemdataextractor/scrape/base.py
@@ -140,7 +140,7 @@ def __new__(mcs, name, bases, attrs):
         for attr_name, attr_value in six.iteritems(attrs):
             if isinstance(attr_value, BaseField):
                 # Set the name attribute on the field to the attribute name on the Entity
-                attr_value.name = unicode(attr_name)
+                attr_value.name = six.text_type(attr_name)
                 fields[attr_name] = attr_value
         #attrs['fields'] = fields
         # Set default _meta values, then update with any custom definitions from meta
2 changes: 1 addition & 1 deletion chemdataextractor/scrape/clean.py
@@ -129,7 +129,7 @@ def __call__(self, doc):
             if parent is None:
                 continue
             # Append the text to previous tail (or parent text if no previous), ensuring newline if block level
-            if el.text and isinstance(el.tag, basestring):
+            if el.text and isinstance(el.tag, six.string_types):
                 if previous is None:
                     parent.text = (parent.text or '') + el.text
                 else:
2 changes: 1 addition & 1 deletion chemdataextractor/scrape/fields.py
@@ -149,4 +149,4 @@ def process(self, value):
         return None

     def serialize(self, value):
-        return unicode(value.isoformat())
+        return six.text_type(value.isoformat())
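The `unicode` and `basestring` built-ins are gone in Python 3; `six.text_type` (unicode on Python 2, `str` on Python 3) and `six.string_types` cover both interpreters. A minimal sketch of the idiom, independent of the scrape classes:

```python
import six

value = 42

# six.text_type is unicode on Python 2 and str on Python 3.
name = six.text_type(value)

# six.string_types replaces the removed basestring built-in in isinstance checks.
if isinstance(name, six.string_types):
    print('serialized to a string of length', len(name))
```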
5 changes: 2 additions & 3 deletions chemdataextractor/scrape/pub/rsc.py
@@ -15,7 +15,6 @@
 from __future__ import unicode_literals
 import logging
 import re
-import urllib

 from bs4 import UnicodeDammit
 from lxml.etree import fromstring
@@ -266,7 +265,7 @@ def parse_rsc_html(htmlstring):

 def replace_rsc_img_chars(document):
     """Replace image characters with unicode equivalents."""
-    image_re = re.compile(ur'http://www.rsc.org/images/entities/(?:h[23]+_)?(?:[ib]+_)?char_([0-9a-f]{4})(?:_([0-9a-f]{4}))?\.gif')
+    image_re = re.compile('http://www.rsc.org/images/entities/(?:h[23]+_)?(?:[ib]+_)?char_([0-9a-f]{4})(?:_([0-9a-f]{4}))?\.gif')
     for img in document.xpath('.//img[starts-with(@src, "http://www.rsc.org/images/entities/")]'):
         m = image_re.match(img.get('src'))
         if m:
@@ -378,7 +377,7 @@ class RscChemicalMention(Entity):

     process_text = normalize
     process_chemspider_id = Chain(LStrip('http://www.chemspider.com/Chemical-Structure.'), RStrip('.html'), Discard(''))
-    process_inchi = Chain(LStrip('http://www.chemspider.com/Search.aspx?q='), urllib.unquote, six.text_type.strip)
+    process_inchi = Chain(LStrip('http://www.chemspider.com/Search.aspx?q='), six.moves.urllib.parse.unquote, six.text_type.strip)


 class RscImage(Entity):
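Python 3 also drops the `ur''` literal prefix (hence the plain regex string above) and moves `urllib.unquote` to `urllib.parse.unquote`; `six.moves.urllib.parse` exposes it under one name on both interpreters. A small sketch of the unquoting step, using a made-up percent-encoded query rather than a real ChemSpider response:

```python
from six.moves.urllib.parse import unquote

# Hypothetical percent-encoded InChI query, for illustration only.
url = 'http://www.chemspider.com/Search.aspx?q=InChI%3D1S%2FCH4%2Fh1H4'
query = url.split('?q=', 1)[1]

# unquote is urllib.unquote on Python 2 and urllib.parse.unquote on Python 3.
print(unquote(query))  # InChI=1S/CH4/h1H4
```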
10 changes: 6 additions & 4 deletions chemdataextractor/text/latex.py
@@ -17,6 +17,8 @@
 import re
 import string

+import six
+
 from . import NAME_SMALL, SMALL


@@ -66,16 +68,16 @@ def latex_to_unicode(text, capitalize=False):
                 res.append(c.lower())
         text = ''.join(res)
     if any(i in text for i in ['\\', '{', '}', '$', '&', '%', '#', '_']):
-        for k, v in LATEX_MAPPINGS.iteritems():
+        for k, v in six.iteritems(LATEX_MAPPINGS):
             text = text.replace(k, v)
-        for k, v in LATEX_SUB_MAPPINGS.iteritems():
+        for k, v in six.iteritems(LATEX_SUB_MAPPINGS):
             text = text.replace(k, v)
         for mod in ['mathbb', 'mathbf', 'mathbit', 'mathfrak', 'mathrm', 'mathscr', 'mathsf', 'mathsfbf', 'mathsfbfsl',
                     'mathsfsl', 'mathsl', 'mathslbb', 'mathtt']:
             text = re.sub(r'\\%s\{([\\\w]+)\}' % mod, r'\1', text)
-        for k, v in LATEX_SUB_SUB_MAPPINGS.iteritems():
+        for k, v in six.iteritems(LATEX_SUB_SUB_MAPPINGS):
             text = text.replace(k, v)
-        for k, v in LATEX_COMBINING_CHARS.iteritems():
+        for k, v in six.iteritems(LATEX_COMBINING_CHARS):
             text = re.sub(r'%s\{?(\w)\}?' % k, r'\1%s' % v, text)
     text = re.sub(r'\\noopsort\{.*?\}', r'', text)
     text = re.sub(r'\\path\|(.*?)\|', r'\1', text)
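`dict.iteritems()` no longer exists in Python 3; `six.iteritems()` calls `iteritems()` on Python 2 and `items()` on Python 3, so the replacement loops run unchanged. An illustrative sketch with a toy mapping standing in for the LaTeX tables:

```python
import six

# Toy stand-in for the LaTeX replacement tables.
MAPPINGS = {'\\alpha': 'α', '\\beta': 'β'}

text = 'the \\alpha and \\beta phases'
for k, v in six.iteritems(MAPPINGS):  # .iteritems() on Python 2, .items() on Python 3
    text = text.replace(k, v)
print(text)  # the α and β phases
```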
3 changes: 1 addition & 2 deletions chemdataextractor/text/processors.py
@@ -16,7 +16,6 @@
 from abc import ABCMeta, abstractmethod
 import logging
 import re
-from urlparse import urlparse

 import six
 from . import EMAIL_RE, APOSTROPHES
@@ -128,7 +127,7 @@ def floats(s):

 def strip_querystring(url):
     """Remove the querystring from the end of a URL."""
-    p = urlparse(url)
+    p = six.moves.urllib.parse.urlparse(url)
     return p.scheme + "://" + p.netloc + p.path

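The Python 2 `urlparse` module became `urllib.parse` in Python 3, so the import goes through `six.moves`. Roughly how the updated `strip_querystring` behaves, sketched with a from-import and an example URL that is not taken from the project:

```python
from six.moves.urllib.parse import urlparse  # urlparse module on Py2, urllib.parse on Py3

def strip_querystring(url):
    """Remove the querystring from the end of a URL."""
    p = urlparse(url)
    return p.scheme + '://' + p.netloc + p.path

print(strip_querystring('https://www.example.org/article/view.html?utm_source=feed'))
# https://www.example.org/article/view.html
```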
2 changes: 1 addition & 1 deletion requirements.txt
@@ -5,7 +5,7 @@ cssselect>=0.9.2
 DAWG>=0.7.8
 lxml>=3.6.4
 nltk>=3.2.1
-pdfminer>=20140328
+pdfminer.six>=20160614
 python-crfsuite>=0.8.4
 python-dateutil>=2.5.3
 requests>=2.11.1
5 changes: 2 additions & 3 deletions scripts/melting_points.py
@@ -16,20 +16,19 @@
 import copy
 from collections import defaultdict
 import gzip
-from itertools import izip
 import json
 import logging
 import math
 import os
 import re
 import shutil

 import pickle

 import cirpy
 from rdkit import Chem
 from rdkit.Chem import AllChem
 from rdkit.Chem import rdMolDescriptors
+import six

 from chemdataextractor import Document
 from chemdataextractor.doc import Paragraph, Table
@@ -162,7 +161,7 @@ def standardize_results():
     n2s = {}
     with open('opsin_input.txt') as op_in:
         with open('opsin_output.txt') as op_out:
-            for name, smiles in izip(op_in, op_out):
+            for name, smiles in six.moves.zip(op_in, op_out):
                 n2s[name.strip().decode('utf8')] = smiles.strip().decode('utf8')

     result_dir = '../examples/mp/results'
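`itertools.izip` was removed in Python 3, where the built-in `zip` is already lazy; `six.moves.zip` resolves to `izip` on Python 2 and `zip` on Python 3, keeping the pairing of the two OPSIN files memory-efficient on both. A tiny sketch with in-memory lists instead of the files:

```python
import six

# In-memory stand-ins for the opsin_input.txt / opsin_output.txt lines.
names = ['benzene', 'toluene']
smiles = ['c1ccccc1', 'Cc1ccccc1']

# six.moves.zip is itertools.izip on Python 2 and the built-in zip on Python 3.
n2s = {}
for name, smi in six.moves.zip(names, smiles):
    n2s[name.strip()] = smi.strip()
print(n2s)
```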
4 changes: 2 additions & 2 deletions setup.py
@@ -25,8 +25,8 @@
     entry_points={'console_scripts': ['cde = chemdataextractor.cli:cli']},
     tests_require=['pytest'],
     install_requires=[
-        'appdirs', 'beautifulsoup4', 'click', 'cssselect', 'lxml', 'nltk', 'pdfminer', 'python-dateutil', 'requests',
-        'schematics', 'six', 'python-crfsuite', 'DAWG'
+        'appdirs', 'beautifulsoup4', 'click', 'cssselect', 'lxml', 'nltk', 'pdfminer.six', 'python-dateutil',
+        'requests', 'schematics', 'six', 'python-crfsuite', 'DAWG'
     ],
     classifiers=[
         'Intended Audience :: Developers',
3 changes: 2 additions & 1 deletion tests/test_parse_ir.py
@@ -15,6 +15,7 @@
 from __future__ import unicode_literals
 import logging
 import unittest
+
 from lxml import etree

 from chemdataextractor.doc.text import Sentence
@@ -32,7 +33,7 @@ def do_parse(self, input, expected):
         s = Sentence(input)
         log.debug(s)
         log.debug(s.tagged_tokens)
-        result = ir.scan(s.tagged_tokens).next()[0]
+        result = next(ir.scan(s.tagged_tokens))[0]
         log.debug(etree.tostring(result, pretty_print=True, encoding='unicode'))
         self.assertEqual(expected, etree.tostring(result, encoding='unicode'))
         for c in IrParser().parse(s.tagged_tokens):
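Generators lost their `.next()` method in Python 3; the `next()` built-in works on both interpreters, which is why the tests call `next(parser.scan(...))`. A toy generator illustrating the change (the names below are made up, not the parser API):

```python
def scan(tokens):
    """Toy generator standing in for a parser's scan() results."""
    for position, token in enumerate(tokens):
        yield token, position

results = scan(['50', '-', '52', '°C'])
first = next(results)   # works on Python 2 and 3; results.next() is Python 2 only
print(first)            # ('50', 0)
```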
3 changes: 2 additions & 1 deletion tests/test_parse_mp.py
@@ -15,6 +15,7 @@
 from __future__ import unicode_literals
 import logging
 import unittest
+
 from lxml import etree

 from chemdataextractor.doc.text import Sentence, Paragraph
@@ -33,7 +34,7 @@ def do_parse(self, input, expected):
         s = Sentence(input)
         log.debug(s)
         log.debug(s.tagged_tokens)
-        result = mp_phrase.scan(s.tagged_tokens).next()[0]
+        result = next(mp_phrase.scan(s.tagged_tokens))[0]
         log.debug(etree.tostring(result, pretty_print=True, encoding='unicode'))
         self.assertEqual(expected, etree.tostring(result, encoding='unicode'))

3 changes: 2 additions & 1 deletion tests/test_parse_nmr.py
@@ -15,6 +15,7 @@
 from __future__ import unicode_literals
 import logging
 import unittest
+
 from lxml import etree

 from chemdataextractor.doc.text import Sentence
@@ -32,7 +33,7 @@ def do_parse(self, input, expected):
         s = Sentence(input)
         log.debug(s)
         log.debug(s.tagged_tokens)
-        result = nmr.scan(s.tagged_tokens).next()[0]
+        result = next(nmr.scan(s.tagged_tokens))[0]
         log.debug(etree.tostring(result, pretty_print=True, encoding='unicode'))
         self.assertEqual(expected, etree.tostring(result, encoding='unicode'))

3 changes: 2 additions & 1 deletion tests/test_parse_uvvis.py
@@ -15,6 +15,7 @@
 from __future__ import unicode_literals
 import logging
 import unittest
+
 from lxml import etree

 from chemdataextractor.doc.text import Sentence
@@ -33,7 +34,7 @@ def do_parse(self, input, expected):
         s = Sentence(input)
         log.debug(s)
         log.debug(s.tagged_tokens)
-        result = uvvis.scan(s.tagged_tokens).next()[0]
+        result = next(uvvis.scan(s.tagged_tokens))[0]
         log.debug(etree.tostring(result, pretty_print=True, encoding='unicode'))
         self.assertEqual(expected, etree.tostring(result, encoding='unicode'))
         for c in UvvisParser().parse(s.tagged_tokens):