Commit d83043d: Fix deprecation warnings for regex string literals. Fix piskvorky#1646 (piskvorky#1649)

* Fix deprecation warnings for regex string literals. Fix piskvorky#1646

Add the raw (r) prefix to all regex string literals so Python 3 stops complaining about invalid escape sequences.

* Fix two more occurrences of non-raw regex string literals
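
For context, a minimal sketch of what the change does (and does not do): in a plain string literal Python leaves an unrecognized escape such as \d intact, but Python 3.6 deprecated that fallback, so the same pattern must be spelled with a doubled backslash or, more readably, with the r prefix. The sample string below is made up:

import re

# In a plain string the regex \d needs an escaped backslash;
# the raw-string form spells the same pattern directly.
assert '\\d' == r'\d'

# Both spellings compile to the identical pattern, so the commit
# changes no behavior; it only silences the Python 3.6+
# DeprecationWarning about invalid escape sequences like '\d'.
assert re.compile('\\d').pattern == re.compile(r'\d').pattern
print(re.findall(r'\d+', 'committed 28 Oct 2017'))  # ['28', '2017']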
frank-lsf authored and horpto committed Oct 28, 2017
1 parent 860483a commit d83043d
Showing 5 changed files with 32 additions and 32 deletions.
42 changes: 21 additions & 21 deletions gensim/corpora/wikicorpus.py
@@ -41,23 +41,23 @@
TOKEN_MAX_LEN = 15


-RE_P0 = re.compile('<!--.*?-->', re.DOTALL | re.UNICODE) # comments
-RE_P1 = re.compile('<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE) # footnotes
-RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages
-RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template
-RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template
-RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description
-RE_P6 = re.compile("\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE) # simplify links, keep description
-RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images
-RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files
-RE_P9 = re.compile('<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE) # outside links
-RE_P10 = re.compile('<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE) # math content
-RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags
-RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
-RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
-RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories
+RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE) # comments
+RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE) # footnotes
+RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) # links to languages
+RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) # template
+RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) # template
+RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description
+RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) # simplify links, keep description
+RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images
+RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files
+RE_P9 = re.compile(r'<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE) # outside links
+RE_P10 = re.compile(r'<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE) # math content
+RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) # all other tags
+RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
+RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
+RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories
# Remove File and Image template
-RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)

# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
@@ -81,7 +81,7 @@ def filter_wiki(raw):


def remove_markup(text):
-text = re.sub(RE_P2, "", text) # remove the last list (=languages)
+text = re.sub(RE_P2, '', text) # remove the last list (=languages)
# the wiki markup is recursive (markup inside markup etc)
# instead of writing a recursive grammar, here we deal with that by removing
# markup in a loop, starting with inner-most expressions and working outwards,
@@ -91,11 +91,11 @@ def remove_markup(text):
iters = 0
while True:
old, iters = text, iters + 1
-text = re.sub(RE_P0, "", text) # remove comments
+text = re.sub(RE_P0, '', text) # remove comments
text = re.sub(RE_P1, '', text) # remove footnotes
-text = re.sub(RE_P9, "", text) # remove outside links
-text = re.sub(RE_P10, "", text) # remove math content
-text = re.sub(RE_P11, "", text) # remove all remaining tags
+text = re.sub(RE_P9, '', text) # remove outside links
+text = re.sub(RE_P10, '', text) # remove math content
+text = re.sub(RE_P11, '', text) # remove all remaining tags
text = re.sub(RE_P14, '', text) # remove categories
text = re.sub(RE_P5, '\\3', text) # remove urls, keep description
text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only
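A quick sanity check that the converted patterns behave exactly as before, here with RE_P2 (trailing interlanguage links); the wiki markup sample below is made up for illustration:

import re

RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)

text = 'Some article text.\n[[de:Beispiel]]\n[[fr:Exemple]]'
# The language-link block at the end of the article is stripped.
print(re.sub(RE_P2, '', text))  # -> 'Some article text.'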
2 changes: 1 addition & 1 deletion gensim/examples/dmlcz/sources.py
@@ -28,7 +28,7 @@
if sys.version_info[0] >= 3:
unicode = str

-PAT_TAG = re.compile('<(.*?)>(.*)</.*?>')
+PAT_TAG = re.compile(r'<(.*?)>(.*)</.*?>')
logger = logging.getLogger('gensim.corpora.sources')


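The converted PAT_TAG still captures a tag name and its text content, as a quick check with a made-up XML snippet shows:

import re

PAT_TAG = re.compile(r'<(.*?)>(.*)</.*?>')

# Group 1 is the tag name, group 2 the enclosed text.
print(PAT_TAG.match('<title>Hello</title>').groups())  # ('title', 'Hello')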
2 changes: 1 addition & 1 deletion gensim/parsing/preprocessing.py
@@ -44,7 +44,7 @@ def remove_stopwords(s):
return " ".join(w for w in s.split() if w not in STOPWORDS)


-RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
+RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE)


def strip_punctuation(s):
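strip_punctuation (truncated above) applies this pattern; assuming it substitutes a single space per match, as in gensim's implementation, each run of punctuation collapses into one space. A sketch with a made-up input:

import re
import string

RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE)

# Every run of punctuation characters becomes a single space.
print(RE_PUNCT.sub(' ', 'Hello, world!!'))  # -> 'Hello  world '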
14 changes: 7 additions & 7 deletions gensim/summarization/textcleaner.py
@@ -21,13 +21,13 @@
HAS_PATTERN = False


SEPARATOR = r"@"
RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE)
AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE)
AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE)
UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE)
UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE)
SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)


def split_sentences(text):
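These patterns implement a mask-and-restore trick: abbreviation periods are replaced with the @ separator so they cannot terminate a sentence, the text is split with RE_SENTENCE, and the separator is then undone. A minimal sketch with a made-up sentence (the replacement templates are assumptions, not code from this diff):

import re

SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)

text = 'Mr. Smith arrived. He sat down.'
# Mask the period in 'Mr.' so it cannot end a sentence.
masked = AB_SENIOR.sub(r'\1@\2', text)  # 'Mr.@Smith arrived. He sat down.'
sentences = [m.group() for m in RE_SENTENCE.finditer(masked)]
# Restore the masked abbreviation inside each extracted sentence.
print([UNDO_AB_SENIOR.sub(r'\1 \2', s) for s in sentences])
# -> ['Mr. Smith arrived.', 'He sat down.']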
4 changes: 2 additions & 2 deletions gensim/utils.py
@@ -79,7 +79,7 @@ def smart_open(fname, mode='rb'):
return open(fname, mode)


-PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
+PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
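
PAT_ALPHABETIC matches maximal runs of word characters that contain no digit, so digits act as token boundaries and purely numeric runs are dropped; a quick check with a made-up input:

import re

PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)

# Digits split tokens; '2017' on its own yields no match at all.
print([m.group() for m in PAT_ALPHABETIC.finditer('word2vec in 2017')])
# -> ['word', 'vec', 'in']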


@@ -1039,7 +1039,7 @@ def has_pattern():
return False


-def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
stopwords=frozenset(), min_length=2, max_length=15):
"""
This function is only available when the optional 'pattern' package is installed.
