PIP-1615: Operate on unicode data exclusively [python3] #227

Merged · 1 commit · Feb 7, 2022
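
The diff below drops the remaining bytes/str juggling, so the public quotation-extraction entry points operate on unicode strings end to end. A minimal usage sketch of that behaviour follows; the sample messages are made up, and it assumes talon 1.6.0 with its dependencies installed.

```python
# Minimal usage sketch of the unicode-only behaviour targeted by this PR.
# Assumes talon >= 1.6.0; the sample messages below are hypothetical.
from talon import quotations

plain = "Thanks!\n\nOn Feb 7, 2022, someone wrote:\n> earlier message"
html_msg = "<html><body><p>Thanks!</p><blockquote>earlier message</blockquote></body></html>"

# Both entry points now take str and return str; no bytes round-trip.
reply_text = quotations.extract_from_plain(plain)
reply_html = quotations.extract_from_html(html_msg)

print(type(reply_text), type(reply_html))  # <class 'str'> <class 'str'>
```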
setup.py (2 changes: 1 addition & 1 deletion)
@@ -29,7 +29,7 @@ def finalize_options(self):


setup(name='talon',
version='1.5.0',
version='1.6.0',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),
talon/quotations.py (50 changes: 20 additions & 30 deletions)
@@ -6,18 +6,17 @@
"""

from __future__ import absolute_import
import regex as re

import logging
from copy import deepcopy

from lxml import html, etree

from talon.utils import (get_delimiter, html_tree_to_text,
html_document_fromstring)
from talon import html_quotations
import regex as re
from lxml import etree, html
from six.moves import range
import six

from talon import html_quotations
from talon.utils import (get_delimiter, html_document_fromstring,
html_tree_to_text)

log = logging.getLogger(__name__)

@@ -94,7 +93,7 @@
)

RE_QUOTATION = re.compile(
r'''
r"""
(
# quotation border: splitter line or a number of quotation marker lines
(?:
@@ -112,10 +111,10 @@

# after quotations should be text only or nothing at all
[te]*$
''', re.VERBOSE)
""", re.VERBOSE)

RE_EMPTY_QUOTATION = re.compile(
r'''
r"""
(
# quotation border: splitter line or a number of quotation marker lines
(?:
@@ -125,7 +124,7 @@
)
)
e*
''', re.VERBOSE)
""", re.VERBOSE)

# ------Original Message------ or ---- Reply Message ----
# With variations in other languages.
@@ -343,9 +342,6 @@ def _replace_link_brackets(msg_body):

Converts msg_body into a unicode
"""
if isinstance(msg_body, bytes):
msg_body = msg_body.decode('utf8')

def link_wrapper(link):
newline_index = msg_body[:link.start()].rfind("\n")
if msg_body[newline_index + 1] == ">":
@@ -385,8 +381,6 @@ def postprocess(msg_body):

def extract_from_plain(msg_body):
"""Extracts a non quoted message from provided plain text."""
stripped_text = msg_body

delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter)
# don't process too long messages
@@ -418,17 +412,13 @@ def extract_from_html(msg_body):

Returns a unicode string.
"""
msg_body_bytes = msg_body
if isinstance(msg_body, six.text_type):
msg_body_bytes = msg_body.encode('utf8')

if msg_body_bytes.strip() == b'':
if msg_body.strip() == "":
return msg_body

msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
msg_body = msg_body.replace("\r\n", "\n")
# Cut out xml and doctype tags to avoid conflict with unicode decoding.
msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
html_tree = html_document_fromstring(msg_body_bytes)
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
html_tree = html_document_fromstring(msg_body)
if html_tree is None:
return msg_body

@@ -531,11 +521,11 @@ def extract_from_html_tree(html_tree):
# of replacing data outside the <tag> which might be essential to
# the customer.
remove_namespaces(html_tree_copy)
s = html.tostring(html_tree_copy)
s = html.tostring(html_tree_copy, encoding="ascii")
if not s:
return None

return s.decode('utf-8')
return s.decode("ascii")


def remove_namespaces(root):
@@ -654,23 +644,23 @@ def _readable_text_empty(html_tree):


def is_splitter(line):
'''
"""
Returns Matcher object if provided string is a splitter and
None otherwise.
'''
"""
for pattern in SPLITTER_PATTERNS:
matcher = re.match(pattern, line)
if matcher:
return matcher


def text_content(context):
'''XPath Extension function to return a node text content.'''
"""XPath Extension function to return a node text content."""
return context.context_node.xpath("string()").strip()


def tail(context):
'''XPath Extension function to return a node tail text.'''
"""XPath Extension function to return a node tail text."""
return context.context_node.tail or ''


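One detail worth calling out from the quotations.py diff above: extract_from_html_tree now serializes with html.tostring(..., encoding="ascii") and decodes the result as ASCII. That round-trip is safe because lxml escapes characters outside the target encoding as numeric character references. A small illustrative check, not part of the PR:

```python
# Illustrative check (not part of the PR): serialising with encoding="ascii"
# keeps non-ASCII content by escaping it as character references, so the
# resulting bytes can always be decoded as ASCII afterwards.
from lxml import html

tree = html.fromstring("<p>caf\u00e9</p>")
s = html.tostring(tree, encoding="ascii")
print(s)                  # e.g. b'<p>caf&#233;</p>'
print(s.decode("ascii"))  # plain str, decoding cannot fail
```
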
talon/signature/learning/helpers.py (29 changes: 11 additions & 18 deletions)
@@ -5,21 +5,17 @@
* regexp's constants used when evaluating signature's features

"""

from __future__ import absolute_import
import unicodedata
import regex as re

from talon.utils import to_unicode
import regex as re

from talon.signature.constants import SIGNATURE_MAX_LINES


rc = re.compile

RE_EMAIL = rc('\S@\S')
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""")

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
@@ -55,7 +51,7 @@


def binary_regex_search(prog):
'''Returns a function that returns 1 or 0 depending on regex search result.
"""Returns a function that returns 1 or 0 depending on regex search result.

If regular expression compiled into prog is present in a string
the result of calling the returned function with the string will be 1
@@ -66,12 +62,12 @@ def binary_regex_search(prog):
1
>>> binary_regex_search(re.compile("12"))("34")
0
'''
"""
return lambda s: 1 if prog.search(s) else 0


def binary_regex_match(prog):
'''Returns a function that returns 1 or 0 depending on regex match result.
"""Returns a function that returns 1 or 0 depending on regex match result.

If a string matches regular expression compiled into prog
the result of calling the returned function with the string will be 1
@@ -82,7 +78,7 @@ def binary_regex_match(prog):
1
>>> binary_regex_match(re.compile("12"))("3 12")
0
'''
"""
return lambda s: 1 if prog.match(s) else 0


@@ -135,7 +131,6 @@ def extract_names(sender):
>>> extract_names('')
[]
"""
sender = to_unicode(sender, precise=True)
# Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e.
@@ -154,7 +149,7 @@


def categories_percent(s, categories):
'''Returns category characters percent.
"""Returns category characters percent.

>>> categories_percent("qqq ggg hhh", ["Po"])
0.0
Expand All @@ -166,29 +161,27 @@ def categories_percent(s, categories):
50.0
>>> categories_percent("s.s,5s", ["Po", "Nd"])
50.0
'''
"""
count = 0
s = to_unicode(s, precise=True)
for c in s:
if unicodedata.category(c) in categories:
count += 1
return 100 * float(count) / len(s) if len(s) else 0


def punctuation_percent(s):
'''Returns punctuation percent.
"""Returns punctuation percent.

>>> punctuation_percent("qqq ggg hhh")
0.0
>>> punctuation_percent("q,w.")
50.0
'''
"""
return categories_percent(s, ['Po'])


def capitalized_words_percent(s):
'''Returns capitalized words percent.'''
s = to_unicode(s, precise=True)
"""Returns capitalized words percent."""
words = re.split('\s', s)
words = [w for w in words if w.strip()]
words = [w for w in words if len(w) > 2]
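With the to_unicode() calls removed, the signature-learning helpers now assume str input. A quick sanity sketch using the expected values from the doctests shown above (assumes talon and its signature-learning dependencies are installed):

```python
# Quick sanity sketch: after dropping to_unicode(), these helpers expect
# str input directly. Expected values are taken from the doctests above.
from talon.signature.learning import helpers

print(helpers.categories_percent("qqq ggg hhh", ["Po"]))  # 0.0
print(helpers.punctuation_percent("q,w."))                # 50.0
print(helpers.extract_names(""))                          # []
```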