Email validation/parsing #91

mklaber · 2019-11-15T16:48:49Z

Cleaning signup sheets and other human-entered email addresses.

Here's the code I use that may be helpful to y'all. (It's a bit messy.) It uses email-validator and pydash (because I'm sad I don't get to write in node.js).

from email_validator import validate_email, EmailNotValidError, EmailSyntaxError, EmailUndeliverableError
import re
from pydash import predicates
from pydash.strings import trim, reg_exp_replace, clean, deburr
from typing import List
from pydash.collections import every, filter_
from pydash.arrays import flatten_deep


def empty_if_null(value: str) -> str:
    """Return ``value`` unchanged when it is truthy, otherwise an empty string.

    ``None`` (and any other falsy value) is mapped to ``""``.
    """
    return value or ""

def trim_non_printing(value: str) -> str:
    """Strip whitespace and a few stray filler characters from both ends.

    In addition to ordinary whitespace, leading and trailing runs of
    U+202A (left-to-right embedding), U+25A0 (black square) and U+00A0
    (no-break space) are removed. Characters in the middle of the string
    are left untouched.

    Args:
        value: The text to trim.

    Returns:
        The trimmed text.
    """
    value = trim(value)
    # Raw strings on purpose: the regex engine understands \uXXXX and \s on
    # its own, whereas a non-raw '\s' is an invalid *string* escape that
    # recent Python versions warn about.
    value = reg_exp_replace(value, r'[\u202a\u25a0\u00a0\s]+$', '')
    value = reg_exp_replace(value, r'^[\u202a\u25a0\u00a0\s]+', '')
    return value

def clean_email_string(value: str) -> str:
    """Normalise a raw, human-entered address string.

    The text is de-accented (pydash ``deburr``), trimmed at both ends,
    lowercased, and finally stripped of every whitespace character,
    including whitespace in the middle of the address.

    Args:
        value: The raw text; anything that is not a string is tolerated.

    Returns:
        The normalised text, or ``""`` when ``value`` is not a string.
    """
    if not predicates.is_string(value):
        return ""
    # fold accented letters and trim the ends (incl. stray filler characters)
    value = trim_non_printing(clean(deburr(value)))
    # lowercase everything, so later case-sensitive fixes (e.g. a ",com"
    # suffix) also catch upper-case input
    value = value.lower()
    # strip spaces in the middle of the address
    value = reg_exp_replace(value, r'\s+', '')
    return value

# Matches the "display name" form of an address, e.g. `Jane Doe <jane@example.com>`:
# at least one character, then `<`, then the address (captured in the named
# group `email`: something without `@`, an `@`, something without `>`), then `>`.
# A bare `<jane@example.com>` with nothing before the `<` does not match.
email_display_name_re = re.compile(r".+\<(?P<email>[^@]+@[^\>]+)\>")

def fix_common_email_problems(value: str) -> str:
    """Repair the usual typos found in hand-typed email addresses.

    Steps, in order:

    1. If the text has the form ``Name <address>``, keep only the address.
    2. Normalise it with ``clean_email_string``.
    3. Trim stray punctuation (``, . : " > < '``) and whitespace from the ends.
    4. Turn a trailing ``,com`` into ``.com``.

    Args:
        value: The raw text; anything that is not a string is tolerated.

    Returns:
        The repaired text (not yet validated), or ``""`` when ``value`` is
        not a string.
    """
    # Guard first: the regex below raises TypeError on None / non-strings,
    # while clean_email_string would have mapped them to "" anyway.
    if not predicates.is_string(value):
        return ""
    # One match is enough; the pattern starts with `.+`, so a successful
    # match at position 0 is also what a search would have found.
    components = email_display_name_re.match(value)
    if components:
        value = components.group('email')
    value = clean_email_string(value)
    # trim off the start or end: ,  .  :  "  >  <  '
    # then trim whitespace again
    value = trim(trim(value, ',.:"><\''))
    # fix common suffix issues (could do a better job with this though...)
    value = reg_exp_replace(value, r',com$', '.com')
    return value

def clean_emails(email: str) -> List[str]:
    """Extract every valid, lowercased address from one hand-entered value.

    The value is validated as-is first. If that fails and it does not
    contain exactly one ``@``, it is split on ``;``, ``/``, ``,`` or ``|``
    (the first delimiter that leaves exactly one ``@`` in every piece) and
    each piece is cleaned on its own. Otherwise one repair pass with
    ``fix_common_email_problems`` is tried before giving up.

    Args:
        email: The raw cell/field content; non-strings are tolerated.

    Returns:
        A flat list of validated addresses in lowercase; empty when nothing
        usable was found (or when ``email`` is not a string).
    """
    def _clean_emails(email: str, already_fixed: bool) -> List[str]:
        try:
            return [validate_email(email)['email'].lower()]
        except EmailNotValidError as e:
            msg = str(e)

        # Count the @-signs ourselves instead of matching the validator's
        # error text, whose wording is not something to rely on.
        if email.count('@') != 1:
            print(f'try splitting or {email} with {email.count("@")} @ signs')
            for delim in [';', '/', ',', '|']:
                # if email is split by this delimiter, do we end up with one @ in each set?
                # if so, split on that delimiter and treat each as their own address in need
                # of cleaning.
                parts = email.split(delim)
                if all(part.count('@') == 1 for part in parts):
                    print(f'the delimiter is {delim}')
                    cleaned = []
                    for part in parts:
                        cleaned.extend(_clean_emails(part, False))
                    return cleaned
            print("Can't figure out what delimiter it is so lets just try cleaning in otherways")
        if not already_fixed:
            return _clean_emails(fix_common_email_problems(email), True)
        print(f'Giving up, {email} is probably just a really bad address due to {msg}')
        return []

    # Empty spreadsheet cells arrive as None; validate_email is only expected
    # to raise EmailNotValidError for strings, so bail out early here.
    if not isinstance(email, str):
        return []
    return _clean_emails(email, False)

shaunagm · 2023-02-21T19:31:20Z

Seems related to #554

eliotst added the good first issue These issues are great ones to start working on for Parsons newcomers! label Dec 4, 2019

eliotst added low priority Priority - this is a nice-to-have, is non-urgent, and/or has a minor overall impact on Parsons and removed good first issue These issues are great ones to start working on for Parsons newcomers! labels Jun 18, 2020

neverett mentioned this issue Nov 3, 2021

Validation epic #591

Closed

3 tasks

shaunagm added the low priority Priority - this is a nice-to-have, is non-urgent, and/or has a minor overall impact on Parsons label Feb 21, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Email validation/parsing #91

Email validation/parsing #91

mklaber commented Nov 15, 2019

shaunagm commented Feb 21, 2023

Email validation/parsing #91

Email validation/parsing #91

Comments

mklaber commented Nov 15, 2019

shaunagm commented Feb 21, 2023