Skip to content

Commit

Permalink
drop support for Python 3.5 (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Nov 8, 2021
1 parent fc945f0 commit c8ce6d8
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 66 deletions.
15 changes: 0 additions & 15 deletions courlan/compatibility.py

This file was deleted.

6 changes: 1 addition & 5 deletions courlan/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@


from .clean import normalize_url, scrub_url
from .compatibility import TLD_EXTRACTION
from .filters import basic_filter, extension_filter, lang_filter, \
path_filter, spam_filter, type_filter, validate_url
from .network import redirection_test
Expand Down Expand Up @@ -176,10 +175,7 @@ def extract_links(pagecontent, base_url, external_bool, language=None,
return validlinks
# define host reference
if reference is None:
if TLD_EXTRACTION is not None:
reference = TLD_EXTRACTION(base_url)
else:
reference = base_url
reference = base_url
# extract links
for link in FIND_LINKS_REGEX.findall(pagecontent):
# https://en.wikipedia.org/wiki/Hreflang
Expand Down
8 changes: 2 additions & 6 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@

from urllib.parse import urlparse

from langcodes import Language
try:
from langcodes import tag_is_valid
except ImportError: # Python 3.5
tag_is_valid = None
from langcodes import Language, tag_is_valid

from .langinfo import COUNTRY_CODES, LANGUAGE_CODES

Expand Down Expand Up @@ -78,7 +74,7 @@ def langcodes_score(language, segment, score):
if segment[:2] not in COUNTRY_CODES and segment[:2] not in LANGUAGE_CODES:
return score
# test if tag is valid (caution: private codes are)
if tag_is_valid is None or tag_is_valid(segment):
if tag_is_valid(segment):
# try to identify language code
identified = Language.get(segment).language
# see if it matches
Expand Down
41 changes: 10 additions & 31 deletions courlan/urlutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,21 @@

from urllib.parse import urlparse


from .compatibility import TLD_EXTRACTION, get_fld, get_tld, tldextract
from tld import get_fld, get_tld


def extract_domain(url, blacklist=None):
'''Extract domain name information using top-level domain info'''
if blacklist is None:
blacklist = set()
# legacy tldextract code
if TLD_EXTRACTION is not None:
tldinfo = TLD_EXTRACTION(url)
# domain TLD blacklist
if tldinfo.domain in blacklist:
return None
# return domain
returnval = '.'.join(part for part in tldinfo if part)
# new code
else:
tldinfo = get_tld(url, as_object=True, fail_silently=True)
# invalid input OR domain TLD blacklist
if tldinfo is None or tldinfo.domain in blacklist:
return None
# return domain
returnval = tldinfo.fld
# new code: Python >= 3.6 with tld module
tldinfo = get_tld(url, as_object=True, fail_silently=True)
# invalid input OR domain TLD blacklist
if tldinfo is None or tldinfo.domain in blacklist:
return None
# return domain
# this step seems necessary to standardize output
return re.sub(r'^www[0-9]*\.', '', returnval)
return re.sub(r'^www[0-9]*\.', '', tldinfo.fld)


def get_base_url(url):
Expand Down Expand Up @@ -81,18 +70,8 @@ def fix_relative_urls(baseurl, url):
def is_external(url, reference, ignore_suffix=True):
'''Determine if a link leads to another host, takes a reference URL or
tld/tldextract object as input, returns a boolean'''
# legacy tldextract code
if TLD_EXTRACTION is not None:
# reference
if not isinstance(reference, tldextract.tldextract.ExtractResult):
reference = TLD_EXTRACTION(reference)
tldinfo = TLD_EXTRACTION(url)
if ignore_suffix is True:
ref_domain, domain = reference.domain, tldinfo.domain
else: # '.'.join(ext[-2:]).strip('.')
ref_domain, domain = reference.registered_domain, tldinfo.registered_domain
# new tld code
elif ignore_suffix is True:
# new code: Python >= 3.6 with tld module
if ignore_suffix is True:
try:
ref_domain, domain = get_tld(reference, as_object=True, fail_silently=True).domain, \
get_tld(url, as_object=True, fail_silently=True).domain
Expand Down
14 changes: 5 additions & 9 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_long_description():
setup(
name='courlan',
version=get_version('courlan'),
description='Clean, filter, normalize, and sample URLs',
description='Clean, filter and sample URLs to optimize data collection. Includes spam, content type and language filters.',
long_description=get_long_description(),
classifiers=[
# As from http://pypi.python.org/pypi?%3Aaction=list_classifiers
Expand All @@ -49,7 +49,6 @@ def get_long_description():
'Operating System :: POSIX :: Linux',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
Expand All @@ -66,18 +65,15 @@ def get_long_description():
license='GPLv3+',
packages=['courlan'],
project_urls={
"Source": "https://github.com/adbar/courlan",
"Coverage": "https://codecov.io/github/adbar/courlan",
"Tracker": "https://github.com/adbar/courlan/issues",
"Blog": "https://adrien.barbaresi.eu/blog/", # /tag/courlan.html
},
#package_data={},
include_package_data=True,
python_requires='>=3.5',
python_requires='>=3.6',
install_requires=[
'langcodes==2.2.0; python_version < "3.6"',
'langcodes>=3.2.1; python_version >= "3.6"',
'tldextract; python_version < "3.6"',
'tld; python_version >= "3.6"',
'langcodes>=3.2.1',
'tld',
'urllib3>=1.25,<2',
],
#extras_require=extras,
Expand Down

0 comments on commit c8ce6d8

Please sign in to comment.