
Commit fc945f0

accept part of code refactoring by Sourcery

* 'Refactored by Sourcery'
* review length test in courlan/filters.py
* reject suggestion
* process changes
* refuse one change
* restore code commented out
* merge rejections and comments restored
* fix: return values

Co-authored-by: Sourcery AI <>
Co-authored-by: Adrien Barbaresi <adbar@users.noreply.github.com>
sourcery-ai[bot] and adbar authored Oct 26, 2021
1 parent 6438f20 commit fc945f0
Showing 8 changed files with 75 additions and 76 deletions.

courlan/clean.py: 4 additions & 4 deletions

@@ -55,7 +55,7 @@ def scrub_url(url):
     # link = link.split('"')[0]
     # double/faulty URLs
     protocols = PROTOCOLS.findall(url)
-    if len(protocols) > 1 and not 'web.archive.org' in url:
+    if len(protocols) > 1 and 'web.archive.org' not in url:
         logging.debug('double url: %s %s', len(protocols), url)
         match = SELECTION.match(url)
         if match and validate_url(match.group(1))[0] is True:
@@ -72,9 +72,9 @@ def scrub_url(url):
     match = re.match(r'(.*?)[<>"\'\r\n ]', url)
     if match:
         url = match.group(1)
-        if len(url) > 500:
-            logging.debug('invalid-looking link %s of length %d',
-                          url[:50] + '...', len(url))
+    if len(url) > 500:
+        logging.debug('invalid-looking link %s of length %d',
+                      url[:50] + '...', len(url))
     # trailing ampersand
     url = url.strip('&')
     # trailing slashes in URLs without path or in embedded URLs
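
Aside, a quick sanity check on the membership test above (not part of the commit): "not x in y" and "x not in y" perform the same operation, the second spelling is simply the idiomatic one that PEP 8 and linters recommend.

    url = 'https://web.archive.org/web/https://example.org'
    # both spellings give the same result; 'not in' reads as one operator
    assert (not 'web.archive.org' in url) == ('web.archive.org' not in url)
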

courlan/cli.py: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ def main():
                     with open(args.discardedfile, 'a', encoding='utf-8') as discardfh:
                         discardfh.write(line)
     else:
-        urllist = list()
+        urllist = []
         with open(args.inputfile, 'r', encoding='utf-8', errors='ignore') as inputfh:
             for line in inputfh:
                 urllist.append(line.strip())

courlan/core.py: 13 additions & 16 deletions

@@ -74,10 +74,12 @@ def check_url(url, strict=False, with_redirects=False, language=None, with_nav=F
             raise ValueError

         # internationalization and language heuristics in URL
-        if language is not None:
-            if lang_filter(url, language, strict) is False:
-                LOGGER.debug('rejected, lang filter: %s', url)
-                raise ValueError
+        if (
+            language is not None
+            and lang_filter(url, language, strict) is False
+        ):
+            LOGGER.debug('rejected, lang filter: %s', url)
+            raise ValueError

         # split and validate
         validation_test, parsed_url = validate_url(url)
Expand Down Expand Up @@ -129,28 +131,23 @@ def sample_urls(urllist, samplesize, exclude_min=None, exclude_max=None, strict=
continue
url, domain = checked[0], checked[1]
# continue collection
if domain == lastseen:
urlbuffer.add(url)
# sample, drop, fresh start
else:
if domain != lastseen:
# threshold for too small websites
if exclude_min is None or len(urlbuffer) >= exclude_min:
# write all the buffer
if len(urlbuffer) <= samplesize:
yield from sorted(urlbuffer)
LOGGER.info('%s\t\turls: %s', lastseen, len(urlbuffer))
# or sample URLs
# print all or sample URLs
elif exclude_max is None or len(urlbuffer) <= exclude_max:
yield from sorted(sample(urlbuffer, samplesize))
LOGGER.info('%s\t\turls: %s\tprop.: %s', lastseen, len(urlbuffer), samplesize/len(urlbuffer))
else:
# threshold for too large websites
if exclude_max is None or len(urlbuffer) <= exclude_max:
yield from sorted(sample(urlbuffer, samplesize))
LOGGER.info('%s\t\turls: %s\tprop.: %s', lastseen, len(urlbuffer), samplesize/len(urlbuffer))
else:
LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
else:
LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
urlbuffer = set()
urlbuffer.add(url)
urlbuffer.add(url)
lastseen = domain


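
Aside, a minimal equivalence sketch for the check_url change above (the filter below is a hypothetical stand-in, not the repository code): short-circuiting "and" preserves the old behaviour, because lang_filter is only called when language is not None.

    def lang_filter(url, language, strict):
        # hypothetical stand-in for courlan's heuristic
        return '/' + language + '/' in url

    def nested(url, language, strict=False):
        if language is not None:
            if lang_filter(url, language, strict) is False:
                return 'rejected'
        return 'kept'

    def merged(url, language, strict=False):
        if (
            language is not None
            and lang_filter(url, language, strict) is False
        ):
            return 'rejected'
        return 'kept'

    # both versions agree whether the language cue is absent, wrong or right
    for lang in (None, 'de', 'en'):
        assert nested('http://example.org/en/page', lang) == merged('http://example.org/en/page', lang)
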

courlan/filters.py: 12 additions & 13 deletions

@@ -55,16 +55,19 @@

 def basic_filter(url):
     '''Filter URLs based on basic formal characteristics'''
-    if not url.startswith('http') or len(url) >= 500 or len(url) < 10:
-        return False
-    return True
+    return bool(url.startswith('http') and 10 <= len(url) < 500)


 def extension_filter(urlpath):
     '''Filter based on file extension'''
     if EXTENSION_REGEX.search(urlpath) and not urlpath.endswith(WHITELISTED_EXTENSIONS):
         return False
     return True
+    # suggestion:
+    #return bool(
+    #    not EXTENSION_REGEX.search(urlpath)
+    #    or urlpath.endswith(WHITELISTED_EXTENSIONS)
+    #)


 def langcodes_score(language, segment, score):
Expand Down Expand Up @@ -118,9 +121,7 @@ def lang_filter(url, language=None, strict=False):
else:
score -= 1
# determine test result
if score < 0:
return False
return True
return score >= 0


def path_filter(urlpath, query):
@@ -139,10 +140,7 @@ def spam_filter(url):
     #for exp in (''):
     #    if exp in url:
     #        return False
-    if ADULT_FILTER.search(url):
-        return False
-    # default
-    return True
+    return not ADULT_FILTER.search(url)


 def type_filter(url, strict=False, with_nav=False):
@@ -158,9 +156,10 @@
         if re.search(r'/oembed\b', url, re.IGNORECASE):
             raise ValueError
         # wordpress structure
-        if WORDPRESS_CONTENT_FILTER.search(url):
-            if with_nav is not True or not is_navigation_page(url):
-                raise ValueError
+        if WORDPRESS_CONTENT_FILTER.search(url) and (
+            with_nav is not True or not is_navigation_page(url)
+        ):
+            raise ValueError
         # hidden in parameters
         if strict is True and PARAM_FILTER.search(url):
             raise ValueError
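
Aside, a sanity check on the basic_filter rewrite above (not part of the commit): the one-liner follows from De Morgan's law, with the two length tests folded into the chained comparison 10 <= len(url) < 500. The same pattern produces "return score >= 0" and "return not ADULT_FILTER.search(url)" in the other hunks.

    def basic_filter_old(url):
        if not url.startswith('http') or len(url) >= 500 or len(url) < 10:
            return False
        return True

    def basic_filter_new(url):
        return bool(url.startswith('http') and 10 <= len(url) < 500)

    # edge cases: wrong scheme, too short, too long, in range
    for candidate in ('ftp://example.org', 'http://a', 'http://' + 'a' * 600, 'https://example.org/page'):
        assert basic_filter_old(candidate) == basic_filter_new(candidate)
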

courlan/urlutils.py: 19 additions & 23 deletions

@@ -60,23 +60,22 @@ def fix_relative_urls(baseurl, url):
     'Prepend protocol and host information to relative links.'
     if url.startswith('//'):
         if baseurl.startswith('https'):
-            urlfix = 'https:' + url
+            return 'https:' + url
         else:
-            urlfix = 'http:' + url
+            return 'http:' + url
     elif url.startswith('/'):
-        urlfix = baseurl + url
-        # imperfect path handling
+        # imperfect path handling
+        return baseurl + url
     elif url.startswith('.'):
-        urlfix = baseurl + '/' + re.sub(r'(.+/)+', '', url)
-        # don't try to correct these URLs
+        # don't try to correct these URLs
+        return baseurl + '/' + re.sub(r'(.+/)+', '', url)
     elif url.startswith('{'):
-        urlfix = url
-        # catchall
+        # catchall
+        return url
     elif not url.startswith('http'):
-        urlfix = baseurl + '/' + url
+        return baseurl + '/' + url
     else:
-        urlfix = url
-    return urlfix
+        return url


 def is_external(url, reference, ignore_suffix=True):
@@ -93,17 +92,14 @@ def is_external(url, reference, ignore_suffix=True):
         else: # '.'.join(ext[-2:]).strip('.')
             ref_domain, domain = reference.registered_domain, tldinfo.registered_domain
     # new tld code
+    elif ignore_suffix is True:
+        try:
+            ref_domain, domain = get_tld(reference, as_object=True, fail_silently=True).domain, \
+                                 get_tld(url, as_object=True, fail_silently=True).domain
+        # invalid input
+        except AttributeError:
+            return True
     else:
-        if ignore_suffix is True:
-            try:
-                ref_domain, domain = get_tld(reference, as_object=True, fail_silently=True).domain, \
-                                     get_tld(url, as_object=True, fail_silently=True).domain
-            # invalid input
-            except AttributeError:
-                return True
-        else:
-            ref_domain, domain = get_fld(reference, fail_silently=True), get_fld(url, fail_silently=True)
+        ref_domain, domain = get_fld(reference, fail_silently=True), get_fld(url, fail_silently=True)
     # comparison
-    if domain != ref_domain:
-        return True
-    return False
+    return domain != ref_domain
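
Aside, a reduced sketch of the early-return pattern applied in fix_relative_urls above (hypothetical helper name, fewer branches than the real function): every branch now exits on its own instead of funnelling through a single urlfix variable, and is_external likewise ends with a direct "return domain != ref_domain" instead of an if/else around the comparison.

    def fix_relative(baseurl, url):
        # early-return style: each branch exits immediately
        if url.startswith('//'):
            return ('https:' if baseurl.startswith('https') else 'http:') + url
        if url.startswith('/'):
            return baseurl + url
        if not url.startswith('http'):
            return baseurl + '/' + url
        return url

    assert fix_relative('https://example.org', '//cdn.example.org/a.js') == 'https://cdn.example.org/a.js'
    assert fix_relative('https://example.org', '/page') == 'https://example.org/page'
    assert fix_relative('https://example.org', 'page') == 'https://example.org/page'
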

helpers/common-crawl-normalize.py: 1 addition & 3 deletions

@@ -57,9 +57,7 @@
         continue

     # print the result of the substitution
-    core = ''
-    for item in reversed(elements):
-        core += item
+    core = ''.join(reversed(elements))
     core = re.sub(r'\.$', '', core)
     line = re.sub(r'^.+?/', '/', line)
     line = 'http://' + lastone + '.' + core + line
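
Aside (hypothetical data, not from the script): ''.join(reversed(elements)) builds exactly the string the removed accumulator loop produced, while avoiding repeated string concatenation.

    elements = ['org', 'example.', 'www.']      # hypothetical host parts
    core = ''
    for item in reversed(elements):             # the removed loop
        core += item
    assert core == ''.join(reversed(elements)) == 'www.example.org'
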

helpers/find-wordpress-urls.py: 19 additions & 13 deletions

@@ -75,20 +75,21 @@ def find_target(url):
         return url_types.group(1).rstrip('/') + '/'

     # lax
-    if args.lax is True:
     # path correction
     # mpath = re.match(r'(/blog/|/weblog/)', url) #uparse.path
     # if mpath:
     #     path = mpath.group(1)
     #else:
     #    path = ''
-        if re.search(r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url):
-            url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
-            if url_lax:
-                if url_lax.group(2) and url_lax.group(3):
-                    return url_lax.group(1) + url_lax.group(2)
-                else:
-                    return url_lax.group(1).rstrip('/') + '/'
+    if args.lax is True and re.search(
+        r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url
+    ):
+        url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
+        if url_lax:
+            if url_lax.group(2) and url_lax.group(3):
+                return url_lax.group(1) + url_lax.group(2)
+            else:
+                return url_lax.group(1).rstrip('/') + '/'

     return None

@@ -103,11 +104,16 @@ def find_target(url):
     url = url.lower().rstrip('\n')

     # filters
-    if re.match('http', url) and len(url) > 11:
-        # akamai/fbcdn, etc.
-        if not re.search(r'\.blogspot\.|\.google\.|\.tumblr\.|\.typepad\.com|\.wp\.com|\.archive\.|akamai|fbcdn|baidu\.com|\.gravatar\.', url):
-            # test if part of the URL is interesting
-            target = find_target(url)
+    if (
+        re.match('http', url)
+        and len(url) > 11
+        and not re.search(
+            r'\.blogspot\.|\.google\.|\.tumblr\.|\.typepad\.com|\.wp\.com|\.archive\.|akamai|fbcdn|baidu\.com|\.gravatar\.',
+            url,
+        )
+    ):
+        # test if part of the URL is interesting
+        target = find_target(url)

     # limit path depth and filter out queries
     if target and not re.search(r'=|\.php', target) and len(re.findall(r'/', target)) <= 4:

tests/unit_tests.py: 6 additions & 3 deletions

@@ -336,12 +336,15 @@ def test_cli():

 def test_sample():
     '''test URL sampling'''
-    assert len(list(sample_urls(['http://test.org/test1', 'http://test.org/test2'], 0))) == 0
+    assert not list(
+        sample_urls(['http://test.org/test1', 'http://test.org/test2'], 0)
+    )

     # assert len(sample_urls(['http://test.org/test1', 'http://test.org/test2'], 1)) == 1
     mylist = ['http://t.o/t1', 'http://test.org/test1', 'http://test.org/test2', 'http://test2.org/test2']
     assert len(list(sample_urls(mylist, 1, verbose=True))) == 1
-    assert len(list(sample_urls(mylist, 1, exclude_min=10, verbose=True))) == 0
-    assert len(list(sample_urls(mylist, 1, exclude_max=1, verbose=True))) == 0
+    assert not list(sample_urls(mylist, 1, exclude_min=10, verbose=True))
+    assert not list(sample_urls(mylist, 1, exclude_max=1, verbose=True))


 def test_examples():
