From fc945f0bdea4680e8573695341483b20a1c6f2ed Mon Sep 17 00:00:00 2001
From: "sourcery-ai[bot]" <58596630+sourcery-ai[bot]@users.noreply.github.com>
Date: Tue, 26 Oct 2021 18:42:32 +0200
Subject: [PATCH] accept part of code refactoring by Sourcery

* 'Refactored by Sourcery'

* review length test in courlan/filters.py

* reject suggestion

* process changes

* refuse one change

* restore code commented out

* merge rejections and comments restored

* fix: return values

Co-authored-by: Sourcery AI <>
Co-authored-by: Adrien Barbaresi
---
 courlan/clean.py                  |  8 +++---
 courlan/cli.py                    |  2 +-
 courlan/core.py                   | 29 ++++++++++-----------
 courlan/filters.py                | 25 +++++++++---------
 courlan/urlutils.py               | 42 ++++++++++++++-----------------
 helpers/common-crawl-normalize.py |  4 +--
 helpers/find-wordpress-urls.py    | 32 +++++++++++++----------
 tests/unit_tests.py               |  9 ++++---
 8 files changed, 75 insertions(+), 76 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index 1411066..dcb55da 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -55,7 +55,7 @@ def scrub_url(url):
     #    link = link.split('"')[0]
     # double/faulty URLs
     protocols = PROTOCOLS.findall(url)
-    if len(protocols) > 1 and not 'web.archive.org' in url:
+    if len(protocols) > 1 and 'web.archive.org' not in url:
         logging.debug('double url: %s %s', len(protocols), url)
         match = SELECTION.match(url)
         if match and validate_url(match.group(1))[0] is True:
@@ -72,9 +72,9 @@ def scrub_url(url):
     match = re.match(r'(.*?)[<>"\'\r\n ]', url)
     if match:
         url = match.group(1)
-    if len(url) > 500:
-        logging.debug('invalid-looking link %s of length %d',
-                      url[:50] + '...', len(url))
+    if len(url) > 500:
+        logging.debug('invalid-looking link %s of length %d',
+                      url[:50] + '...', len(url))
     # trailing ampersand
     url = url.strip('&')
     # trailing slashes in URLs without path or in embedded URLs
diff --git a/courlan/cli.py b/courlan/cli.py
index 062d306..492b037 100644
--- a/courlan/cli.py
+++ b/courlan/cli.py
@@ -69,7 +69,7 @@ def main():
                     with open(args.discardedfile, 'a', encoding='utf-8') as discardfh:
                         discardfh.write(line)
     else:
-        urllist = list()
+        urllist = []
         with open(args.inputfile, 'r', encoding='utf-8', errors='ignore') as inputfh:
             for line in inputfh:
                 urllist.append(line.strip())
diff --git a/courlan/core.py b/courlan/core.py
index 0007453..7ad371d 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -74,10 +74,12 @@ def check_url(url, strict=False, with_redirects=False, language=None, with_nav=F
         raise ValueError

     # internationalization and language heuristics in URL
-    if language is not None:
-        if lang_filter(url, language, strict) is False:
-            LOGGER.debug('rejected, lang filter: %s', url)
-            raise ValueError
+    if (
+        language is not None
+        and lang_filter(url, language, strict) is False
+    ):
+        LOGGER.debug('rejected, lang filter: %s', url)
+        raise ValueError

     # split and validate
     validation_test, parsed_url = validate_url(url)
@@ -129,28 +131,23 @@ def sample_urls(urllist, samplesize, exclude_min=None, exclude_max=None, strict=
             continue
         url, domain = checked[0], checked[1]
         # continue collection
-        if domain == lastseen:
-            urlbuffer.add(url)
-        # sample, drop, fresh start
-        else:
+        if domain != lastseen:
             # threshold for too small websites
             if exclude_min is None or len(urlbuffer) >= exclude_min:
                 # write all the buffer
                 if len(urlbuffer) <= samplesize:
                     yield from sorted(urlbuffer)
                     LOGGER.info('%s\t\turls: %s', lastseen, len(urlbuffer))
-                # or sample URLs
+                # print all or sample URLs
+                elif exclude_max is None or len(urlbuffer) <= exclude_max:
+                    yield from sorted(sample(urlbuffer, samplesize))
+                    LOGGER.info('%s\t\turls: %s\tprop.: %s', lastseen, len(urlbuffer), samplesize/len(urlbuffer))
                 else:
-                    # threshold for too large websites
-                    if exclude_max is None or len(urlbuffer) <= exclude_max:
-                        yield from sorted(sample(urlbuffer, samplesize))
-                        LOGGER.info('%s\t\turls: %s\tprop.: %s', lastseen, len(urlbuffer), samplesize/len(urlbuffer))
-                    else:
-                        LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
+                    LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
             else:
                 LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
             urlbuffer = set()
-            urlbuffer.add(url)
+        urlbuffer.add(url)
         lastseen = domain
diff --git a/courlan/filters.py b/courlan/filters.py
index 3f98820..825ec45 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -55,9 +55,7 @@

 def basic_filter(url):
     '''Filter URLs based on basic formal characteristics'''
-    if not url.startswith('http') or len(url) >= 500 or len(url) < 10:
-        return False
-    return True
+    return bool(url.startswith('http') and 10 <= len(url) < 500)


 def extension_filter(urlpath):
@@ -65,6 +63,11 @@ def extension_filter(urlpath):
     if EXTENSION_REGEX.search(urlpath) and not urlpath.endswith(WHITELISTED_EXTENSIONS):
         return False
     return True
+    # suggestion:
+    #return bool(
+    #    not EXTENSION_REGEX.search(urlpath)
+    #    or urlpath.endswith(WHITELISTED_EXTENSIONS)
+    #)


 def langcodes_score(language, segment, score):
@@ -118,9 +121,7 @@ def lang_filter(url, language=None, strict=False):
         else:
             score -= 1
     # determine test result
-    if score < 0:
-        return False
-    return True
+    return score >= 0


 def path_filter(urlpath, query):
@@ -139,10 +140,7 @@ def spam_filter(url):
     #for exp in (''):
     #    if exp in url:
     #        return False
-    if ADULT_FILTER.search(url):
-        return False
-    # default
-    return True
+    return not ADULT_FILTER.search(url)


 def type_filter(url, strict=False, with_nav=False):
@@ -158,9 +156,10 @@ def type_filter(url, strict=False, with_nav=False):
     if re.search(r'/oembed\b', url, re.IGNORECASE):
         raise ValueError
     # wordpress structure
-    if WORDPRESS_CONTENT_FILTER.search(url):
-        if with_nav is not True or not is_navigation_page(url):
-            raise ValueError
+    if WORDPRESS_CONTENT_FILTER.search(url) and (
+        with_nav is not True or not is_navigation_page(url)
+    ):
+        raise ValueError
     # hidden in parameters
     if strict is True and PARAM_FILTER.search(url):
         raise ValueError
diff --git a/courlan/urlutils.py b/courlan/urlutils.py
index 27c0d48..f050b5e 100644
--- a/courlan/urlutils.py
+++ b/courlan/urlutils.py
@@ -60,23 +60,22 @@ def fix_relative_urls(baseurl, url):
     'Prepend protocol and host information to relative links.'
     if url.startswith('//'):
         if baseurl.startswith('https'):
-            urlfix = 'https:' + url
+            return 'https:' + url
         else:
-            urlfix = 'http:' + url
+            return 'http:' + url
     elif url.startswith('/'):
-        urlfix = baseurl + url
-    # imperfect path handling
+        # imperfect path handling
+        return baseurl + url
     elif url.startswith('.'):
-        urlfix = baseurl + '/' + re.sub(r'(.+/)+', '', url)
-    # don't try to correct these URLs
+        # don't try to correct these URLs
+        return baseurl + '/' + re.sub(r'(.+/)+', '', url)
     elif url.startswith('{'):
-        urlfix = url
-    # catchall
+        # catchall
+        return url
     elif not url.startswith('http'):
-        urlfix = baseurl + '/' + url
+        return baseurl + '/' + url
     else:
-        urlfix = url
-    return urlfix
+        return url


 def is_external(url, reference, ignore_suffix=True):
@@ -93,17 +92,14 @@ def is_external(url, reference, ignore_suffix=True):
         else: # '.'.join(ext[-2:]).strip('.')
             ref_domain, domain = reference.registered_domain, tldinfo.registered_domain
     # new tld code
+    elif ignore_suffix is True:
+        try:
+            ref_domain, domain = get_tld(reference, as_object=True, fail_silently=True).domain, \
+                                 get_tld(url, as_object=True, fail_silently=True).domain
+        # invalid input
+        except AttributeError:
+            return True
     else:
-        if ignore_suffix is True:
-            try:
-                ref_domain, domain = get_tld(reference, as_object=True, fail_silently=True).domain, \
-                                     get_tld(url, as_object=True, fail_silently=True).domain
-            # invalid input
-            except AttributeError:
-                return True
-        else:
-            ref_domain, domain = get_fld(reference, fail_silently=True), get_fld(url, fail_silently=True)
+        ref_domain, domain = get_fld(reference, fail_silently=True), get_fld(url, fail_silently=True)
     # comparison
-    if domain != ref_domain:
-        return True
-    return False
+    return domain != ref_domain
diff --git a/helpers/common-crawl-normalize.py b/helpers/common-crawl-normalize.py
index 7bb7a48..7f6440f 100644
--- a/helpers/common-crawl-normalize.py
+++ b/helpers/common-crawl-normalize.py
@@ -57,9 +57,7 @@
         continue

     # print the result of the substitution
-    core = ''
-    for item in reversed(elements):
-        core += item
+    core = ''.join(reversed(elements))
     core = re.sub(r'\.$', '', core)
     line = re.sub(r'^.+?/', '/', line)
     line = 'http://' + lastone + '.' + core + line
diff --git a/helpers/find-wordpress-urls.py b/helpers/find-wordpress-urls.py
index a8411c0..8e3cdd1 100644
--- a/helpers/find-wordpress-urls.py
+++ b/helpers/find-wordpress-urls.py
@@ -75,20 +75,21 @@ def find_target(url):
         return url_types.group(1).rstrip('/') + '/'

     # lax
-    if args.lax is True:
         # path correction
         # mpath = re.match(r'(/blog/|/weblog/)', url) #uparse.path
         # if mpath:
         #    path = mpath.group(1)
         #else:
         #    path = ''
-        if re.search(r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url):
-            url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
-            if url_lax:
-                if url_lax.group(2) and url_lax.group(3):
-                    return url_lax.group(1) + url_lax.group(2)
-                else:
-                    return url_lax.group(1).rstrip('/') + '/'
+    if args.lax is True and re.search(
+        r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url
+    ):
+        url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
+        if url_lax:
+            if url_lax.group(2) and url_lax.group(3):
+                return url_lax.group(1) + url_lax.group(2)
+            else:
+                return url_lax.group(1).rstrip('/') + '/'

     return None

@@ -103,11 +104,16 @@ def find_target(url):
     url = url.lower().rstrip('\n')

     # filters
-    if re.match('http', url) and len(url) > 11:
-        # akamai/fbcdn, etc.
-        if not re.search(r'\.blogspot\.|\.google\.|\.tumblr\.|\.typepad\.com|\.wp\.com|\.archive\.|akamai|fbcdn|baidu\.com|\.gravatar\.', url):
-            # test if part of the URL is interesting
-            target = find_target(url)
+    if (
+        re.match('http', url)
+        and len(url) > 11
+        and not re.search(
+            r'\.blogspot\.|\.google\.|\.tumblr\.|\.typepad\.com|\.wp\.com|\.archive\.|akamai|fbcdn|baidu\.com|\.gravatar\.',
+            url,
+        )
+    ):
+        # test if part of the URL is interesting
+        target = find_target(url)

     # limit path depth and filter out queries
     if target and not re.search(r'=|\.php', target) and len(re.findall(r'/', target)) <= 4:
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index db7e145..6bee0ee 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -336,12 +336,15 @@ def test_cli():

 def test_sample():
     '''test URL sampling'''
-    assert len(list(sample_urls(['http://test.org/test1', 'http://test.org/test2'], 0))) == 0
+    assert not list(
+        sample_urls(['http://test.org/test1', 'http://test.org/test2'], 0)
+    )
+    # assert len(sample_urls(['http://test.org/test1', 'http://test.org/test2'], 1)) == 1
     mylist = ['http://t.o/t1', 'http://test.org/test1', 'http://test.org/test2', 'http://test2.org/test2']
     assert len(list(sample_urls(mylist, 1, verbose=True))) == 1
-    assert len(list(sample_urls(mylist, 1, exclude_min=10, verbose=True))) == 0
-    assert len(list(sample_urls(mylist, 1, exclude_max=1, verbose=True))) == 0
+    assert not list(sample_urls(mylist, 1, exclude_min=10, verbose=True))
+    assert not list(sample_urls(mylist, 1, exclude_max=1, verbose=True))


 def test_examples():