
Commit fc945f0

accept part of code refactoring by Sourcery

* 'Refactored by Sourcery'
* review length test in courlan/filters.py
* reject suggestion
* process changes
* refuse one change
* restore code commented out
* merge rejections and comments restored
* fix: return values

Co-authored-by: Sourcery AI <>
Co-authored-by: Adrien Barbaresi <adbar@users.noreply.github.com>
sourcery-ai[bot] and adbar authored Oct 26, 2021
1 parent 6438f20 commit fc945f0
Showing 8 changed files with 75 additions and 76 deletions.

courlan/clean.py: 4 additions & 4 deletions

@@ -55,7 +55,7 @@ def scrub_url(url):
     # link = link.split('"')[0]
     # double/faulty URLs
     protocols = PROTOCOLS.findall(url)
-    if len(protocols) > 1 and not 'web.archive.org' in url:
+    if len(protocols) > 1 and 'web.archive.org' not in url:
         logging.debug('double url: %s %s', len(protocols), url)
         match = SELECTION.match(url)
         if match and validate_url(match.group(1))[0] is True:
@@ -72,9 +72,9 @@ def scrub_url(url):
     match = re.match(r'(.*?)[<>"\'\r\n ]', url)
     if match:
         url = match.group(1)
-        if len(url) > 500:
-            logging.debug('invalid-looking link %s of length %d',
-                          url[:50] + '...', len(url))
+    if len(url) > 500:
+        logging.debug('invalid-looking link %s of length %d',
+                      url[:50] + '...', len(url))
     # trailing ampersand
     url = url.strip('&')
     # trailing slashes in URLs without path or in embedded URLs
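
Aside, a quick sanity check on the membership test above (not part of the commit): "not x in y" and "x not in y" perform the same operation, the second spelling is simply the idiomatic one that PEP 8 and linters recommend.

    url = 'https://web.archive.org/web/https://example.org'
    # both spellings give the same result; 'not in' reads as one operator
    assert (not 'web.archive.org' in url) == ('web.archive.org' not in url)
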

courlan/cli.py: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ def main():
                     with open(args.discardedfile, 'a', encoding='utf-8') as discardfh:
                         discardfh.write(line)
     else:
-        urllist = list()
+        urllist = []
         with open(args.inputfile, 'r', encoding='utf-8', errors='ignore') as inputfh:
             for line in inputfh:
                 urllist.append(line.strip())

courlan/core.py: 13 additions & 16 deletions

@@ -74,10 +74,12 @@ def check_url(url, strict=False, with_redirects=False, language=None, with_nav=F
             raise ValueError

         # internationalization and language heuristics in URL
-        if language is not None:
-            if lang_filter(url, language, strict) is False:
-                LOGGER.debug('rejected, lang filter: %s', url)
-                raise ValueError
+        if (
+            language is not None
+            and lang_filter(url, language, strict) is False
+        ):
+            LOGGER.debug('rejected, lang filter: %s', url)
+            raise ValueError

         # split and validate
         validation_test, parsed_url = validate_url(url)
Expand Down Expand Up @@ -129,28 +131,23 @@ def sample_urls(urllist, samplesize, exclude_min=None, exclude_max=None, strict=
continue
url, domain = checked[0], checked[1]
# continue collection
if domain == lastseen:
urlbuffer.add(url)
# sample, drop, fresh start
else:
if domain != lastseen:
# threshold for too small websites
if exclude_min is None or len(urlbuffer) >= exclude_min:
# write all the buffer
if len(urlbuffer) <= samplesize:
yield from sorted(urlbuffer)
LOGGER.info('%s\t\turls: %s', lastseen, len(urlbuffer))
# or sample URLs
# print all or sample URLs
elif exclude_max is None or len(urlbuffer) <= exclude_max:
yield from sorted(sample(urlbuffer, samplesize))
LOGGER.info('%s\t\turls: %s\tprop.: %s', lastseen, len(urlbuffer), samplesize/len(urlbuffer))
else:
# threshold for too large websites
if exclude_max is None or len(urlbuffer) <= exclude_max:
yield from sorted(sample(urlbuffer, samplesize))
LOGGER.info('%s\t\turls: %s\tprop.: %s', lastseen, len(urlbuffer), samplesize/len(urlbuffer))
else:
LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
else:
LOGGER.info('discarded (exclude size): %s\t\turls: %s', lastseen, len(urlbuffer))
urlbuffer = set()
urlbuffer.add(url)
urlbuffer.add(url)
lastseen = domain


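
Aside, a minimal equivalence sketch for the check_url change above (the filter below is a hypothetical stand-in, not the repository code): short-circuiting "and" preserves the old behaviour, because lang_filter is only called when language is not None.

    def lang_filter(url, language, strict):
        # hypothetical stand-in for courlan's heuristic
        return '/' + language + '/' in url

    def nested(url, language, strict=False):
        if language is not None:
            if lang_filter(url, language, strict) is False:
                return 'rejected'
        return 'kept'

    def merged(url, language, strict=False):
        if (
            language is not None
            and lang_filter(url, language, strict) is False
        ):
            return 'rejected'
        return 'kept'

    # both versions agree whether the language cue is absent, wrong or right
    for lang in (None, 'de', 'en'):
        assert nested('http://example.org/en/page', lang) == merged('http://example.org/en/page', lang)
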

courlan/filters.py: 12 additions & 13 deletions

@@ -55,16 +55,19 @@

 def basic_filter(url):
     '''Filter URLs based on basic formal characteristics'''
-    if not url.startswith('http') or len(url) >= 500 or len(url) < 10:
-        return False
-    return True
+    return bool(url.startswith('http') and 10 <= len(url) < 500)


 def extension_filter(urlpath):
     '''Filter based on file extension'''
     if EXTENSION_REGEX.search(urlpath) and not urlpath.endswith(WHITELISTED_EXTENSIONS):
         return False
     return True
+    # suggestion:
+    #return bool(
+    #    not EXTENSION_REGEX.search(urlpath)
+    #    or urlpath.endswith(WHITELISTED_EXTENSIONS)
+    #)


 def langcodes_score(language, segment, score):
Expand Down Expand Up @@ -118,9 +121,7 @@ def lang_filter(url, language=None, strict=False):
else:
score -= 1
# determine test result
if score < 0:
return False
return True
return score >= 0


def path_filter(urlpath, query):
@@ -139,10 +140,7 @@ def spam_filter(url):
     #for exp in (''):
     #    if exp in url:
     #        return False
-    if ADULT_FILTER.search(url):
-        return False
-    # default
-    return True
+    return not ADULT_FILTER.search(url)


 def type_filter(url, strict=False, with_nav=False):
@@ -158,9 +156,10 @@
         if re.search(r'/oembed\b', url, re.IGNORECASE):
             raise ValueError
         # wordpress structure
-        if WORDPRESS_CONTENT_FILTER.search(url):
-            if with_nav is not True or not is_navigation_page(url):
-                raise ValueError
+        if WORDPRESS_CONTENT_FILTER.search(url) and (
+            with_nav is not True or not is_navigation_page(url)
+        ):
+            raise ValueError
         # hidden in parameters
         if strict is True and PARAM_FILTER.search(url):
             raise ValueError
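
Aside, a sanity check on the basic_filter rewrite above (not part of the commit): the one-liner follows from De Morgan's law, with the two length tests folded into the chained comparison 10 <= len(url) < 500. The same pattern produces "return score >= 0" and "return not ADULT_FILTER.search(url)" in the other hunks.

    def basic_filter_old(url):
        if not url.startswith('http') or len(url) >= 500 or len(url) < 10:
            return False
        return True

    def basic_filter_new(url):
        return bool(url.startswith('http') and 10 <= len(url) < 500)

    # edge cases: wrong scheme, too short, too long, in range
    for candidate in ('ftp://example.org', 'http://a', 'http://' + 'a' * 600, 'https://example.org/page'):
        assert basic_filter_old(candidate) == basic_filter_new(candidate)
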

courlan/urlutils.py: 19 additions & 23 deletions

@@ -60,23 +60,22 @@ def fix_relative_urls(baseurl, url):
     'Prepend protocol and host information to relative links.'
     if url.startswith('//'):
         if baseurl.startswith('https'):
-            urlfix = 'https:' + url
+            return 'https:' + url
         else:
-            urlfix = 'http:' + url
+            return 'http:' + url
     elif url.startswith('/'):
-        urlfix = baseurl + url
-        # imperfect path handling
+        # imperfect path handling
+        return baseurl + url
     elif url.startswith('.'):
-        urlfix = baseurl + '/' + re.sub(r'(.+/)+', '', url)
-        # don't try to correct these URLs
+        # don't try to correct these URLs
+        return baseurl + '/' + re.sub(r'(.+/)+', '', url)
     elif url.startswith('{'):
-        urlfix = url
-        # catchall
+        # catchall
+        return url
     elif not url.startswith('http'):
-        urlfix = baseurl + '/' + url
+        return baseurl + '/' + url
     else:
-        urlfix = url
-    return urlfix
+        return url


 def is_external(url, reference, ignore_suffix=True):
@@ -93,17 +92,14 @@ def is_external(url, reference, ignore_suffix=True):
         else: # '.'.join(ext[-2:]).strip('.')
             ref_domain, domain = reference.registered_domain, tldinfo.registered_domain
     # new tld code
+    elif ignore_suffix is True:
+        try:
+            ref_domain, domain = get_tld(reference, as_object=True, fail_silently=True).domain, \
+                                 get_tld(url, as_object=True, fail_silently=True).domain
+        # invalid input
+        except AttributeError:
+            return True
     else:
-        if ignore_suffix is True:
-            try:
-                ref_domain, domain = get_tld(reference, as_object=True, fail_silently=True).domain, \
-                                     get_tld(url, as_object=True, fail_silently=True).domain
-            # invalid input
-            except AttributeError:
-                return True
-        else:
-            ref_domain, domain = get_fld(reference, fail_silently=True), get_fld(url, fail_silently=True)
+        ref_domain, domain = get_fld(reference, fail_silently=True), get_fld(url, fail_silently=True)
     # comparison
-    if domain != ref_domain:
-        return True
-    return False
+    return domain != ref_domain
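
Aside, a reduced sketch of the early-return pattern applied in fix_relative_urls above (hypothetical helper name, fewer branches than the real function): every branch now exits on its own instead of funnelling through a single urlfix variable, and is_external likewise ends with a direct "return domain != ref_domain" instead of an if/else around the comparison.

    def fix_relative(baseurl, url):
        # early-return style: each branch exits immediately
        if url.startswith('//'):
            return ('https:' if baseurl.startswith('https') else 'http:') + url
        if url.startswith('/'):
            return baseurl + url
        if not url.startswith('http'):
            return baseurl + '/' + url
        return url

    assert fix_relative('https://example.org', '//cdn.example.org/a.js') == 'https://cdn.example.org/a.js'
    assert fix_relative('https://example.org', '/page') == 'https://example.org/page'
    assert fix_relative('https://example.org', 'page') == 'https://example.org/page'
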

helpers/common-crawl-normalize.py: 1 addition & 3 deletions

@@ -57,9 +57,7 @@
         continue

     # print the result of the substitution
-    core = ''
-    for item in reversed(elements):
-        core += item
+    core = ''.join(reversed(elements))
     core = re.sub(r'\.$', '', core)
     line = re.sub(r'^.+?/', '/', line)
     line = 'http://' + lastone + '.' + core + line
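
Aside (hypothetical data, not from the script): ''.join(reversed(elements)) builds exactly the string the removed accumulator loop produced, while avoiding repeated string concatenation.

    elements = ['org', 'example.', 'www.']      # hypothetical host parts
    core = ''
    for item in reversed(elements):             # the removed loop
        core += item
    assert core == ''.join(reversed(elements)) == 'www.example.org'
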

helpers/find-wordpress-urls.py: 19 additions & 13 deletions

@@ -75,20 +75,21 @@ def find_target(url):
         return url_types.group(1).rstrip('/') + '/'

     # lax
-    if args.lax is True:
     # path correction
     # mpath = re.match(r'(/blog/|/weblog/)', url) #uparse.path
     # if mpath:
     #     path = mpath.group(1)
     #else:
     #    path = ''
-        if re.search(r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url):
-            url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
-            if url_lax:
-                if url_lax.group(2) and url_lax.group(3):
-                    return url_lax.group(1) + url_lax.group(2)
-                else:
-                    return url_lax.group(1).rstrip('/') + '/'
+    if args.lax is True and re.search(
+        r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url
+    ):
+        url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
+        if url_lax:
+            if url_lax.group(2) and url_lax.group(3):
+                return url_lax.group(1) + url_lax.group(2)
+            else:
+                return url_lax.group(1).rstrip('/') + '/'

     return None

@@ -103,11 +104,16 @@ def find_target(url):
     url = url.lower().rstrip('\n')

     # filters
-    if re.match('http', url) and len(url) > 11:
-        # akamai/fbcdn, etc.
-        if not re.search(r'\.blogspot\.|\.google\.|\.tumblr\.|\.typepad\.com|\.wp\.com|\.archive\.|akamai|fbcdn|baidu\.com|\.gravatar\.', url):
-            # test if part of the URL is interesting
-            target = find_target(url)
+    if (
+        re.match('http', url)
+        and len(url) > 11
+        and not re.search(
+            r'\.blogspot\.|\.google\.|\.tumblr\.|\.typepad\.com|\.wp\.com|\.archive\.|akamai|fbcdn|baidu\.com|\.gravatar\.',
+            url,
+        )
+    ):
+        # test if part of the URL is interesting
+        target = find_target(url)

     # limit path depth and filter out queries
     if target and not re.search(r'=|\.php', target) and len(re.findall(r'/', target)) <= 4:

tests/unit_tests.py: 6 additions & 3 deletions

@@ -336,12 +336,15 @@ def test_cli():

 def test_sample():
     '''test URL sampling'''
-    assert len(list(sample_urls(['http://test.org/test1', 'http://test.org/test2'], 0))) == 0
+    assert not list(
+        sample_urls(['http://test.org/test1', 'http://test.org/test2'], 0)
+    )

     # assert len(sample_urls(['http://test.org/test1', 'http://test.org/test2'], 1)) == 1
     mylist = ['http://t.o/t1', 'http://test.org/test1', 'http://test.org/test2', 'http://test2.org/test2']
     assert len(list(sample_urls(mylist, 1, verbose=True))) == 1
-    assert len(list(sample_urls(mylist, 1, exclude_min=10, verbose=True))) == 0
-    assert len(list(sample_urls(mylist, 1, exclude_max=1, verbose=True))) == 0
+    assert not list(sample_urls(mylist, 1, exclude_min=10, verbose=True))
+    assert not list(sample_urls(mylist, 1, exclude_max=1, verbose=True))


 def test_examples():
