From 2434b825ee9acb7e4568f63c0278b031876e204f Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Tue, 4 Jun 2024 18:22:10 +0200
Subject: [PATCH 1/2] maintenance: simplify code

---
 courlan/clean.py    | 13 +++++--------
 courlan/filters.py  | 25 ++++++++++---------------
 courlan/urlutils.py | 24 ++++++++++++++----------
 3 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index e2c519b..a68c5b5 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -70,12 +70,11 @@ def scrub_url(url: str) -> str:
     url = REMAINING_MARKUP.sub("", url)
 
     # & and &amp;
-    if "&amp;" in url:
-        url = url.replace("&amp;", "&")
-    url = TRAILING_AMP.sub("", url)
+    url = TRAILING_AMP.sub("", url.replace("&amp;", "&"))
 
     # if '"' in link:
     #    link = link.split('"')[0]
+
     # double/faulty URLs
     protocols = PROTOCOLS.findall(url)
     if len(protocols) > 1 and "web.archive.org" not in url:
@@ -182,21 +181,19 @@ def normalize_url(
     parsed_url = _parse(parsed_url)
     # lowercase + remove fragments + normalize punycode
     scheme = parsed_url.scheme.lower()
-    netloc = parsed_url.netloc.lower()
+    netloc = decode_punycode(parsed_url.netloc.lower())
     # port
     try:
-        if parsed_url.port and parsed_url.port in (80, 443):
+        if parsed_url.port in (80, 443):
             netloc = NETLOC_RE.sub("", netloc)
     except ValueError:
         pass  # Port could not be cast to integer value
-    # lowercase + remove fragments + normalize punycode
-    netloc = decode_punycode(netloc)
     # path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
     # leading /../'s in the path are removed
     newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
     # strip unwanted query elements
     newquery = clean_query(parsed_url.query, strict, language) or ""
-    if newquery and newpath == "":
+    if newquery and not newpath:
         newpath = "/"
     elif (
         not trailing_slash
diff --git a/courlan/filters.py b/courlan/filters.py
index ef39536..0e66106 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -209,7 +209,7 @@ def lang_filter(
             occurrences = ALL_PATH_LANGS_NO_TRAILING.findall(url)
         if len(occurrences) == 1:
             score = langcodes_score(language, match[1], score)
-        elif len(occurrences) == 2:
+        elif len(occurrences) <= 2:
             for occurrence in occurrences:
                 score = langcodes_score(language, occurrence, score)
         # don't perform the test if there are too many candidates: > 2
@@ -217,11 +217,7 @@ def lang_filter(
     if strict:
         match = HOST_LANG_FILTER.match(url)
         if match:
-            candidate = match[1].lower()
-            if candidate == language:
-                score += 1
-            else:
-                score -= 1
+            score += 1 if match[1].lower() == language else -1
     # determine test result
     return score >= 0
 
@@ -236,17 +232,16 @@ def path_filter(urlpath: str, query: str) -> bool:
 def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
     """Make sure the target URL is from a suitable type (HTML page with primarily text).
     Strict: Try to filter out other document types, spam, video and adult websites."""
-    try:
+    if (
         # feeds + blogspot
-        if url.endswith(("/feed", "/rss", "_archive.html")):
-            raise ValueError
+        url.endswith(("/feed", "/rss", "_archive.html"))
+        or
         # website structure
-        if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)):
-            raise ValueError
+        (SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)))
+        or
         # type (also hidden in parameters), videos, adult content
-        if strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)):
-            raise ValueError
-    except ValueError:
+        (strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)))
+    ):
         return False
     # default
     return True
@@ -259,7 +254,7 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
     except ValueError:
         return False, None
 
-    if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS:
+    if not parsed_url.scheme or parsed_url.scheme not in PROTOCOLS:
         return False, None
 
     if len(parsed_url.netloc) < 5 or (
diff --git a/courlan/urlutils.py b/courlan/urlutils.py
index 5c69ce3..b8a702a 100644
--- a/courlan/urlutils.py
+++ b/courlan/urlutils.py
@@ -111,12 +111,15 @@ def fix_relative_urls(baseurl: str, url: str) -> str:
     "Prepend protocol and host information to relative links."
     if url.startswith("{"):
         return url
+
     base_netloc = urlsplit(baseurl).netloc
     split_url = urlsplit(url)
+
     if split_url.netloc not in (base_netloc, ""):
         if split_url.scheme:
             return url
         return urlunsplit(split_url._replace(scheme="http"))
+
     return urljoin(baseurl, url)
 
 
@@ -150,20 +153,21 @@ def is_known_link(link: str, known_links: Set[str]) -> bool:
         return True
 
     # check link and variants with trailing slashes
-    test_links = [link.rstrip("/"), link.rstrip("/") + "/"]
-    if any(test_link in known_links for test_link in test_links):
+    slash_test = link.rstrip("/") if link.endswith("/") else link + "/"
+    if slash_test in known_links:
         return True
 
     # check link and variants with modified protocol
     if link.startswith("http"):
-        if link.startswith("https"):
-            testlink = link[:4] + link[5:]
-        else:
-            testlink = "".join([link[:4], "s", link[4:]])
-        if any(
-            test in known_links
-            for test in [testlink, testlink.rstrip("/"), testlink.rstrip("/") + "/"]
-        ):
+        protocol_test = (
+            "http" + link[5:] if link.startswith("https") else "https" + link[4:]
+        )
+        slash_test = (
+            protocol_test.rstrip("/")
+            if protocol_test.endswith("/")
+            else protocol_test + "/"
+        )
+        if protocol_test in known_links or slash_test in known_links:
             return True
 
     return False

From db120b08b01ecbf9a73621c0e6c10b9d1e3c8ab0 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Tue, 4 Jun 2024 18:27:13 +0200
Subject: [PATCH 2/2] fix typo

---
 courlan/filters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/courlan/filters.py b/courlan/filters.py
index 0e66106..51e052c 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -209,7 +209,7 @@ def lang_filter(
             occurrences = ALL_PATH_LANGS_NO_TRAILING.findall(url)
         if len(occurrences) == 1:
             score = langcodes_score(language, match[1], score)
-        elif len(occurrences) <= 2:
+        elif len(occurrences) == 2:
             for occurrence in occurrences:
                 score = langcodes_score(language, occurrence, score)
         # don't perform the test if there are too many candidates: > 2