From 2434b825ee9acb7e4568f63c0278b031876e204f Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 4 Jun 2024 18:22:10 +0200 Subject: [PATCH 1/2] maintenance: simplify code --- courlan/clean.py | 13 +++++-------- courlan/filters.py | 25 ++++++++++--------------- courlan/urlutils.py | 24 ++++++++++++++---------- 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/courlan/clean.py b/courlan/clean.py index e2c519b..a68c5b5 100644 --- a/courlan/clean.py +++ b/courlan/clean.py @@ -70,12 +70,11 @@ def scrub_url(url: str) -> str: url = REMAINING_MARKUP.sub("", url) # & and & - if "&" in url: - url = url.replace("&", "&") - url = TRAILING_AMP.sub("", url) + url = TRAILING_AMP.sub("", url.replace("&", "&")) # if '"' in link: # link = link.split('"')[0] + # double/faulty URLs protocols = PROTOCOLS.findall(url) if len(protocols) > 1 and "web.archive.org" not in url: @@ -182,21 +181,19 @@ def normalize_url( parsed_url = _parse(parsed_url) # lowercase + remove fragments + normalize punycode scheme = parsed_url.scheme.lower() - netloc = parsed_url.netloc.lower() + netloc = decode_punycode(parsed_url.netloc.lower()) # port try: - if parsed_url.port and parsed_url.port in (80, 443): + if parsed_url.port in (80, 443): netloc = NETLOC_RE.sub("", netloc) except ValueError: pass # Port could not be cast to integer value - # lowercase + remove fragments + normalize punycode - netloc = decode_punycode(netloc) # path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py # leading /../'s in the path are removed newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path))) # strip unwanted query elements newquery = clean_query(parsed_url.query, strict, language) or "" - if newquery and newpath == "": + if newquery and not newpath: newpath = "/" elif ( not trailing_slash diff --git a/courlan/filters.py b/courlan/filters.py index ef39536..0e66106 100644 --- a/courlan/filters.py +++ b/courlan/filters.py @@ -209,7 +209,7 @@ def lang_filter( occurrences = ALL_PATH_LANGS_NO_TRAILING.findall(url) if len(occurrences) == 1: score = langcodes_score(language, match[1], score) - elif len(occurrences) == 2: + elif len(occurrences) <= 2: for occurrence in occurrences: score = langcodes_score(language, occurrence, score) # don't perform the test if there are too many candidates: > 2 @@ -217,11 +217,7 @@ def lang_filter( if strict: match = HOST_LANG_FILTER.match(url) if match: - candidate = match[1].lower() - if candidate == language: - score += 1 - else: - score -= 1 + score += 1 if match[1].lower() == language else -1 # determine test result return score >= 0 @@ -236,17 +232,16 @@ def path_filter(urlpath: str, query: str) -> bool: def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool: """Make sure the target URL is from a suitable type (HTML page with primarily text). Strict: Try to filter out other document types, spam, video and adult websites.""" - try: + if ( # feeds + blogspot - if url.endswith(("/feed", "/rss", "_archive.html")): - raise ValueError + url.endswith(("/feed", "/rss", "_archive.html")) + or # website structure - if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)): - raise ValueError + (SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url))) + or # type (also hidden in parameters), videos, adult content - if strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)): - raise ValueError - except ValueError: + (strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url))) + ): return False # default return True @@ -259,7 +254,7 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]: except ValueError: return False, None - if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS: + if not parsed_url.scheme or parsed_url.scheme not in PROTOCOLS: return False, None if len(parsed_url.netloc) < 5 or ( diff --git a/courlan/urlutils.py b/courlan/urlutils.py index 5c69ce3..b8a702a 100644 --- a/courlan/urlutils.py +++ b/courlan/urlutils.py @@ -111,12 +111,15 @@ def fix_relative_urls(baseurl: str, url: str) -> str: "Prepend protocol and host information to relative links." if url.startswith("{"): return url + base_netloc = urlsplit(baseurl).netloc split_url = urlsplit(url) + if split_url.netloc not in (base_netloc, ""): if split_url.scheme: return url return urlunsplit(split_url._replace(scheme="http")) + return urljoin(baseurl, url) @@ -150,20 +153,21 @@ def is_known_link(link: str, known_links: Set[str]) -> bool: return True # check link and variants with trailing slashes - test_links = [link.rstrip("/"), link.rstrip("/") + "/"] - if any(test_link in known_links for test_link in test_links): + slash_test = link.rstrip("/") if link[-1] == "/" else link + "/" + if slash_test in known_links: return True # check link and variants with modified protocol if link.startswith("http"): - if link.startswith("https"): - testlink = link[:4] + link[5:] - else: - testlink = "".join([link[:4], "s", link[4:]]) - if any( - test in known_links - for test in [testlink, testlink.rstrip("/"), testlink.rstrip("/") + "/"] - ): + protocol_test = ( + "http" + link[:5] if link.startswith("https") else "https" + link[4:] + ) + slash_test = ( + protocol_test.rstrip("/") + if protocol_test[-1] == "/" + else protocol_test + "/" + ) + if protocol_test in known_links or slash_test in known_links: return True return False From db120b08b01ecbf9a73621c0e6c10b9d1e3c8ab0 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 4 Jun 2024 18:27:13 +0200 Subject: [PATCH 2/2] fix typo --- courlan/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/courlan/filters.py b/courlan/filters.py index 0e66106..51e052c 100644 --- a/courlan/filters.py +++ b/courlan/filters.py @@ -209,7 +209,7 @@ def lang_filter( occurrences = ALL_PATH_LANGS_NO_TRAILING.findall(url) if len(occurrences) == 1: score = langcodes_score(language, match[1], score) - elif len(occurrences) <= 2: + elif len(occurrences) == 2: for occurrence in occurrences: score = langcodes_score(language, occurrence, score) # don't perform the test if there are too many candidates: > 2