Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

maintenance: simplify code #103

Merged
merged 2 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 5 additions & 8 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,11 @@ def scrub_url(url: str) -> str:
url = REMAINING_MARKUP.sub("", url)

# &amp; and &
if "&amp;" in url:
url = url.replace("&amp;", "&")
url = TRAILING_AMP.sub("", url)
url = TRAILING_AMP.sub("", url.replace("&amp;", "&"))

# if '"' in link:
# link = link.split('"')[0]

# double/faulty URLs
protocols = PROTOCOLS.findall(url)
if len(protocols) > 1 and "web.archive.org" not in url:
Expand Down Expand Up @@ -182,21 +181,19 @@ def normalize_url(
parsed_url = _parse(parsed_url)
# lowercase + remove fragments + normalize punycode
scheme = parsed_url.scheme.lower()
netloc = parsed_url.netloc.lower()
netloc = decode_punycode(parsed_url.netloc.lower())
# port
try:
if parsed_url.port and parsed_url.port in (80, 443):
if parsed_url.port in (80, 443):
netloc = NETLOC_RE.sub("", netloc)
except ValueError:
pass # Port could not be cast to integer value
# lowercase + remove fragments + normalize punycode
netloc = decode_punycode(netloc)
# path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
# leading /../'s in the path are removed
newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
# strip unwanted query elements
newquery = clean_query(parsed_url.query, strict, language) or ""
if newquery and newpath == "":
if newquery and not newpath:
newpath = "/"
elif (
not trailing_slash
Expand Down
23 changes: 9 additions & 14 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,11 +217,7 @@ def lang_filter(
if strict:
match = HOST_LANG_FILTER.match(url)
if match:
candidate = match[1].lower()
if candidate == language:
score += 1
else:
score -= 1
score += 1 if match[1].lower() == language else -1
# determine test result
return score >= 0

Expand All @@ -236,17 +232,16 @@ def path_filter(urlpath: str, query: str) -> bool:
def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
"""Make sure the target URL is from a suitable type (HTML page with primarily text).
Strict: Try to filter out other document types, spam, video and adult websites."""
try:
if (
# feeds + blogspot
if url.endswith(("/feed", "/rss", "_archive.html")):
raise ValueError
url.endswith(("/feed", "/rss", "_archive.html"))
or
# website structure
if SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)):
raise ValueError
(SITE_STRUCTURE.search(url) and (not with_nav or not is_navigation_page(url)))
or
# type (also hidden in parameters), videos, adult content
if strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)):
raise ValueError
except ValueError:
(strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)))
):
return False
# default
return True
Expand All @@ -259,7 +254,7 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
except ValueError:
return False, None

if not bool(parsed_url.scheme) or parsed_url.scheme not in PROTOCOLS:
if not parsed_url.scheme or parsed_url.scheme not in PROTOCOLS:
return False, None

if len(parsed_url.netloc) < 5 or (
Expand Down
24 changes: 14 additions & 10 deletions courlan/urlutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,15 @@ def fix_relative_urls(baseurl: str, url: str) -> str:
"Prepend protocol and host information to relative links."
if url.startswith("{"):
return url

base_netloc = urlsplit(baseurl).netloc
split_url = urlsplit(url)

if split_url.netloc not in (base_netloc, ""):
if split_url.scheme:
return url
return urlunsplit(split_url._replace(scheme="http"))

return urljoin(baseurl, url)


Expand Down Expand Up @@ -150,20 +153,21 @@ def is_known_link(link: str, known_links: Set[str]) -> bool:
return True

# check link and variants with trailing slashes
test_links = [link.rstrip("/"), link.rstrip("/") + "/"]
if any(test_link in known_links for test_link in test_links):
slash_test = link.rstrip("/") if link[-1] == "/" else link + "/"
if slash_test in known_links:
return True

# check link and variants with modified protocol
if link.startswith("http"):
if link.startswith("https"):
testlink = link[:4] + link[5:]
else:
testlink = "".join([link[:4], "s", link[4:]])
if any(
test in known_links
for test in [testlink, testlink.rstrip("/"), testlink.rstrip("/") + "/"]
):
protocol_test = (
"http" + link[:5] if link.startswith("https") else "https" + link[4:]
)
slash_test = (
protocol_test.rstrip("/")
if protocol_test[-1] == "/"
else protocol_test + "/"
)
if protocol_test in known_links or slash_test in known_links:
return True

return False
Loading