Supply full URL to extract_links() and fix handling of relative URLs (#46)

* rewrite fix_relative_urls()

It can now accept a full URL as the base to compare against.

* allow passing full URLs to all functions calling fix_relative_urls

Base URLs will still work as well, when that is desired.

* apply formatting by black

* review code and use urlsplit/unsplit

* add some more tests

---------

Co-authored-by: Adrien Barbaresi <barbaresi@bbaw.de>
feltcat and adbar authored Jun 23, 2023
1 parent 8e97548 commit 78d50c0
Showing 4 changed files with 71 additions and 23 deletions.
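
As a quick illustration of the headline change, here is a minimal usage sketch; the HTML snippet and page URL are invented for illustration, and courlan with this commit applied is assumed:

from courlan.core import extract_links

# hypothetical page content and page URL, not taken from the commit
html = '<html><body><a href="subpage.html">more</a></body></html>'
links = extract_links(
    pagecontent=html,
    full_url="https://www.example.org/dir/page.html",  # full URL, not just the host
    external_bool=False,  # internal links only
)
# relative links should now be resolved against the full page URL,
# e.g. https://www.example.org/dir/subpage.html
print(links)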
19 changes: 13 additions & 6 deletions courlan/core.py
@@ -25,7 +25,13 @@
)
from .network import redirection_test
from .settings import BLACKLIST
from .urlutils import extract_domain, fix_relative_urls, is_external, is_known_link
from .urlutils import (
extract_domain,
get_base_url,
fix_relative_urls,
is_external,
is_known_link,
)


LOGGER = logging.getLogger(__name__)
@@ -120,7 +126,7 @@ def check_url(

def extract_links(
pagecontent: str,
base_url: str,
full_url: str,
external_bool: bool,
no_filter: bool = False,
language: Optional[str] = None,
@@ -132,7 +138,7 @@ def extract_links(
"""Filter links in a HTML document using a series of heuristics
Args:
pagecontent: whole page in binary format
base_url: beginning of the URL, without path, fragment and query
full_url: full URL of the page
external_bool: set to True for external links only, False for
internal links only
no_filter: override settings and bypass checks to return all possible URLs
@@ -148,6 +154,7 @@
Raises:
Nothing.
"""
base_url = get_base_url(full_url)
candidates, validlinks = set(), set() # type: Set[str], Set[str]
if not pagecontent:
return validlinks
@@ -175,7 +182,7 @@
for link in candidates:
# repair using base
if not link.startswith("http"):
link = fix_relative_urls(base_url, link)
link = fix_relative_urls(full_url, link)
# check
if no_filter is False:
checked = check_url(
Expand Down Expand Up @@ -203,7 +210,7 @@ def extract_links(

def filter_links(
htmlstring: str,
base_url: str,
full_url: str,
lang: Optional[str] = None,
rules: Optional[RobotFileParser] = None,
external: bool = False,
@@ -214,7 +221,7 @@
links, links_priority = [], []
for link in extract_links(
pagecontent=htmlstring,
base_url=base_url,
full_url=full_url,
external_bool=external,
language=lang,
strict=strict,
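
For context on the new get_base_url() call added above: the base URL keeps only the beginning of the URL, while relative links are now repaired against the page's full URL. A rough illustration, with the output shown as expected rather than verified here:

from courlan.urlutils import get_base_url

full_url = "https://www.example.org/dir/subdir/page.html?q=1"
# the base URL drops path, query and fragment (expected: https://www.example.org),
# whereas relative links are fixed against the full page URL
print(get_base_url(full_url))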
7 changes: 4 additions & 3 deletions courlan/urlstore.py
@@ -30,7 +30,7 @@
from .core import filter_links
from .filters import lang_filter, validate_url
from .meta import clear_caches
from .urlutils import get_host_and_path, is_known_link
from .urlutils import get_base_url, get_host_and_path, is_known_link


LOGGER = logging.getLogger(__name__)
@@ -235,17 +235,18 @@ def add_urls(
def add_from_html(
self,
htmlstring: str,
base_url: str,
full_url: str,
external: bool = False,
lang: Optional[str] = None,
with_nav: bool = True,
) -> None:
"Find links in a HTML document, filter them and add them to the data store."
# lang = lang or self.language
base_url = get_base_url(full_url)
rules = self.get_rules(base_url)
links, links_priority = filter_links(
htmlstring=htmlstring,
base_url=base_url,
full_url=full_url,
external=external,
lang=lang or self.language,
rules=rules,
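
A minimal sketch of the corresponding UrlStore usage after this change; the HTML string and page URL are illustrative only:

from courlan.urlstore import UrlStore

store = UrlStore()
html = '<html><body><a href="./about.html">about</a></body></html>'
# the page's full URL is passed in; stored robots.txt rules, if any,
# are still looked up by host, while relative links use the full URL
store.add_from_html(htmlstring=html, full_url="https://www.example.org/blog/post.html")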
24 changes: 10 additions & 14 deletions courlan/urlutils.py
@@ -6,7 +6,7 @@

from functools import lru_cache
from typing import Any, List, Optional, Set, Tuple, Union
from urllib.parse import urlparse, urlunsplit, ParseResult
from urllib.parse import urljoin, urlparse, urlsplit, urlunsplit, ParseResult

from tld import get_tld

@@ -114,19 +114,15 @@ def get_hostinfo(url: str) -> Tuple[Optional[str], str]:

def fix_relative_urls(baseurl: str, url: str) -> str:
"Prepend protocol and host information to relative links."
if url.startswith("//"):
return "https:" + url if baseurl.startswith("https") else "http:" + url
if url.startswith("/"):
# imperfect path handling
return baseurl + url
if url.startswith("."):
# don't try to correct these URLs
return baseurl + "/" + INNER_SLASH_REGEX.sub("", url)
if not url.startswith(("http", "{")):
return baseurl + "/" + url
# todo: handle here
# if url.startswith('{'):
return url
if url.startswith("{"):
return url
base_netloc = urlsplit(baseurl).netloc
split_url = urlsplit(url)
if split_url.netloc not in (base_netloc, ""):
if split_url.scheme:
return url
return urlunsplit(split_url._replace(scheme="http"))
return urljoin(baseurl, url)


def filter_urls(link_list: List[str], urlfilter: Optional[str]) -> List[str]:
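
Because the rewrite delegates path resolution to urljoin(), relative links now follow standard trailing-slash semantics, which the new tests below exercise. A standalone illustration:

from urllib.parse import urljoin

# "subdir" without a trailing slash is treated as a file, so the link
# resolves to a sibling path; with the slash it is treated as a directory
print(urljoin("https://www.example.org/dir/subdir", "relative"))
# https://www.example.org/dir/relative
print(urljoin("https://www.example.org/dir/subdir/", "relative"))
# https://www.example.org/dir/subdir/relative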
44 changes: 44 additions & 0 deletions tests/unit_tests.py
@@ -99,6 +99,50 @@ def test_fix_relative():
fix_relative_urls("https://example.org", "../../test.html")
== "https://example.org/test.html"
)
# sub-directories
assert (
fix_relative_urls("https://www.example.org/dir/subdir/file.html", "/absolute")
== "https://www.example.org/absolute"
)
assert (
fix_relative_urls("https://www.example.org/dir/subdir/file.html", "relative")
== "https://www.example.org/dir/subdir/relative"
)
assert (
fix_relative_urls("https://www.example.org/dir/subdir/", "relative")
== "https://www.example.org/dir/subdir/relative"
)
assert (
fix_relative_urls("https://www.example.org/dir/subdir", "relative")
== "https://www.example.org/dir/relative"
)
# non-relative URLs
assert (
fix_relative_urls("https://example.org", "https://www.eff.org")
== "https://www.eff.org"
)
assert (
fix_relative_urls("https://example.org", "//www.eff.org")
== "http://www.eff.org"
)
# looks like an absolute URL but is actually a valid relative URL
assert (
fix_relative_urls("https://example.org", "www.eff.org")
== "https://example.org/www.eff.org"
)
# misc
assert (
fix_relative_urls("https://www.example.org/dir/subdir/file.html", "./this:that")
== "https://www.example.org/dir/subdir/this:that"
)
assert (
fix_relative_urls("https://www.example.org/test.html?q=test#frag", "foo.html?q=bar#baz")
== "https://www.example.org/foo.html?q=bar#baz"
)
assert (
fix_relative_urls("https://www.example.org", "{privacy}")
== "{privacy}"
)


def test_scrub():
