Supply full URL to extract_links() and fix handling of relative URLs (#46)

* rewrite fix_relative_urls()

It can now accept a full URL as the base to compare against.

* allow passing full URLs to all functions calling fix_relative_urls

Base URLs will still work as well, when that is desired.

* apply formatting by black

* review code and use urlsplit/unsplit

* add some more tests

---------

Co-authored-by: Adrien Barbaresi <barbaresi@bbaw.de>
feltcat and adbar authored Jun 23, 2023
1 parent 8e97548 commit 78d50c0
Showing 4 changed files with 71 additions and 23 deletions.
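
As a quick illustration of the headline change, here is a minimal usage sketch; the HTML snippet and page URL are invented for illustration, and courlan with this commit applied is assumed:

from courlan.core import extract_links

# hypothetical page content and page URL, not taken from the commit
html = '<html><body><a href="subpage.html">more</a></body></html>'
links = extract_links(
    pagecontent=html,
    full_url="https://www.example.org/dir/page.html",  # full URL, not just the host
    external_bool=False,  # internal links only
)
# relative links should now be resolved against the full page URL,
# e.g. https://www.example.org/dir/subpage.html
print(links)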
19 changes: 13 additions & 6 deletions courlan/core.py
@@ -25,7 +25,13 @@
)
from .network import redirection_test
from .settings import BLACKLIST
from .urlutils import extract_domain, fix_relative_urls, is_external, is_known_link
from .urlutils import (
extract_domain,
get_base_url,
fix_relative_urls,
is_external,
is_known_link,
)


LOGGER = logging.getLogger(__name__)
@@ -120,7 +126,7 @@ def check_url(

def extract_links(
pagecontent: str,
base_url: str,
full_url: str,
external_bool: bool,
no_filter: bool = False,
language: Optional[str] = None,
@@ -132,7 +138,7 @@ def extract_links(
"""Filter links in a HTML document using a series of heuristics
Args:
pagecontent: whole page in binary format
base_url: beginning of the URL, without path, fragment and query
full_url: full URL of the page
external_bool: set to True for external links only, False for
internal links only
no_filter: override settings and bypass checks to return all possible URLs
@@ -148,6 +154,7 @@
Raises:
Nothing.
"""
base_url = get_base_url(full_url)
candidates, validlinks = set(), set() # type: Set[str], Set[str]
if not pagecontent:
return validlinks
@@ -175,7 +182,7 @@
for link in candidates:
# repair using base
if not link.startswith("http"):
link = fix_relative_urls(base_url, link)
link = fix_relative_urls(full_url, link)
# check
if no_filter is False:
checked = check_url(
Expand Down Expand Up @@ -203,7 +210,7 @@ def extract_links(

def filter_links(
htmlstring: str,
base_url: str,
full_url: str,
lang: Optional[str] = None,
rules: Optional[RobotFileParser] = None,
external: bool = False,
@@ -214,7 +221,7 @@
links, links_priority = [], []
for link in extract_links(
pagecontent=htmlstring,
base_url=base_url,
full_url=full_url,
external_bool=external,
language=lang,
strict=strict,
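
For context on the new get_base_url() call added above: the base URL keeps only the beginning of the URL, while relative links are now repaired against the page's full URL. A rough illustration, with the output shown as expected rather than verified here:

from courlan.urlutils import get_base_url

full_url = "https://www.example.org/dir/subdir/page.html?q=1"
# the base URL drops path, query and fragment (expected: https://www.example.org),
# whereas relative links are fixed against the full page URL
print(get_base_url(full_url))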
7 changes: 4 additions & 3 deletions courlan/urlstore.py
@@ -30,7 +30,7 @@
from .core import filter_links
from .filters import lang_filter, validate_url
from .meta import clear_caches
from .urlutils import get_host_and_path, is_known_link
from .urlutils import get_base_url, get_host_and_path, is_known_link


LOGGER = logging.getLogger(__name__)
@@ -235,17 +235,18 @@ def add_urls(
def add_from_html(
self,
htmlstring: str,
base_url: str,
full_url: str,
external: bool = False,
lang: Optional[str] = None,
with_nav: bool = True,
) -> None:
"Find links in a HTML document, filter them and add them to the data store."
# lang = lang or self.language
base_url = get_base_url(full_url)
rules = self.get_rules(base_url)
links, links_priority = filter_links(
htmlstring=htmlstring,
base_url=base_url,
full_url=full_url,
external=external,
lang=lang or self.language,
rules=rules,
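
A minimal sketch of the corresponding UrlStore usage after this change; the HTML string and page URL are illustrative only:

from courlan.urlstore import UrlStore

store = UrlStore()
html = '<html><body><a href="./about.html">about</a></body></html>'
# the page's full URL is passed in; stored robots.txt rules, if any,
# are still looked up by host, while relative links use the full URL
store.add_from_html(htmlstring=html, full_url="https://www.example.org/blog/post.html")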
24 changes: 10 additions & 14 deletions courlan/urlutils.py
@@ -6,7 +6,7 @@

from functools import lru_cache
from typing import Any, List, Optional, Set, Tuple, Union
from urllib.parse import urlparse, urlunsplit, ParseResult
from urllib.parse import urljoin, urlparse, urlsplit, urlunsplit, ParseResult

from tld import get_tld

@@ -114,19 +114,15 @@ def get_hostinfo(url: str) -> Tuple[Optional[str], str]:

def fix_relative_urls(baseurl: str, url: str) -> str:
"Prepend protocol and host information to relative links."
if url.startswith("//"):
return "https:" + url if baseurl.startswith("https") else "http:" + url
if url.startswith("/"):
# imperfect path handling
return baseurl + url
if url.startswith("."):
# don't try to correct these URLs
return baseurl + "/" + INNER_SLASH_REGEX.sub("", url)
if not url.startswith(("http", "{")):
return baseurl + "/" + url
# todo: handle here
# if url.startswith('{'):
return url
if url.startswith("{"):
return url
base_netloc = urlsplit(baseurl).netloc
split_url = urlsplit(url)
if split_url.netloc not in (base_netloc, ""):
if split_url.scheme:
return url
return urlunsplit(split_url._replace(scheme="http"))
return urljoin(baseurl, url)


def filter_urls(link_list: List[str], urlfilter: Optional[str]) -> List[str]:
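
Because the rewrite delegates path resolution to urljoin(), relative links now follow standard trailing-slash semantics, which the new tests below exercise. A standalone illustration:

from urllib.parse import urljoin

# "subdir" without a trailing slash is treated as a file, so the link
# resolves to a sibling path; with the slash it is treated as a directory
print(urljoin("https://www.example.org/dir/subdir", "relative"))
# https://www.example.org/dir/relative
print(urljoin("https://www.example.org/dir/subdir/", "relative"))
# https://www.example.org/dir/subdir/relative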
44 changes: 44 additions & 0 deletions tests/unit_tests.py
@@ -99,6 +99,50 @@ def test_fix_relative():
fix_relative_urls("https://example.org", "../../test.html")
== "https://example.org/test.html"
)
# sub-directories
assert (
fix_relative_urls("https://www.example.org/dir/subdir/file.html", "/absolute")
== "https://www.example.org/absolute"
)
assert (
fix_relative_urls("https://www.example.org/dir/subdir/file.html", "relative")
== "https://www.example.org/dir/subdir/relative"
)
assert (
fix_relative_urls("https://www.example.org/dir/subdir/", "relative")
== "https://www.example.org/dir/subdir/relative"
)
assert (
fix_relative_urls("https://www.example.org/dir/subdir", "relative")
== "https://www.example.org/dir/relative"
)
# non-relative URLs
assert (
fix_relative_urls("https://example.org", "https://www.eff.org")
== "https://www.eff.org"
)
assert (
fix_relative_urls("https://example.org", "//www.eff.org")
== "http://www.eff.org"
)
# looks like an absolute URL but is actually a valid relative URL
assert (
fix_relative_urls("https://example.org", "www.eff.org")
== "https://example.org/www.eff.org"
)
# misc
assert (
fix_relative_urls("https://www.example.org/dir/subdir/file.html", "./this:that")
== "https://www.example.org/dir/subdir/this:that"
)
assert (
fix_relative_urls("https://www.example.org/test.html?q=test#frag", "foo.html?q=bar#baz")
== "https://www.example.org/foo.html?q=bar#baz"
)
assert (
fix_relative_urls("https://www.example.org", "{privacy}")
== "{privacy}"
)


def test_scrub():
