From 7886777601e1b2c1fa4002ef30578b8e14630543 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Mon, 3 Jul 2023 17:12:02 +0200
Subject: [PATCH 1/7] review extract_links compatibility

---
 courlan/core.py     | 21 +++++++++++++--------
 courlan/urlstore.py |  6 +++---
 tests/unit_tests.py | 25 ++++++++++++++++++++++---
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/courlan/core.py b/courlan/core.py
index 595b8b1..bfc5bf7 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -126,19 +126,22 @@ def check_url(
 
 def extract_links(
     pagecontent: str,
-    full_url: str,
-    external_bool: bool,
+    url: Optional[str] = None,
+    base_url: Optional[str] = None,
+    external_bool: bool = False,
     no_filter: bool = False,
     language: Optional[str] = None,
     strict: bool = True,
     with_nav: bool = False,
     redirects: bool = False,
     reference: Optional[str] = None,
+
 ) -> Set[str]:
     """Filter links in a HTML document using a series of heuristics
     Args:
         pagecontent: whole page in binary format
-        full_url: full URL of the page
+        url: full URL of the original page
+        base_url: deprecated, legacy only
         external_bool: set to True for external links only, False for
                   internal links only
         no_filter: override settings and bypass checks to return all possible URLs
@@ -154,7 +157,8 @@ def extract_links(
     Raises:
         Nothing.
     """
-    base_url = get_base_url(full_url)
+    base_url = base_url or get_base_url(url)
+    url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
     if not pagecontent:
         return validlinks
@@ -182,7 +186,7 @@ def extract_links(
     for link in candidates:
         # repair using base
         if not link.startswith("http"):
-            link = fix_relative_urls(full_url, link)
+            link = fix_relative_urls(url, link)
         # check
         if no_filter is False:
             checked = check_url(
@@ -194,7 +198,7 @@ def extract_links(
             )
             if checked is None:
                 continue
-            link = checked[0]
+            link = checked[0].rstrip("/")
             # external/internal links
             if external_bool != is_external(
                 url=link, reference=reference, ignore_suffix=True
@@ -210,7 +214,7 @@ def extract_links(
 
 def filter_links(
     htmlstring: str,
-    full_url: str,
+    url: Optional[str],
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
@@ -219,9 +223,10 @@ def filter_links(
 ) -> Tuple[List[str], List[str]]:
     "Find links in a HTML document, filter them and add them to the data store."
     links, links_priority = [], []
+    url = url or base_url
     for link in extract_links(
         pagecontent=htmlstring,
-        full_url=full_url,
+        url=url,
         external_bool=external,
         language=lang,
         strict=strict,
diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index 80b0ebf..d0dfe6a 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -235,18 +235,18 @@ def add_urls(
     def add_from_html(
         self,
         htmlstring: str,
-        full_url: str,
+        url: str,
         external: bool = False,
         lang: Optional[str] = None,
         with_nav: bool = True,
     ) -> None:
         "Find links in a HTML document, filter them and add them to the data store."
         # lang = lang or self.language
-        base_url = get_base_url(full_url)
+        base_url = get_base_url(url)
         rules = self.get_rules(base_url)
         links, links_priority = filter_links(
             htmlstring=htmlstring,
-            full_url=full_url,
+            url=url,
             external=external,
             lang=lang or self.language,
             rules=rules,
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 7598892..49e14c5 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -816,6 +816,21 @@ def test_extraction():
         "https://httpbin.org/links/2/0",
         "https://httpbin.org/links/2/1",
     ]
+    links = extract_links(
+        pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+    )
+    assert sorted(links) == [
+        "https://httpbin.org/links/2/0",
+        "https://httpbin.org/links/2/1",
+    ]
+    pagecontent = "<html><head><title>Links</title></head><body><a href='links/2/0'>0</a> <a href='links/2/1'>1</a> </body></html>"
+    links = extract_links(
+        pagecontent, url="https://httpbin.org/page1/", external_bool=False, with_nav=True
+    )
+    assert sorted(links) == [
+        "https://httpbin.org/page1/links/2/0",
+        "https://httpbin.org/page1/links/2/1",
+    ]
     pagecontent = "<html><head><title>Pages</title></head><body><a href='/page/10'>10</a> <a href='/page/?=11'>11</a></body></html>"
     assert (
         extract_links(
@@ -835,7 +850,7 @@ def test_extraction():
         with_nav=True,
     )
     assert sorted(links) == [
-        "https://example.org/page/",  # parameter stripped by strict filtering
+        "https://example.org/page",  # parameter stripped by strict filtering
         "https://example.org/page/10",
     ]
     links = extract_links(
@@ -889,8 +904,12 @@ def test_extraction():
     base_url = "https://example.org"
     htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
     links, links_priority = filter_links(htmlstring, base_url)
-    assert len(links) == 1
-    assert not links_priority
+    assert links == ['https://example.org/page1'] and not links_priority
+    # link filtering with relative URLs
+    url = "https://example.org/page1.html"
+    htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
+    links, links_priority = filter_links(htmlstring, url=url)
+    assert links == ['https://example.org/subpage1'] and not links_priority
 
 
 def test_cli():

From 74960aacd557ce92192e189176c6db633146a907 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Mon, 3 Jul 2023 17:49:23 +0200
Subject: [PATCH 2/7] fix tests

---
 courlan/core.py         | 2 +-
 tests/unit_tests.py     | 9 ++++++---
 tests/urlstore_tests.py | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/courlan/core.py b/courlan/core.py
index bfc5bf7..3ea18bd 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -135,7 +135,6 @@ def extract_links(
     with_nav: bool = False,
     redirects: bool = False,
     reference: Optional[str] = None,
-
 ) -> Set[str]:
     """Filter links in a HTML document using a series of heuristics
     Args:
@@ -215,6 +214,7 @@ def extract_links(
 def filter_links(
     htmlstring: str,
     url: Optional[str],
+    base_url: Optional[str] = None,
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 49e14c5..8d063ed 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -825,7 +825,10 @@ def test_extraction():
     ]
     pagecontent = "<html><head><title>Links</title></head><body><a href='links/2/0'>0</a> <a href='links/2/1'>1</a> </body></html>"
     links = extract_links(
-        pagecontent, url="https://httpbin.org/page1/", external_bool=False, with_nav=True
+        pagecontent,
+        url="https://httpbin.org/page1/",
+        external_bool=False,
+        with_nav=True,
     )
     assert sorted(links) == [
         "https://httpbin.org/page1/links/2/0",
@@ -904,12 +907,12 @@ def test_extraction():
     base_url = "https://example.org"
     htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
     links, links_priority = filter_links(htmlstring, base_url)
-    assert links == ['https://example.org/page1'] and not links_priority
+    assert links == ["https://example.org/page1"] and not links_priority
     # link filtering with relative URLs
     url = "https://example.org/page1.html"
     htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
     links, links_priority = filter_links(htmlstring, url=url)
-    assert links == ['https://example.org/subpage1'] and not links_priority
+    assert links == ["https://example.org/subpage1"] and not links_priority
 
 
 def test_cli():
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index 9faa1cc..c432460 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -397,7 +397,7 @@ def test_from_html():
     url_store.add_from_html(htmlstring, base_url, lang="en")
     todo = url_store.find_unvisited_urls(base_url)
     known_links = url_store.find_known_urls(base_url)
-    assert "https://example.org/en/page1/" in todo and len(known_links) == 4
+    assert "https://example.org/en/page1" in todo and len(known_links) == 4
     # wrong language
     htmlstring = '<html><body><a href="https://example.org/en/page2"/></body></html>'
     url_store.add_from_html(htmlstring, base_url, lang="de")

From 7a8884fa4e330eaba715eb6fdc7e2ff1626f916d Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Wed, 5 Jul 2023 15:44:34 +0200
Subject: [PATCH 3/7] remove KeyError in is_exhausted_domain()

---
 courlan/urlstore.py     | 3 ++-
 tests/urlstore_tests.py | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index d0dfe6a..efb8f32 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -293,7 +293,8 @@ def is_exhausted_domain(self, domain: str) -> bool:
         "Tell if all known URLs for the website have been visited."
         if domain in self.urldict:
             return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)
-        raise KeyError("website not in store")
+        return False
+        # raise KeyError("website not in store")
 
     def unvisited_websites_number(self) -> int:
         "Return the number of websites for which there are still URLs to visit."
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index c432460..3a62ed1 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -170,8 +170,9 @@ def test_urlstore():
     assert my_urls.urldict["https://visited.com"].tuples[1].visited is False
     assert my_urls.urldict["https://visited.com"].state is State.OPEN
     assert my_urls.is_exhausted_domain("https://visited.com") is False
-    with pytest.raises(KeyError):
-        assert my_urls.is_exhausted_domain("https://visited2.com") is True
+    #with pytest.raises(KeyError):
+    #    assert my_urls.is_exhausted_domain("https://visited2.com") is True
+    assert my_urls.is_exhausted_domain("https://visited2.com") is False
     # revert changes for further tests
     del my_urls.urldict["https://visited.com"].tuples[1]
     my_urls.urldict["https://visited.com"].state = State.ALL_VISITED

From a3bba4ccf0e895fe463b8fff622ead13146a24bd Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Mon, 10 Jul 2023 17:06:03 +0200
Subject: [PATCH 4/7] unify: remove trailing slashes

---
 courlan/clean.py        | 10 ++++++----
 courlan/core.py         |  2 +-
 courlan/urlstore.py     |  4 ++++
 tests/urlstore_tests.py |  8 ++++----
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index 3750674..37e742a 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -37,7 +37,7 @@
 
 
 def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
-    """Helper function: chained scrubbing and normalization"""
+    "Helper function: chained scrubbing and normalization"
     try:
         return normalize_url(scrub_url(url), False, language)
     except (AttributeError, ValueError):
@@ -45,7 +45,7 @@ def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
 
 
 def scrub_url(url: str) -> str:
-    """Strip unnecessary parts and make sure only one URL is considered"""
+    "Strip unnecessary parts and make sure only one URL is considered"
     # trim
     # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
     # remove leading and trailing white space and unescaped control chars
@@ -100,7 +100,7 @@ def scrub_url(url: str) -> str:
 def clean_query(
     parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
 ) -> str:
-    """Strip unwanted query elements"""
+    "Strip unwanted query elements"
     if len(parsed_url.query) > 0:
         qdict = parse_qs(parsed_url.query)
         newqdict = {}
@@ -152,7 +152,7 @@ def normalize_url(
     strict: bool = False,
     language: Optional[str] = None,
 ) -> str:
-    """Takes a URL string or a parsed URL and returns a (basically) normalized URL string"""
+    "Takes a URL string or a parsed URL and returns a normalized URL string"
     parsed_url = _parse(parsed_url)
     # lowercase + remove fragments + normalize punycode
     scheme = parsed_url.scheme.lower()
@@ -170,6 +170,8 @@ def normalize_url(
     newquery = clean_query(parsed_url, strict, language) or ""
     if newquery and newpath == "":
         newpath = "/"
+    elif not newquery and len(newpath) > 1 and newpath.endswith("/"):
+        newpath = newpath.rstrip("/")
     # fragment
     newfragment = "" if strict else parsed_url.fragment
     # rebuild
diff --git a/courlan/core.py b/courlan/core.py
index 3ea18bd..8f96806 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -197,7 +197,7 @@ def extract_links(
             )
             if checked is None:
                 continue
-            link = checked[0].rstrip("/")
+            link = checked[0]
             # external/internal links
             if external_bool != is_external(
                 url=link, reference=reference, ignore_suffix=True
diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index efb8f32..caf383b 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -27,6 +27,7 @@
 
 from urllib.robotparser import RobotFileParser
 
+from .clean import normalize_url
 from .core import filter_links
 from .filters import lang_filter, validate_url
 from .meta import clear_caches
@@ -115,6 +116,9 @@ def _buffer_urls(
                 ):
                     LOGGER.debug("Wrong language: %s", url)
                     raise ValueError
+                parsed_url = normalize_url(
+                    parsed_url, strict=self.strict, language=self.language
+                )
                 hostinfo, urlpath = get_host_and_path(parsed_url)
                 inputdict[hostinfo].append(UrlPathTuple(urlpath, visited))
             except (TypeError, ValueError):
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index 3a62ed1..e7412ca 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -170,7 +170,7 @@ def test_urlstore():
     assert my_urls.urldict["https://visited.com"].tuples[1].visited is False
     assert my_urls.urldict["https://visited.com"].state is State.OPEN
     assert my_urls.is_exhausted_domain("https://visited.com") is False
-    #with pytest.raises(KeyError):
+    # with pytest.raises(KeyError):
     #    assert my_urls.is_exhausted_domain("https://visited2.com") is True
     assert my_urls.is_exhausted_domain("https://visited2.com") is False
     # revert changes for further tests
@@ -186,7 +186,7 @@ def test_urlstore():
     my_urls.add_urls(appendleft=extension_urls)
     url_tuples = my_urls._load_urls(example_domain)
     assert len(url_tuples) == len(example_urls) + 11
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
+    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
 
     # duplicates
     my_urls.add_urls(extension_urls)
@@ -194,7 +194,7 @@ def test_urlstore():
     assert len(my_urls._load_urls(example_domain)) == len(example_urls) + len(
         extension_urls
     )
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
+    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
 
     # get_url
     assert my_urls.urldict[example_domain].timestamp is None
@@ -204,7 +204,7 @@ def test_urlstore():
     timestamp = my_urls.urldict[example_domain].timestamp
     sleep(0.1)
     url2 = my_urls.get_url(example_domain)
-    assert url1 != url2 and url1 == "https://www.example.org/1/10/"
+    assert url1 != url2 and url1 == "https://www.example.org/1/10"
     assert my_urls.urldict[example_domain].count == 2
     assert timestamp != my_urls.urldict[example_domain].timestamp
     assert url2 not in set(my_urls.find_unvisited_urls(example_domain))

From 9e193cdac7898d2ba4bf0b6808975b5f8e76a05a Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Mon, 10 Jul 2023 17:28:46 +0200
Subject: [PATCH 5/7] adjust lang filter

---
 courlan/filters.py  | 4 ++--
 tests/unit_tests.py | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/courlan/filters.py b/courlan/filters.py
index a2da045..f65301a 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -43,9 +43,9 @@
 
 # language filter
 PATH_LANG_FILTER = re.compile(
-    r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/)", re.IGNORECASE
+    r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/|$)", re.IGNORECASE
 )
-ALL_PATH_LANGS = re.compile(r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/)", re.IGNORECASE)
+ALL_PATH_LANGS = re.compile(r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/|$)", re.IGNORECASE)
 HOST_LANG_FILTER = re.compile(
     r"https?://([a-z]{2})\.(?:[^.]{4,})\.(?:[^.]+)(?:\.[^.]+)?/", re.IGNORECASE
 )
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 8d063ed..24269fd 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -310,6 +310,10 @@ def test_path_filter():
 
 
 def test_lang_filter():
+    assert lang_filter("http://test.com/az", "de") is False
+    assert lang_filter("http://test.com/az/", "de") is False
+    assert lang_filter("http://test.com/de", "de") is True
+    assert lang_filter("http://test.com/de/", "de") is True
     assert (
         lang_filter(
             "https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377",

From 3b376e9561c11996f46b6ffb767d856f85784883 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Thu, 13 Jul 2023 16:17:10 +0200
Subject: [PATCH 6/7] leave trailing slashes for now

---
 courlan/clean.py        | 2 --
 courlan/filters.py      | 4 ++--
 tests/unit_tests.py     | 4 +---
 tests/urlstore_tests.py | 8 ++++----
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index 37e742a..194368c 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -170,8 +170,6 @@ def normalize_url(
     newquery = clean_query(parsed_url, strict, language) or ""
     if newquery and newpath == "":
         newpath = "/"
-    elif not newquery and len(newpath) > 1 and newpath.endswith("/"):
-        newpath = newpath.rstrip("/")
     # fragment
     newfragment = "" if strict else parsed_url.fragment
     # rebuild
diff --git a/courlan/filters.py b/courlan/filters.py
index f65301a..a2da045 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -43,9 +43,9 @@
 
 # language filter
 PATH_LANG_FILTER = re.compile(
-    r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/|$)", re.IGNORECASE
+    r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/)", re.IGNORECASE
 )
-ALL_PATH_LANGS = re.compile(r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/|$)", re.IGNORECASE)
+ALL_PATH_LANGS = re.compile(r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/)", re.IGNORECASE)
 HOST_LANG_FILTER = re.compile(
     r"https?://([a-z]{2})\.(?:[^.]{4,})\.(?:[^.]+)(?:\.[^.]+)?/", re.IGNORECASE
 )
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 24269fd..9ea6f40 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -310,9 +310,7 @@ def test_path_filter():
 
 
 def test_lang_filter():
-    assert lang_filter("http://test.com/az", "de") is False
     assert lang_filter("http://test.com/az/", "de") is False
-    assert lang_filter("http://test.com/de", "de") is True
     assert lang_filter("http://test.com/de/", "de") is True
     assert (
         lang_filter(
@@ -857,7 +855,7 @@ def test_extraction():
         with_nav=True,
     )
     assert sorted(links) == [
-        "https://example.org/page",  # parameter stripped by strict filtering
+        "https://example.org/page/",  # parameter stripped by strict filtering
         "https://example.org/page/10",
     ]
     links = extract_links(
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index e7412ca..76ab034 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -186,7 +186,7 @@ def test_urlstore():
     my_urls.add_urls(appendleft=extension_urls)
     url_tuples = my_urls._load_urls(example_domain)
     assert len(url_tuples) == len(example_urls) + 11
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
+    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
 
     # duplicates
     my_urls.add_urls(extension_urls)
@@ -194,7 +194,7 @@ def test_urlstore():
     assert len(my_urls._load_urls(example_domain)) == len(example_urls) + len(
         extension_urls
     )
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
+    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
 
     # get_url
     assert my_urls.urldict[example_domain].timestamp is None
@@ -204,7 +204,7 @@ def test_urlstore():
     timestamp = my_urls.urldict[example_domain].timestamp
     sleep(0.1)
     url2 = my_urls.get_url(example_domain)
-    assert url1 != url2 and url1 == "https://www.example.org/1/10"
+    assert url1 != url2 and url1 == "https://www.example.org/1/10/"
     assert my_urls.urldict[example_domain].count == 2
     assert timestamp != my_urls.urldict[example_domain].timestamp
     assert url2 not in set(my_urls.find_unvisited_urls(example_domain))
@@ -398,7 +398,7 @@ def test_from_html():
     url_store.add_from_html(htmlstring, base_url, lang="en")
     todo = url_store.find_unvisited_urls(base_url)
     known_links = url_store.find_known_urls(base_url)
-    assert "https://example.org/en/page1" in todo and len(known_links) == 4
+    assert "https://example.org/en/page1/" in todo and len(known_links) == 4
     # wrong language
     htmlstring = '<html><body><a href="https://example.org/en/page2"/></body></html>'
     url_store.add_from_html(htmlstring, base_url, lang="de")

From 6ada75cc9021319fcf968f28930b633f1d99c32c Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Thu, 13 Jul 2023 16:51:37 +0200
Subject: [PATCH 7/7] temp fix for tests

---
 tests/unit_tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 9ea6f40..6bd40c9 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -909,12 +909,12 @@ def test_extraction():
     base_url = "https://example.org"
     htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
     links, links_priority = filter_links(htmlstring, base_url)
-    assert links == ["https://example.org/page1"] and not links_priority
+    assert len(links) == 1 and not links_priority
     # link filtering with relative URLs
     url = "https://example.org/page1.html"
     htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
     links, links_priority = filter_links(htmlstring, url=url)
-    assert links == ["https://example.org/subpage1"] and not links_priority
+    assert len(links) == 1 and not links_priority
 
 
 def test_cli():