From 2e9df31f60f6635a876cbe35b03dfc3655252f77 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 20 Nov 2023 17:29:31 +0100 Subject: [PATCH] add function is_valid_url() (#63) --- courlan/__init__.py | 8 +++++++- courlan/filters.py | 5 +++++ tests/unit_tests.py | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/courlan/__init__.py b/courlan/__init__.py index 62c51f2..290761e 100644 --- a/courlan/__init__.py +++ b/courlan/__init__.py @@ -14,7 +14,13 @@ # imports from .clean import clean_url, normalize_url, scrub_url from .core import check_url, extract_links -from .filters import is_navigation_page, is_not_crawlable, lang_filter, validate_url +from .filters import ( + is_navigation_page, + is_not_crawlable, + is_valid_url, + lang_filter, + validate_url, +) from .sampling import sample_urls from .urlstore import UrlStore from .urlutils import ( diff --git a/courlan/filters.py b/courlan/filters.py index 28847e0..6006f24 100644 --- a/courlan/filters.py +++ b/courlan/filters.py @@ -239,6 +239,11 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]: return True, parsed_url +def is_valid_url(url: Optional[str]) -> bool: + "Determine if a given string is a valid URL." + return validate_url(url)[0] + + def is_navigation_page(url: str) -> bool: """Determine if the URL is related to navigation and overview pages rather than content pages, e.g. /page/1 vs. article page.""" diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 7a8cdd1..2140d38 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -27,6 +27,7 @@ is_external, sample_urls, validate_url, + is_valid_url, extract_links, extract_domain, filter_urls, @@ -463,6 +464,9 @@ def test_validate(): assert validate_url("http://test.org/test")[0] is True # assert validate_url("http://sub.-mkyong.com/test")[0] is False + assert not is_valid_url("http://www.test[.org/test") + assert is_valid_url("http://test.org/test") + def test_normalization(): assert normalize_url("HTTPS://WWW.DWDS.DE/") == "https://www.dwds.de/"