Skip to content

Commit

Permalink
add function is_valid_url() (#63)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Nov 20, 2023
1 parent 1c6ee48 commit 2e9df31
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 1 deletion.
8 changes: 7 additions & 1 deletion courlan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
# imports
from .clean import clean_url, normalize_url, scrub_url
from .core import check_url, extract_links
from .filters import is_navigation_page, is_not_crawlable, lang_filter, validate_url
from .filters import (
is_navigation_page,
is_not_crawlable,
is_valid_url,
lang_filter,
validate_url,
)
from .sampling import sample_urls
from .urlstore import UrlStore
from .urlutils import (
Expand Down
5 changes: 5 additions & 0 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,11 @@ def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
return True, parsed_url


def is_valid_url(url: Optional[str]) -> bool:
    """Tell whether the given string is a valid URL.

    Thin boolean wrapper around `validate_url`: only the validity flag
    (first element of its result) is returned; the second element is
    discarded.
    """
    verdict, _ = validate_url(url)
    return verdict


def is_navigation_page(url: str) -> bool:
"""Determine if the URL is related to navigation and overview pages
rather than content pages, e.g. /page/1 vs. article page."""
Expand Down
4 changes: 4 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
is_external,
sample_urls,
validate_url,
is_valid_url,
extract_links,
extract_domain,
filter_urls,
Expand Down Expand Up @@ -463,6 +464,9 @@ def test_validate():
assert validate_url("http://test.org/test")[0] is True
# assert validate_url("http://sub.-mkyong.com/test")[0] is False

assert not is_valid_url("http://www.test[.org/test")
assert is_valid_url("http://test.org/test")


def test_normalization():
assert normalize_url("HTTPS://WWW.DWDS.DE/") == "https://www.dwds.de/"
Expand Down

0 comments on commit 2e9df31

Please sign in to comment.