Skip to content

Commit

Permalink
Remove unreserved characters from the beginning of found URL
Browse files Browse the repository at this point in the history
fixes #131
  • Loading branch information
lipoja committed Oct 21, 2022
1 parent 6bc2f30 commit 1159482
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 2 deletions.
4 changes: 2 additions & 2 deletions tests/unit/test_find_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
["https://example.com/what.com"],
),
(
"https://i2.wp.com/siliconfilter.com/2011/06/example.jpg",
["https://i2.wp.com/siliconfilter.com/2011/06/example.jpg"],
"* test link -https://www.example.com",
["https://www.example.com"],
),
(
"https://www.test.org/paper/apostrophe'in-url",
Expand Down
3 changes: 3 additions & 0 deletions urlextract/urlextract_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,9 @@ def _complete_url(
# URL should not start with two forward slashes
if complete_url.startswith("//"):
complete_url = complete_url[2:]
# URL should not start with an unreserved punctuation character (-, ., ~, _)
if complete_url.startswith(("-", ".", "~", "_")):
complete_url = complete_url[1:]
if not self._is_domain_valid(
complete_url, tld, check_dns=check_dns, with_schema_only=with_schema_only
):
Expand Down

0 comments on commit 1159482

Please sign in to comment.