Skip to content

Commit

Permalink
Accept only ASCII characters to the left of the TLD.
Browse files: browse the repository at this point in the history
  • Loading branch information
lipoja committed Feb 26, 2024
1 parent 1cda63c commit 562bf74
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
4 changes: 3 additions & 1 deletion tests/unit/test_find_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
@pytest.mark.parametrize(
"text, expected",
[
("%sexample.com" % chr(8231), ["example.com"]),
("some%sdomain.example.com" % chr(8231), ["domain.example.com"]),
("Let's have URL http://janlipovsky.cz", ["http://janlipovsky.cz"]),
("Let's have text without URLs.", []),
("Dot after TLD: http://janlipovsky.cz.", ["http://janlipovsky.cz"]),
Expand Down Expand Up @@ -57,7 +59,7 @@
"<script src='//www.example.com/somejsfile.js'>",
["www.example.com/somejsfile.js"],
),
("bad.email @address.net>", ['bad.email']),
("bad.email @address.net>", ["bad.email"]),
('[[ "$(giturl)" =~ ^https://gitlab.com ]] echo "found" || echo "didnt', []),
],
)
Expand Down
7 changes: 6 additions & 1 deletion urlextract/urlextract_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,12 @@ def _complete_url(
and text[start_pos - 1] in self._stop_chars_left_from_schema
):
left_ok = False
if left_ok and text[start_pos - 1] not in self._stop_chars_left:
if (
left_ok
and text[start_pos - 1] not in self._stop_chars_left
# Allow only ASCII characters in authority and schema
and ord(text[start_pos - 1]) <= 127
):
start_pos -= 1
else:
left_ok = False
Expand Down

0 comments on commit 562bf74

Please sign in to comment.