From d6c88e35644a41399c872d2dd01ce6a1aa7c998c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lipovsk=C3=BD?= Date: Wed, 14 Dec 2022 23:04:29 +0100 Subject: [PATCH] Adding the ability to set stop characters inside of scheme - default stop chars ':' fixes #82 --- tests/unit/test_find_urls.py | 43 +++++++++++++++++++++++++++++++++++ urlextract/urlextract_core.py | 39 ++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_find_urls.py b/tests/unit/test_find_urls.py index 9ee46eb..952d761 100644 --- a/tests/unit/test_find_urls.py +++ b/tests/unit/test_find_urls.py @@ -153,3 +153,46 @@ def test_find_urls_schema_only(urlextract, text, expected): :param list(str) expected: list of URLs that has to be found in text """ assert urlextract.find_urls(text, with_schema_only=True) == expected + + +@pytest.mark.parametrize( + "text, expected", + [ + ("multiple protocols, job:https://example.co", ["https://example.co"]), + ( + "more multiple protocols, link:job:https://example.com/r", + ["https://example.com/r"], + ), + ("svn+ssh://example.com", ["svn+ssh://example.com"]), + ], +) +def test_find_urls_multiple_protocol(urlextract, text, expected): + """ + Testing find_urls returning all URLs + + :param fixture urlextract: fixture holding URLExtract object + :param str text: text in which we should find links + :param list(str) expected: list of URLs that has to be found in text + """ + assert urlextract.find_urls(text) == expected + + +@pytest.mark.parametrize( + "text, expected", + [ + ("svn+ssh://example.com", ["ssh://example.com"]), + ("multiple protocols, job:https://example.co", ["https://example.co"]), + ("test link:job:https://example.com/r", ["https://example.com/r"]), + ], +) +def test_find_urls_multiple_protocol_custom(urlextract, text, expected): + """ + Testing find_urls returning all URLs + + :param fixture urlextract: fixture holding URLExtract object + :param str text: text in which we should find links + :param list(str) expected: list of URLs that has to be found in text + """ + stop_chars = urlextract.get_stop_chars_left_from_scheme() | {"+"} + urlextract.set_stop_chars_left_from_scheme(stop_chars) + assert urlextract.find_urls(text) == expected diff --git a/urlextract/urlextract_core.py b/urlextract/urlextract_core.py index 8283024..24bceb0 100644 --- a/urlextract/urlextract_core.py +++ b/urlextract/urlextract_core.py @@ -111,6 +111,9 @@ def __init__( self._stop_chars_left = set(string.whitespace) self._stop_chars_left |= general_stop_chars | {"|", "=", "]", ")", "}"} + # default stop characters on left side from schema + self._stop_chars_left_from_schema = self._stop_chars_left.copy() | {":"} + # defining default stop chars left self._stop_chars_right = set(string.whitespace) self._stop_chars_right |= general_stop_chars @@ -334,6 +337,31 @@ def set_stop_chars_left(self, stop_chars: Set[str]): self._stop_chars_left = stop_chars + def get_stop_chars_left_from_scheme(self) -> Set[str]: + """ + Returns set of stop chars for text on left from TLD. + + :return: set of stop chars + :rtype: set + """ + return self._stop_chars_left_from_schema + + def set_stop_chars_left_from_scheme(self, stop_chars: Set[str]): + """ + Set stop characters for text on left from scheme. + Stop characters are used when determining end of URL. + + :param set stop_chars: set of characters + :raises: TypeError + """ + if not isinstance(stop_chars, set): + raise TypeError( + "stop_chars should be type set " + "but {} was given".format(type(stop_chars)) + ) + + self._stop_chars_left_from_schema = stop_chars + def get_stop_chars_right(self) -> Set[str]: """ Returns set of stop chars for text on right from TLD. @@ -420,12 +448,18 @@ def _complete_url( max_len = len(text) - 1 end_pos = tld_pos start_pos = tld_pos + in_scheme = False while left_ok or right_ok: if left_ok: if start_pos <= 0: left_ok = False else: - if text[start_pos - 1] not in self._stop_chars_left: + if ( + in_scheme + and text[start_pos - 1] in self._stop_chars_left_from_schema + ): + left_ok = False + if left_ok and text[start_pos - 1] not in self._stop_chars_left: start_pos -= 1 else: left_ok = False @@ -438,6 +472,9 @@ def _complete_url( else: right_ok = False + if text[start_pos : start_pos + 3] == "://": + in_scheme = True + complete_url = text[start_pos : end_pos + 1].lstrip("/") # remove last character from url # when it is allowed character right after TLD (e.g. dot, comma)