From 25b4f4910bccab37e4232abeb9f151f2d60dcf7b Mon Sep 17 00:00:00 2001 From: funilrys Date: Tue, 2 Oct 2018 22:43:42 +0200 Subject: [PATCH] Fix false positive extraction of the adblock decoder. This patch fixes #13. Reverse: * Of the last patch for the way we check for URL. Introduction: * Of new test cases. * Of the force update for all versions which are older than `0.94.3`. * Because of this patch. Review: * Of the way we extract domain and URL from the given adblock file. Deprecation: * Of all versions which are equal to or older than `0.109.0`. Thanks: * To @dnmTX * @adblockplus for their documentation * cf: https://adblockplus.org/filter-cheatsheet --- PyFunceble/__init__.py | 2 +- PyFunceble/check.py | 2 +- PyFunceble/core.py | 46 +++++++++++++++++++++++++----------------- tests/test_core.py | 31 +++++++++++----------------- version.yaml | 6 +++--- 5 files changed, 45 insertions(+), 42 deletions(-) diff --git a/PyFunceble/__init__.py b/PyFunceble/__init__.py index d0685d42..d69a352b 100644 --- a/PyFunceble/__init__.py +++ b/PyFunceble/__init__.py @@ -79,7 +79,7 @@ # We set our project name. NAME = "PyFunceble" # We set out project version. -VERSION = "0.109.0.dev-beta (Sarcoline Puku / Mosquito)" +VERSION = "0.110.0.dev-beta (Sarcoline Puku / Mosquito)" if "PYFUNCEBLE_OUTPUT_DIR" in environ: # pragma: no cover # We handle the case that the `PYFUNCEBLE_OUTPUT_DIR` environnement variable is set. diff --git a/PyFunceble/check.py b/PyFunceble/check.py index eba1d4aa..55a976d0 100644 --- a/PyFunceble/check.py +++ b/PyFunceble/check.py @@ -122,7 +122,7 @@ def is_url_valid(self, url=None, return_formated=False): try: # We initiate a regex which will match the domain or the url base. - regex = r"(^([a-z]+:\/\/)(.+?(?=\/)|.+?$))" + regex = r"(^(http:\/\/|https:\/\/)(.+?(?=\/)|.+?$))" # We extract the url base with the help of the initiated regex. 
formated_base = Regex( diff --git a/PyFunceble/core.py b/PyFunceble/core.py index af533a81..2877b2c9 100644 --- a/PyFunceble/core.py +++ b/PyFunceble/core.py @@ -637,6 +637,13 @@ def _format_adblock_decoded(cls, to_format, result=None): if data: # The currently read line is not empty. + if "^" in data: + # There is an accent in the currently read line. + + # We recall this method but with the current result state + # and splited data. + return cls._format_adblock_decoded(data.split("^"), result) + if "#" in data: # There is a dash in the currently read line. @@ -651,13 +658,6 @@ def _format_adblock_decoded(cls, to_format, result=None): # and splited data. return cls._format_adblock_decoded(data.split(","), result) - if "~" in data: - # There is a tilde in the currently read line. - - # We recall this method but with the current result state - # and splited data. - return cls._format_adblock_decoded(data.split("~"), result) - if "!" in data: # There is an exclamation mark in the currently read line. @@ -719,18 +719,25 @@ def _adblock_decode(self, list_to_test): # the element to format. regex = r"^(?:.*\|\|)([^\/\$\^]{1,}).*$" - # We initiate the second regex we are going to use to get + # We initiate the third regex we are going to use to get # the element to format. - regex_v2 = r"((.*\..*)(?:#{1,}.*))" + regex_v3 = ( + r"(?:#+(?:[a-z]+?)?\[[a-z]+(?:\^|\*)\=(?:\'|\"))(.*\..*)(?:(?:\'|\")\])" + ) - # We initiate the third regex we are going to use to get + # We initiate the fourth regex we are going to use to get # the element to format. - regex_v3 = r"(?:#+(?:[a-z]+?)?\[[a-z]+\^\=(?:\'|\"))(.*\..*)(?:(?:\'|\")\])" + regex_v4 = r"^\|(.*\..*)\|$" for line in list_to_test: # We loop through the different line. - if line.startswith("!"): + if ( + line.startswith("!") + or line.startswith("@@") + or line.startswith("/") + or line.startswith("[") + ): continue # We extract the different group from our first regex. 
@@ -738,9 +745,12 @@ def _adblock_decode(self, list_to_test): line, regex, return_data=True, rematch=True, group=0 ).match() - # We extract the different group from our second regex. - rematch_v2 = Regex( - line, regex_v2, return_data=True, rematch=True, group=0 + # We extract the different group from our fourth regex. + # + # Note: We execute the following in second because it is more + # specific that others. + rematch_v4 = Regex( + line, regex_v4, return_data=True, rematch=True, group=0 ).match() # We extract the different group from our third regex. @@ -754,11 +764,11 @@ def _adblock_decode(self, list_to_test): # We extend the result with the extracted elements. result.extend(rematch) - if rematch_v2: - # The second extraction was successfull. + if rematch_v4: + # The fourth extraction was successfull. # We extend the formated elements from the extracted elements. - result.extend(List(self._format_adblock_decoded(rematch_v2)).format()) + result.extend(List(self._format_adblock_decoded(rematch_v4)).format()) if rematch_v3: # The second extraction was successfull. 
diff --git a/tests/test_core.py b/tests/test_core.py index a7fd452d..2a7e788d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -353,8 +353,9 @@ def setUp(self): load_config(True) self.lines = [ - "||google.com$script,image", - "||twitter.com^", + "||funilrys.github.io$script,image", + "||google.com^$script,image", + "||twitter.com^helloworld.com", "||api.google.com/papi/action$popup", "facebook.com###player-above-2", "~github.com,hello.world##.wrapper", @@ -362,36 +363,28 @@ def setUp(self): "!||world.hello/*ad.xml", "bing.com,bingo.com#@##adBanner", "!@@||funceble.world/js", - "yahoo.com,msn.com,api.hello.world#@#awesomeWorld", + "yahoo.com,~msn.com,api.hello.world#@#awesomeWorld", "!funilrys.com##body", "hello#@#badads", "hubgit.com|oohay.com|ipa.elloh.dlorw#@#awesomeWorld", '##[href^="https://funceble.funilrys.com/"]', + "[AdBlock Plus 2.0]", '##div[href^="http://funilrys.com/"]', - '##[href^="ftp://funceble.funilrys.com/"]', + 'com##[href^="ftp://funceble.funilrys-funceble.com/"]', + "/banner/*/img^" "|github.io|", + "|github.io|", + "||api.funilrys.com/widget/$", ] self.expected = [ + "funilrys.github.io", "google.com", "twitter.com", "api.google.com", - "facebook.com", - "github.com", - "hello.world", - "cnn.com", - "world.hello", - "bing.com", - "bingo.com", - "funceble.world", - "api.hello.world", - "msn.com", - "yahoo.com", - "funilrys.com", - "hubgit.com", - "ipa.elloh.dlorw", - "oohay.com", "funceble.funilrys.com", "funilrys.com", + "github.io", + "api.funilrys.com", ] def test_adblock_decode(self): diff --git a/version.yaml b/version.yaml index 62a329aa..db759567 100644 --- a/version.yaml +++ b/version.yaml @@ -1,9 +1,9 @@ -current_version: 0.109.0.dev-beta (Sarcoline Puku / Mosquito) +current_version: 0.110.0.dev-beta (Sarcoline Puku / Mosquito) deprecated: [0.0.0, 0.0.1, 0.65.0, 0.67.1, 0.68.0, 0.69.3, 0.69.5, 0.70.4, 0.71.2, 0.72.7, 0.73.1, 0.74.5, 0.75.1, 0.76.2, 0.77.0, 0.78.0, 0.79.1, 0.80.9, 0.81.8, 0.82.4, 0.83.2, 0.84.5, 0.85.0, 
0.86.0, 0.87.1, 0.88.3, 0.89.3, 0.90.2, 0.91.1, 0.92.0, 0.93.0, 0.94.6, 0.95.1, 0.96.2, 0.97.1, 0.98.0, 0.99.4, 0.100.0, 0.101.23, - 0.102.0, 0.103.3, 0.104.4, 0.105.1, 0.106.2, 0.107.1, 0.108.1] + 0.102.0, 0.103.3, 0.104.4, 0.105.1, 0.106.2, 0.107.1, 0.108.1, 0.109.0] force_update: - minimal_version: [0.0.0, 0.0.1, 0.82.5, 0.89.2] + minimal_version: [0.0.0, 0.0.1, 0.82.5, 0.89.2, 0.94.3] status: true