From 651c3ac248ea281cf2221e1eeb60e0951138e67d Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 23 Nov 2023 16:49:12 +0100 Subject: [PATCH] strip common tracking parameters by default --- courlan/clean.py | 20 +++++++++++++++----- tests/unit_tests.py | 24 +++++++++++++++++------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/courlan/clean.py b/courlan/clean.py index cabf5bc..3bb799b 100644 --- a/courlan/clean.py +++ b/courlan/clean.py @@ -36,6 +36,16 @@ TRAILING_AMP = re.compile(r"/\&$") TRAILING_PARTS = re.compile(r'(.*?)[<>"\'\s]') +# https://github.com/AdguardTeam/AdguardFilters/blob/master/TrackParamFilter/sections/general_url.txt +# https://gitlab.com/ClearURLs/rules/-/blob/master/data.min.json +# https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records +TRACKERS_RE = re.compile( + r"^(?:dc|fbc|gc|twc|yc|ysc)lid|" + r"(?:click|gbra|msclk|igsh|partner|wbra)id|" + r"(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|" + r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referer|session|source|uid|xtor)" +) + def clean_url(url: str, language: Optional[str] = None) -> Optional[str]: "Helper function: chained scrubbing and normalization" @@ -108,11 +118,11 @@ def clean_query( for qelem in sorted(qdict): teststr = qelem.lower() # control param - if ( - strict - and teststr not in ALLOWED_PARAMS - and teststr not in CONTROL_PARAMS - ): + if strict: + if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS: + continue + # get rid of trackers + elif TRACKERS_RE.search(teststr): continue # control language if language is not None and teststr in CONTROL_PARAMS: diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 2140d38..777e019 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -493,10 +493,12 @@ def test_normalization(): assert ( normalize_url("https://hanxiao.io//404.html") == "https://hanxiao.io/404.html" ) + # punycode assert normalize_url("http://xn--Mnchen-3ya.de") == "http://münchen.de" assert normalize_url("http://Mnchen-3ya.de") == "http://mnchen-3ya.de" assert normalize_url("http://xn--München.de") == "http://xn--münchen.de" + # account for particular characters assert ( normalize_url( @@ -509,24 +511,32 @@ def test_normalization(): == "https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/" ) + # trackers + assert normalize_url("http://test.org/?utm_tracker=123") == "http://test.org/" + assert normalize_url("http://test.org/?s_cid=123") == "http://test.org/" + assert normalize_url("http://test.org/?aftr_source=0") == "http://test.org/" + assert normalize_url("http://test.org/?fb_ref=0") == "http://test.org/" + def test_qelems(): assert ( normalize_url("http://test.net/foo.html?utm_source=twitter") - == "http://test.net/foo.html?utm_source=twitter" + == "http://test.net/foo.html" ) assert ( - normalize_url("http://test.net/foo.html?utm_source=twitter", strict=True) + normalize_url("http://test.net/foo.html?testid=1") + == "http://test.net/foo.html?testid=1" + ) + assert ( + normalize_url("http://test.net/foo.html?testid=1", strict=True) == "http://test.net/foo.html" ) assert ( - normalize_url("http://test.net/foo.html?utm_source=twitter&post=abc&page=2") - == "http://test.net/foo.html?page=2&post=abc&utm_source=twitter" + normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2") + == "http://test.net/foo.html?page=2&post=abc&testid=1" ) assert ( - normalize_url( - "http://test.net/foo.html?utm_source=twitter&post=abc&page=2", strict=True - ) + normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True) == "http://test.net/foo.html?page=2&post=abc" ) assert (