From 651c3ac248ea281cf2221e1eeb60e0951138e67d Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Thu, 23 Nov 2023 16:49:12 +0100
Subject: [PATCH] strip common tracking parameters by default

---
 courlan/clean.py    | 20 +++++++++++++++-----
 tests/unit_tests.py | 24 +++++++++++++++++-------
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index cabf5bc..3bb799b 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -36,6 +36,16 @@
 TRAILING_AMP = re.compile(r"/\&$")
 TRAILING_PARTS = re.compile(r'(.*?)[<>"\'\s]')
 
+# https://github.com/AdguardTeam/AdguardFilters/blob/master/TrackParamFilter/sections/general_url.txt
+# https://gitlab.com/ClearURLs/rules/-/blob/master/data.min.json
+# https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
+TRACKERS_RE = re.compile(  # searched against the lower-cased parameter name
+    r"^(?:dc|fbc|gc|twc|yc|ysc)lid|"  # click IDs such as fbclid, gclid
+    r"^(?:click|gbra|msclk|igsh|partner|wbra)id|"  # anchored: must start the name
+    r"^(?:ads?|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|"  # anchored so "html_x" or "tags_x" are kept
+    r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referer|session|source|uid|xtor)"  # at name start or after "_"
+)
+
 
 def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
     "Helper function: chained scrubbing and normalization"
@@ -108,11 +118,11 @@ def clean_query(
         for qelem in sorted(qdict):
             teststr = qelem.lower()
             # control param
-            if (
-                strict
-                and teststr not in ALLOWED_PARAMS
-                and teststr not in CONTROL_PARAMS
-            ):
+            if strict:
+                if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
+                    continue
+            # get rid of trackers
+            elif TRACKERS_RE.search(teststr):
                 continue
             # control language
             if language is not None and teststr in CONTROL_PARAMS:
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 2140d38..777e019 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -493,10 +493,12 @@ def test_normalization():
     assert (
         normalize_url("https://hanxiao.io//404.html") == "https://hanxiao.io/404.html"
     )
+
     # punycode
     assert normalize_url("http://xn--Mnchen-3ya.de") == "http://münchen.de"
     assert normalize_url("http://Mnchen-3ya.de") == "http://mnchen-3ya.de"
     assert normalize_url("http://xn--München.de") == "http://xn--münchen.de"
+
     # account for particular characters
     assert (
         normalize_url(
@@ -509,24 +511,32 @@ def test_normalization():
         == "https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/"
     )
 
+    # trackers
+    assert normalize_url("http://test.org/?utm_tracker=123") == "http://test.org/"
+    assert normalize_url("http://test.org/?s_cid=123") == "http://test.org/"
+    assert normalize_url("http://test.org/?aftr_source=0") == "http://test.org/"
+    assert normalize_url("http://test.org/?fb_ref=0") == "http://test.org/"
+
 
 def test_qelems():
     assert (
         normalize_url("http://test.net/foo.html?utm_source=twitter")
-        == "http://test.net/foo.html?utm_source=twitter"
+        == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter", strict=True)
+        normalize_url("http://test.net/foo.html?testid=1")
+        == "http://test.net/foo.html?testid=1"
+    )
+    assert (
+        normalize_url("http://test.net/foo.html?testid=1", strict=True)
         == "http://test.net/foo.html"
     )
     assert (
-        normalize_url("http://test.net/foo.html?utm_source=twitter&post=abc&page=2")
-        == "http://test.net/foo.html?page=2&post=abc&utm_source=twitter"
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2")
+        == "http://test.net/foo.html?page=2&post=abc&testid=1"
     )
     assert (
-        normalize_url(
-            "http://test.net/foo.html?utm_source=twitter&post=abc&page=2", strict=True
-        )
+        normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True)
         == "http://test.net/foo.html?page=2&post=abc"
     )
     assert (