Skip to content

Commit

Permalink
strip common tracking parameters by default
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Nov 23, 2023
1 parent b828bd0 commit 651c3ac
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 12 deletions.
20 changes: 15 additions & 5 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@
TRAILING_AMP = re.compile(r"/\&$")
TRAILING_PARTS = re.compile(r'(.*?)[<>"\'\s]')

# https://github.com/AdguardTeam/AdguardFilters/blob/master/TrackParamFilter/sections/general_url.txt
# https://gitlab.com/ClearURLs/rules/-/blob/master/data.min.json
# https://firefox.settings.services.mozilla.com/v1/buckets/main/collections/query-stripping/records
# Pattern applied with .search() to a lowercased query-parameter name in order
# to decide whether the parameter is a tracking parameter.
# The first three alternatives describe whole-name prefixes and are therefore
# anchored at the start of the name: left unanchored, ordinary parameters such
# as "html_version" (contains "ml_") or "downloads_page" (contains "ads_")
# would be mistaken for trackers.
# The last alternative is intentionally unanchored: it looks for a known
# tracking keyword at a word boundary or right after an underscore
# (e.g. "s_cid", "fb_ref", "aftr_source").
TRACKERS_RE = re.compile(
    # click identifiers ending in "lid": dclid, fbclid, gclid, twclid, ...
    r"^(?:dc|fbc|gc|twc|yc|ysc)lid|"
    # click/partner identifiers ending in "id": clickid, msclkid, igshid, ...
    r"^(?:click|gbra|msclk|igsh|partner|wbra)id|"
    # vendor prefixes followed by an underscore: utm_*, mc_*, pk_*, mtm_*, ...
    r"^(?:ads?|mc|ga|gs|itm|mkt|ml|mtm|oly|pk|utm|vero)_|"
    # generic tracking keywords at the start of the name or after "_"
    r"(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referer|session|source|uid|xtor)"
)


def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
"Helper function: chained scrubbing and normalization"
Expand Down Expand Up @@ -108,11 +118,11 @@ def clean_query(
for qelem in sorted(qdict):
teststr = qelem.lower()
# control param
if (
strict
and teststr not in ALLOWED_PARAMS
and teststr not in CONTROL_PARAMS
):
if strict:
if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
continue
# get rid of trackers
elif TRACKERS_RE.search(teststr):
continue
# control language
if language is not None and teststr in CONTROL_PARAMS:
Expand Down
24 changes: 17 additions & 7 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,10 +493,12 @@ def test_normalization():
assert (
normalize_url("https://hanxiao.io//404.html") == "https://hanxiao.io/404.html"
)

# punycode
assert normalize_url("http://xn--Mnchen-3ya.de") == "http://münchen.de"
assert normalize_url("http://Mnchen-3ya.de") == "http://mnchen-3ya.de"
assert normalize_url("http://xn--München.de") == "http://xn--münchen.de"

# account for particular characters
assert (
normalize_url(
Expand All @@ -509,24 +511,32 @@ def test_normalization():
== "https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/"
)

# trackers
assert normalize_url("http://test.org/?utm_tracker=123") == "http://test.org/"
assert normalize_url("http://test.org/?s_cid=123") == "http://test.org/"
assert normalize_url("http://test.org/?aftr_source=0") == "http://test.org/"
assert normalize_url("http://test.org/?fb_ref=0") == "http://test.org/"


def test_qelems():
assert (
normalize_url("http://test.net/foo.html?utm_source=twitter")
== "http://test.net/foo.html?utm_source=twitter"
== "http://test.net/foo.html"
)
assert (
normalize_url("http://test.net/foo.html?utm_source=twitter", strict=True)
normalize_url("http://test.net/foo.html?testid=1")
== "http://test.net/foo.html?testid=1"
)
assert (
normalize_url("http://test.net/foo.html?testid=1", strict=True)
== "http://test.net/foo.html"
)
assert (
normalize_url("http://test.net/foo.html?utm_source=twitter&post=abc&page=2")
== "http://test.net/foo.html?page=2&post=abc&utm_source=twitter"
normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2")
== "http://test.net/foo.html?page=2&post=abc&testid=1"
)
assert (
normalize_url(
"http://test.net/foo.html?utm_source=twitter&post=abc&page=2", strict=True
)
normalize_url("http://test.net/foo.html?testid=1&post=abc&page=2", strict=True)
== "http://test.net/foo.html?page=2&post=abc"
)
assert (
Expand Down

0 comments on commit 651c3ac

Please sign in to comment.