From 1785653b1e7d5e8ebe87c85fa873e3d6c081f708 Mon Sep 17 00:00:00 2001 From: Uku Taht Date: Tue, 3 Jan 2023 15:35:23 +0200 Subject: [PATCH] Ignore unknown countries (#2556) * Ignore XX and T1 countries * Add fallback if country_code=nil * Lookup city overrides directly in CityOverrides module * Changelog * Add empty moduledoc * Remove redundant comment --- CHANGELOG.md | 1 + config/test.exs | 4 +- lib/plausible/ingestion/city_overrides.ex | 2 +- lib/plausible/ingestion/event.ex | 39 ++------------- lib/plausible/ingestion/geolocation.ex | 47 +++++++++++++++++++ .../api/external_controller_test.exs | 41 ++++++++++++++++ 6 files changed, 96 insertions(+), 38 deletions(-) create mode 100644 lib/plausible/ingestion/geolocation.ex diff --git a/CHANGELOG.md b/CHANGELOG.md index f6d7fdd3d02c..3d47785321fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. ### Changed - Reject events with long URIs and data URIs plausible/analytics#2536 - Always show direct traffic in sources reports plausible/analytics#2531 +- Stop recording XX and T1 country codes plausible/analytics#2556 ## Fixed - Cascade delete sent_renewal_notifications table when user is deleted plausible/analytics#2549 diff --git a/config/test.exs b/config/test.exs index 475cd355a308..df2aba638226 100644 --- a/config/test.exs +++ b/config/test.exs @@ -56,7 +56,9 @@ config :geolix, {1, 1, 1, 1} => %{country: %{iso_code: "US"}}, {2, 2, 2, 2} => geolix_sample_lookup, {1, 1, 1, 1, 1, 1, 1, 1} => %{country: %{iso_code: "US"}}, - {0, 0, 0, 0} => %{country: %{iso_code: "ZZ"}} + {0, 0, 0, 0} => %{country: %{iso_code: "ZZ"}, city: %{geoname_id: 123_123}}, + {0, 0, 0, 1} => %{country: %{iso_code: "XX"}, subdivisions: [%{iso_code: "IDF"}]}, + {0, 0, 0, 2} => %{country: %{iso_code: "T1"}, subdivisions: [%{}, %{iso_code: "IDF"}]} } } ] diff --git a/lib/plausible/ingestion/city_overrides.ex b/lib/plausible/ingestion/city_overrides.ex index 61f046c94a3b..a00eb24ab9f8 100644 --- a/lib/plausible/ingestion/city_overrides.ex +++ b/lib/plausible/ingestion/city_overrides.ex @@ -198,5 +198,5 @@ defmodule Plausible.Ingestion.CityOverrides do # Hackney -> London 2_647_694 => 2_643_743 } - def get, do: @overrides + def get(key, default), do: Map.get(@overrides, key, default) end diff --git a/lib/plausible/ingestion/event.ex b/lib/plausible/ingestion/event.ex index f4c4cfb92dab..84d79ed9c9b9 100644 --- a/lib/plausible/ingestion/event.ex +++ b/lib/plausible/ingestion/event.ex @@ -5,7 +5,7 @@ defmodule Plausible.Ingestion.Event do are uniformly either buffered in batches (to Clickhouse) or dropped (e.g. due to spam blocklist) from the processing pipeline. """ - alias Plausible.Ingestion.{Request, CityOverrides} + alias Plausible.Ingestion.Request alias Plausible.ClickhouseEvent alias Plausible.Site.GateKeeper @@ -168,39 +168,9 @@ defmodule Plausible.Ingestion.Event do end defp put_geolocation(%__MODULE__{} = event) do - result = Geolix.lookup(event.request.remote_ip, where: :geolocation) + result = Plausible.Ingestion.Geolocation.lookup(event.request.remote_ip) - country_code = - get_in(result, [:country, :iso_code]) - |> ignore_unknown_country() - - city_geoname_id = get_in(result, [:city, :geoname_id]) - city_geoname_id = Map.get(CityOverrides.get(), city_geoname_id, city_geoname_id) - - subdivision1_code = - case result do - %{subdivisions: [%{iso_code: iso_code} | _rest]} -> - country_code <> "-" <> iso_code - - _ -> - "" - end - - subdivision2_code = - case result do - %{subdivisions: [_first, %{iso_code: iso_code} | _rest]} -> - country_code <> "-" <> iso_code - - _ -> - "" - end - - update_attrs(event, %{ - country_code: country_code, - subdivision1_code: subdivision1_code, - subdivision2_code: subdivision2_code, - city_geoname_id: city_geoname_id - }) + update_attrs(event, result) end defp put_screen_size(%__MODULE__{} = event) do @@ -372,9 +342,6 @@ defmodule Plausible.Ingestion.Event do end end - defp ignore_unknown_country("ZZ"), do: nil - defp ignore_unknown_country(country), do: country - defp generate_user_id(request, domain, hostname, salt) do cond do is_nil(salt) -> diff --git a/lib/plausible/ingestion/geolocation.ex b/lib/plausible/ingestion/geolocation.ex new file mode 100644 index 000000000000..f30f30af13e8 --- /dev/null +++ b/lib/plausible/ingestion/geolocation.ex @@ -0,0 +1,47 @@ +defmodule Plausible.Ingestion.Geolocation do + @moduledoc false + alias Plausible.Ingestion.CityOverrides + + def lookup(remote_ip) do + result = Geolix.lookup(remote_ip, where: :geolocation) + + country_code = + get_in(result, [:country, :iso_code]) + |> ignore_unknown_country() + + city_geoname_id = country_code && get_in(result, [:city, :geoname_id]) + city_geoname_id = CityOverrides.get(city_geoname_id, city_geoname_id) + + %{ + country_code: country_code, + subdivision1_code: subdivision1_code(country_code, result), + subdivision2_code: subdivision2_code(country_code, result), + city_geoname_id: city_geoname_id + } + end + + defp subdivision1_code(country_code, %{subdivisions: [%{iso_code: iso_code} | _rest]}) + when not is_nil(country_code) do + country_code <> "-" <> iso_code + end + + defp subdivision1_code(_, _), do: nil + + defp subdivision2_code(country_code, %{subdivisions: [_first, %{iso_code: iso_code} | _rest]}) + when not is_nil(country_code) do + country_code <> "-" <> iso_code + end + + defp subdivision2_code(_, _), do: nil + + @ignored_countries [ + # Worldwide + "ZZ", + # Disputed territory + "XX", + # Tor exit node + "T1" + ] + defp ignore_unknown_country(code) when code in @ignored_countries, do: nil + defp ignore_unknown_country(country), do: country +end diff --git a/test/plausible_web/controllers/api/external_controller_test.exs b/test/plausible_web/controllers/api/external_controller_test.exs index 315b7423a703..061be8b57833 100644 --- a/test/plausible_web/controllers/api/external_controller_test.exs +++ b/test/plausible_web/controllers/api/external_controller_test.exs @@ -685,6 +685,47 @@ defmodule PlausibleWeb.Api.ExternalControllerTest do pageview = get_event(domain) assert pageview.country_code == <<0, 0>> + assert pageview.subdivision1_code == "" + assert pageview.subdivision2_code == "" + assert pageview.city_geoname_id == 0 + end + + test "ignores disputed territory code XX", %{conn: conn, domain: domain} do + params = %{ + name: "pageview", + domain: domain, + url: "http://gigride.live/" + } + + conn + |> put_req_header("x-forwarded-for", "0.0.0.1") + |> post("/api/event", params) + + pageview = get_event(domain) + + assert pageview.country_code == <<0, 0>> + assert pageview.subdivision1_code == "" + assert pageview.subdivision2_code == "" + assert pageview.city_geoname_id == 0 + end + + test "ignores TOR exit node country code T1", %{conn: conn, domain: domain} do + params = %{ + name: "pageview", + domain: domain, + url: "http://gigride.live/" + } + + conn + |> put_req_header("x-forwarded-for", "0.0.0.2") + |> post("/api/event", params) + + pageview = get_event(domain) + + assert pageview.country_code == <<0, 0>> + assert pageview.subdivision1_code == "" + assert pageview.subdivision2_code == "" + assert pageview.city_geoname_id == 0 end test "scrubs port from x-forwarded-for", %{conn: conn, domain: domain} do