From e54443b158f15d2584750571bb27d5e3fd5ef178 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Tue, 3 Dec 2024 15:44:50 -0500 Subject: [PATCH] chore(crawler): add full analytics ignore --- Cargo.lock | 12 ++++++------ spider/Cargo.toml | 2 +- spider_chrome/Cargo.toml | 2 +- spider_chrome/src/handler/network.rs | 7 ++++++- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 8 files changed, 18 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 35ae0d929..8b666e027 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4297,7 +4297,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.13.100" +version = "2.14.0" dependencies = [ "ahash", "aho-corasick", @@ -4359,7 +4359,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.13.100" +version = "2.14.0" dependencies = [ "adblock", "async-tungstenite", @@ -4394,7 +4394,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.13.100" +version = "2.14.0" dependencies = [ "clap", "env_logger", @@ -4419,7 +4419,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.13.100" +version = "2.14.0" dependencies = [ "aho-corasick", "fast_html2md", @@ -4441,7 +4441,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.13.100" +version = "2.14.0" dependencies = [ "indexmap 1.9.3", "serde", @@ -4453,7 +4453,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.13.100" +version = "2.14.0" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 94017d842..657e97fad 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.13.100" +version = "2.14.0" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 8cc6ebe3a..a4faf10c9 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.13.100" +version = "2.14.0" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index 5fe47f184..11a432d35 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -31,6 +31,7 @@ lazy_static::lazy_static! { "d3.js", "app.js", "main.js", + "index.js", // Verified 3rd parties for request "https://m.stripe.network/inner.html", "https://m.stripe.network/out-4.5.43.js", @@ -83,12 +84,16 @@ lazy_static::lazy_static! { static ref URL_IGNORE_TRIE: Trie = { let mut trie = Trie::new(); let patterns = [ + "https://pagead2.googlesyndication.com", + "https://googleads.g.doubleclick.net", "https://www.google-analytics.com", "https://www.googletagmanager.com", "https://px.ads.linkedin.com", "https://connect.facebook.net", - "https://analytics.twitter.com", "https://ads.twitter.com", + "https://cdn.segment.com", + "https://analytics.", + "http://analytics.", "sc.omtrdc.net", "doubleclick.net", "hotjar.com", diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 5895dd85a..52e09cce8 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.13.100" +version = "2.14.0" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index ded825c24..101d81ada 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.13.100" +version = "2.14.0" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 03dcc40cd..36335e114 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.13.100" +version = "2.14.0" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 19de97abf..f942a7d36 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.13.100" +version = "2.14.0" authors = [ "j-mendez " ]