Commit
chore(page): fix subdomain entry point handling root (#146)
j-mendez authored Oct 28, 2023
1 parent 1bb357e commit c14cd6c
Showing 9 changed files with 49 additions and 27 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Generated file; diff not rendered by default.

10 changes: 4 additions & 6 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.46.4"
version = "1.46.5"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.46.4"
version = "1.46.5"
path = "../spider"
features = ["serde"]

@@ -61,7 +61,7 @@ path = "callback.rs"
[[example]]
name = "sitemap"
path = "sitemap.rs"
features = ["sitemap"]
required-features = ["spider/sitemap"]

[[example]]
name = "configuration"
@@ -70,6 +70,4 @@ path = "configuration.rs"
[[example]]
name = "budget"
path = "budget.rs"

-[features]
-default = ["spider/budget", "spider/sync"]
+required-features = ["spider/budget", "spider/sync"]
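For context on this manifest change: Cargo's `[[example]]` targets support a `required-features` key that gates building the example on crate features, while a bare `features` key on an example target (removed above) is not a supported manifest key. A minimal sketch of the pattern, with illustrative feature names rather than this repo's exact layout:

```toml
# Hypothetical manifest sketch: the example target is compiled only
# when the named feature is enabled, e.g.
#   cargo run --example sitemap --features sitemap
[features]
sitemap = []

[[example]]
name = "sitemap"
path = "sitemap.rs"
required-features = ["sitemap"]
```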
2 changes: 1 addition & 1 deletion examples/download.rs
@@ -30,7 +30,7 @@ async fn main() {
website.scrape().await;

for page in website.get_pages().unwrap().iter() {
-let download_file = page.get_url().clone();
+let download_file = page.get_url();
let download_file = download_file.replace(website_name, "");
let download_file = download_file.replace(".", "-");
let download_file = download_file.replace("/", "-");
2 changes: 1 addition & 1 deletion examples/download_to_react.rs
@@ -39,7 +39,7 @@ async fn main() {
match website.get_pages() {
Some(pages) => {
for page in pages.iter() {
-let download_file = page.get_url().clone();
+let download_file = page.get_url();
let download_file = download_file.replace(website_name, "");
let download_file = download_file.replace(".", "-");
let download_file = download_file.replace("/", "-");
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.46.4"
version = "1.46.5"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
12 changes: 6 additions & 6 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.toml`

```toml
[dependencies]
spider = "1.46.4"
spider = "1.46.5"
```

And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
spider = { version = "1.46.4", features = ["regex", "ua_generator"] }
spider = { version = "1.46.5", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -116,7 +116,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.46.4", features = ["decentralized"] }
spider = { version = "1.46.5", features = ["decentralized"] }
```

```sh
@@ -136,7 +136,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.46.4", features = ["sync"] }
spider = { version = "1.46.5", features = ["sync"] }
```

```rust,no_run
@@ -166,7 +166,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.46.4", features = ["regex"] }
spider = { version = "1.46.5", features = ["regex"] }
```

```rust,no_run
@@ -193,7 +193,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.46.4", features = ["control"] }
spider = { version = "1.46.5", features = ["control"] }
```

```rust
32 changes: 28 additions & 4 deletions spider/src/page.rs
@@ -312,7 +312,13 @@ impl Page {
let mut abs = self.abs_path(href);
let host_name = abs.host_str();
let mut can_process = match host_name {
-Some(host) => parent_host.ends_with(host),
+Some(host) => {
+    if base_domain.is_empty() {
+        parent_host.eq(&host)
+    } else {
+        parent_host.ends_with(host)
+    }
+}
_ => false,
};
if !can_process
@@ -409,7 +415,13 @@ impl Page {
let mut abs = self.abs_path(href.inner());
let host_name = abs.host_str();
let mut can_process = match host_name {
-Some(host) => parent_host.ends_with(host),
+Some(host) => {
+    if base_domain.is_empty() {
+        parent_host.eq(&host)
+    } else {
+        parent_host.ends_with(host)
+    }
+}
_ => false,
};
if !can_process && host_name.is_some() && !self.external_domains_caseless.is_empty()
@@ -494,7 +506,13 @@ impl Page {

// determine if the crawl can continue based on host match
let mut can_process = match abs.host_str() {
-Some(host) => parent_host.ends_with(host),
+Some(host) => {
+    if base_domain.is_empty() {
+        parent_host.eq(&host)
+    } else {
+        parent_host.ends_with(host)
+    }
+}
_ => false,
};

@@ -567,7 +585,13 @@ impl Page {
let mut abs = self.abs_path(href);

let can_process = match abs.host_str() {
-Some(host) => parent_host.ends_with(host),
+Some(host) => {
+    if base_domain.is_empty() {
+        parent_host.eq(&host)
+    } else {
+        parent_host.ends_with(host)
+    }
+}
_ => false,
};
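All four hunks above install the same guard: when `base_domain` is empty, the candidate link's host must now equal the parent host exactly, where before a suffix match via `ends_with` sufficed and would also accept an unrelated domain that merely shares a suffix (e.g. `ample.com` against a parent of `example.com`). A minimal standalone sketch of the new rule, assuming `base_domain` is empty exactly when the crawl was entered from a bare root host; the function and variable names are illustrative, not the crate's API:

```rust
// Illustrative reconstruction of the host-matching rule changed in
// this commit; names are hypothetical, only the logic mirrors the diff.
fn can_process(parent_host: &str, link_host: &str, base_domain: &str) -> bool {
    if base_domain.is_empty() {
        // bare entry host: require an exact host match
        parent_host == link_host
    } else {
        // subdomain crawling enabled: keep the suffix match, so a
        // parent of "blog.example.com" still accepts "example.com"
        parent_host.ends_with(link_host)
    }
}

fn main() {
    // previously accepted via ends_with alone, now rejected
    assert!(!can_process("example.com", "ample.com", ""));
    assert!(can_process("example.com", "example.com", ""));
    // suffix matching is preserved once a base domain is set
    assert!(can_process("blog.example.com", "example.com", "example.com"));
}
```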

4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.46.4"
version = "1.46.5"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.46.4"
version = "1.46.5"
path = "../spider"

[[bin]]
4 changes: 2 additions & 2 deletions spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "1.46.4"
version = "1.46.5"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler as a worker or proxy."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ lazy_static = "1.4.0"
env_logger = "0.10.0"

[dependencies.spider]
version = "1.46.4"
version = "1.46.5"
path = "../spider"
features = ["serde", "flexbuffers"]

