From 2ac4f0935a7a323a8be3f4ba975caa1e8664fd38 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Mon, 23 Sep 2024 12:56:34 -0400 Subject: [PATCH] chore(lib): add style and script ignore --- .github/workflows/rust.yml | 29 + .github/workflows/stale.yml | 21 + Cargo.lock | 2 +- Cargo.toml | 2 +- src/anchors.rs | 37 +- src/lib.rs | 20 +- test-samples/real-world-1.html | 209 +++++ test-samples/real-world-ja-1.html | 1217 +++++++++++++++++++++++++++++ tests/integration.rs | 25 + 9 files changed, 1546 insertions(+), 16 deletions(-) create mode 100644 .github/workflows/rust.yml create mode 100644 .github/workflows/stale.yml create mode 100644 test-samples/real-world-1.html create mode 100644 test-samples/real-world-ja-1.html diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..316d68a --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,29 @@ +name: Rust +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/cache@v3 + id: cache + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + - name: Build + run: cargo build --verbose --release + - name: Run tests + run: cargo test diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000..4903e81 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,21 @@ +name: Mark stale issues and pull requests + +on: + schedule: + - cron: "30 1 * * *" + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days." + stale-pr-message: "This PR is stale because it has been open 45 days with no activity. Remove stale label or comment or this will be closed in 10 days." + close-issue-message: "This issue was closed because it has been stalled for 5 days with no activity." + close-pr-message: "This PR was closed because it has been stalled for 10 days with no activity." + days-before-issue-stale: 30 + days-before-pr-stale: 45 + days-before-issue-close: 5 + days-before-pr-close: 10 diff --git a/Cargo.lock b/Cargo.lock index f4c3ff0..d2727bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "fast_html2md" -version = "0.0.11" +version = "0.0.12" dependencies = [ "auto_encoder", "html5ever", diff --git a/Cargo.toml b/Cargo.toml index 3cd401b..93be3af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fast_html2md" -version = "0.0.11" +version = "0.0.12" edition = "2021" description = "A fast html2md crate for rust" categories = ["development-tools", "parsing", "parser-implementations"] diff --git a/src/anchors.rs b/src/anchors.rs index 821e17d..7a46e20 100644 --- a/src/anchors.rs +++ b/src/anchors.rs @@ -12,8 +12,6 @@ pub struct AnchorHandler { impl TagHandler for AnchorHandler { fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) { self.start_pos = printer.data.len(); - - // try to extract a hyperlink self.url = match tag.data { NodeData::Element { ref attrs, .. 
} => { let attrs = attrs.borrow(); @@ -22,7 +20,7 @@ impl TagHandler for AnchorHandler { .find(|attr| attr.name.local.as_bytes() == b"href"); match href { - Some(link) => link.value.trim().into(), + Some(link) => link.value.trim_ascii().into(), None => String::new(), } } @@ -31,8 +29,35 @@ impl TagHandler for AnchorHandler { } fn after_handle(&mut self, printer: &mut StructuredPrinter) { - // add braces around already present text, put an url afterwards - printer.insert_str(self.start_pos, "["); - printer.append_str(&format!("]({})", self.url)) + match printer.data.get(self.start_pos..) { + Some(d) => { + let starts_new_line = d.starts_with("\n"); + let ends_new_line = d.ends_with("\n"); + + if starts_new_line || ends_new_line { + // handle start + if starts_new_line { + printer.insert_str(self.start_pos + 1, "["); + } else { + printer.insert_str(self.start_pos, "["); + } + + // handle end + if ends_new_line { + let next_position = printer.data.len(); + printer.insert_str(next_position - 1, &format!("]({})", self.url)); + } else { + printer.append_str(&format!("]({})", self.url)); + } + } else { + printer.insert_str(self.start_pos, "["); + printer.append_str(&format!("]({})", self.url)); + } + } + _ => { + printer.insert_str(self.start_pos, "["); + printer.append_str(&format!("]({})", self.url)); + } + } } } diff --git a/src/lib.rs b/src/lib.rs index 6b88932..68dbeb9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,6 +77,7 @@ pub fn parse_html_custom( Ok(dom) => { let mut result = StructuredPrinter::default(); walk(&dom.document, &mut result, custom, commonmark); + // we want to eventually remove the clean step. clean_markdown(&result.data) } _ => Default::default(), @@ -128,10 +129,12 @@ fn walk( NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {} NodeData::Text { ref contents } => { let mut text = contents.borrow().to_string(); - let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre"); + + let inside_pre = result.parent_chain.iter().any(|t| t == "pre"); if inside_pre { // this is preformatted text, insert as-is result.append_str(&text); + } else if !(text.trim().len() == 0 && (result.data.chars().last() == Some('\n') || result.data.chars().last() == Some(' '))) @@ -139,19 +142,18 @@ fn walk( // in case it's not just a whitespace after the newline or another whitespace // regular text, collapse whitespace and newlines in text - let inside_code = result.parent_chain.iter().any(|tag| tag == "code"); + let inside_code = result.parent_chain.iter().any(|t| t == "code"); if !inside_code { text = escape_markdown(result, &text); } let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " "); - let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r'); - result.append_str(&minified_text); + result.append_str(&minified_text.trim_ascii()); } } NodeData::Comment { .. } => {} // ignore comments NodeData::Element { ref name, .. } => { - tag_name = name.local.to_string(); let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre"); + tag_name = name.local.to_string(); if inside_pre { // don't add any html tags inside the pre section @@ -200,22 +202,24 @@ fn walk( } } + let ignore_tags = tag_name == "style" || tag_name == "script"; + // handle this tag, while it's not in parent chain // and doesn't have child siblings handler.handle(&input, result); // save this tag name as parent for child nodes - result.parent_chain.push(tag_name.to_string()); // e.g. 
it was ["body"] and now it's ["body", "p"] + result.parent_chain.push(tag_name.clone()); // e.g. it was ["body"] and now it's ["body", "p"] let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2 // create space for siblings of next level result.siblings.insert(current_depth, vec![]); for child in input.children.borrow().iter() { - if handler.skip_descendants() { + if handler.skip_descendants() || ignore_tags { continue; } - + walk(&child, result, custom, commonmark); match child.data { diff --git a/test-samples/real-world-1.html b/test-samples/real-world-1.html new file mode 100644 index 0000000..82bfcd6 --- /dev/null +++ b/test-samples/real-world-1.html @@ -0,0 +1,209 @@ +Spider: The Web Crawler for AI + + + +

+To help you get started with Spider, we’ll give you $200 in credits when you spend $100. + +Terms apply +

The Web Crawler for AI Agents and LLMs

+Spider offers the finest data collection solution. Engineered for speed and scalability, it
+ allows you to elevate your AI projects.
+

Get Started
Example request
import requests, os, json
+
+headers = {
+    'Authorization': os.getenv("SPIDER_API_KEY"),
+    'Content-Type': 'application/jsonl',
+}
+
+json_data = {"limit":50,"metadata":True,"url":"https://spider.cloud"}
+
+response = requests.post('https://api.spider.cloud/crawl', 
+  headers=headers, json=json_data, stream=True)
+
+with response as r:
+    r.raise_for_status()
+    
+    for chunk in r.iter_lines(
+        chunk_size=None, 
+        decode_unicode=True
+    ):
+        data = json.loads(chunk)
+        print(data)
Free Trial

+Built with the need for Speed

+Experience the power of Spider, built fully in Rust for + next-generation scalability. +

+2.4secs

To crawl over 20,000 pages

+500-1000x

Faster than alternatives

+500x

Cheaper than traditional scraping services

Benchmarks displaying performance between Spider API request modes.
+Spider API Request Modes · Benchmarked tailwindcss.com ·

Seamless Integrations

+Seamlessly integrate Spider with a wide range of platforms, ensuring data curation + perfectly aligned with your requirements. Compatible with all major AI tools. +

Concurrent Streaming

+Save time and money without having to worry about bandwidth concerns by effectively + streaming all the results concurrently. The latency cost that is saved becomes drastic as + you crawl more websites. +

Warp Speed

+Powered by the cutting-edge Spider open-source project, our robust Rust engine scales effortlessly to handle extreme + workloads. We ensure continuous maintenance and improvement for top-tier performance. +

+Kickstart Your Data Collecting Projects Today +

+Jumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping. +

+Performance Tuned +

+Spider is written in Rust and runs in full concurrency to achieve crawling thousands of + pages in secs. +

+Multiple response formats +

+Get clean and formatted markdown, HTML, or text content for fine-tuning or training AI + models. +

+Caching +

+Further boost speed by caching repeated web page crawls to minimize expenses while + building. +

+Smart Mode +

+Spider dynamically switches to Headless Chrome when it needs to be quick.
+

Beta

+Scrape with AI +

+Do custom browser scripting and data extraction using the latest AI models with no-cost
+ step caching.
+

+The crawler for LLMs +

+Don't let crawling and scraping be the highest latency in your LLM & AI agent stack. +

+Scrape with no headaches

  • Auto Proxy rotations
  • Agent headers
  • Anti-bot detections
  • Headless chrome
  • Markdown responses

+The Fastest Web Crawler +

  • +Powered by spider-rs
  • 20,000 pages/second
  • Unlimited concurrency
  • Simple API
  • 50,000 RPM

+Do more with AI

  • Browser scripting
  • Advanced extraction
  • Data pipelines
  • Ideal for LLMs and AI Agents
  • Accurate labeling

+Achieve more with these new API features +

+Our API is set to stream so you can act in realtime. +

A user interface with a search bar containing the text "Latest sports news," a green "Submit" button, and two icon buttons to display searching and extracting with the service.

Search

+Get access to search engine results from anywhere and easily crawl and transform pages to + LLM-ready markdown. +

A user interface segment showing three icons representing different stages of data transformation.

Transform

+Convert raw HTML into markdown easily by using this API. Transform thousands of html pages + in seconds. +

+Join the community +

+Backed by a network of early advocates, contributors, and supporters. +

+FAQ +

Frequently asked questions about Spider.

What is Spider?

Spider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.

Why is my website not crawling?

Your crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome' to solve this issue.

Can you crawl all pages?

Yes, Spider accurately crawls all necessary content without needing a sitemap.

What formats can Spider convert web data into?

Spider outputs HTML, raw, text, and various markdown formats. It supports JSON, JSONL, CSV, and XML for API responses.

Is Spider suitable for large scraping projects?

Absolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.

How can I try Spider?

Purchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.

Does it respect robots.txt?

Yes, compliance with robots.txt is default, but you can disable this if necessary.

Unable to get dynamic content?

If you are having trouble getting dynamic pages, try setting the request parameter to "chrome" or "smart." You may also need to set `disable_intercept` to allow third-party or external scripts to run.

Why is my crawl going slow?

If you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.

Do you offer a Free Trial?

Yes, you can try the service for free at checkout before being charged.

+Comprehensive Data Curation for Everyone +

+Trusted by leading tech businesses worldwide to deliver accurate and insightful data solutions. +

Outer Labs
Elementus Logo
Super AI Logo
LayerX Logo
Swiss Re
Write Sonic Logo
Alioth Logo

Next generation data for AI, scale to millions +

\ No newline at end of file diff --git a/test-samples/real-world-ja-1.html b/test-samples/real-world-ja-1.html new file mode 100644 index 0000000..321acc8 --- /dev/null +++ b/test-samples/real-world-ja-1.html @@ -0,0 +1,1217 @@ + + + + 通販ラーメン・つけ麺 総合ランキング | 宅麺.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+ +
+ + + +
+
+
+ + + +
+
+ +
+ + + + +
+
+ +
+
+
+
+
+
+
+
+
+
+ +
+ +
+
+
+ + + +
+
+ +
+ + + + +
+
+ +
+
+
+
+
+ + +
+ + +
+
+ +
+
+
+ + + + +
+ + +

+ 総合ランキング + + ※2024年8月販売実績 + +

+
+
+
+
+ カテゴリー +
+
+ +
+
+
+
+ +
+
+ +
+
+
+
+ その他 +
+
+ +
+
+
+
+
+ + +
+
+
+ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/integration.rs b/tests/integration.rs index 1c6ebd6..fa6140d 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -19,6 +19,31 @@ fn test_marcfs() { println!("{}", result); } +#[test] +#[ignore] +fn test_real_world() { + let mut html = String::new(); + let mut html_file = File::open("test-samples/real-world-1.html").unwrap(); + html_file + .read_to_string(&mut html) + .expect("File must be readable"); + let result = parse_html(&html, false); + println!("{}", result); +} + +#[test] +#[ignore] +fn test_real_world_ja() { + let mut html = String::new(); + let mut html_file = File::open("test-samples/real-world-ja-1.html").unwrap(); + html_file + .read_to_string(&mut html) + .expect("File must be readable"); + let result = parse_html(&html, false); + println!("{}", result); +} + + #[test] #[ignore] fn test_cheatsheet() {
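
A minimal usage sketch (not part of the patch): with the style/script ignore
added in walk() and the trim_ascii() change in AnchorHandler, a conversion
like the one below should drop CSS/JS text and trim padded hrefs. The
`html2md` import name and the sample HTML are assumptions; parse_html's
(&str, bool) signature matches its use in tests/integration.rs above.

    use html2md::parse_html;

    fn main() {
        let html = r#"<html>
          <head>
            <style>p { color: red }</style>
            <script>console.log("boot");</script>
          </head>
          <body><p>Hello <a href=" https://spider.cloud ">Spider</a></p></body>
        </html>"#;

        // style/script subtrees are skipped during the walk, so neither
        // the CSS rule nor the script body should appear in the output.
        let md = parse_html(html, false);
        assert!(!md.contains("color: red"));
        assert!(!md.contains("console.log"));

        // hrefs are trimmed with trim_ascii(), so the padded link should
        // render roughly as: Hello [Spider](https://spider.cloud)
        println!("{}", md);
    }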