perf(sift): add one pass convert
j-mendez committed Nov 15, 2024
1 parent 81ef4ed commit 316e77a
Showing 7 changed files with 1,638 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion fast_html2md/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.35"
version = "0.0.38"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
36 changes: 36 additions & 0 deletions fast_html2md/src/extended/sifter.rs
@@ -46,7 +46,43 @@ pub trait WhitespaceSifter: AsRef<str> {
}
}

/// A trait containing all `Vec<u8>` whitespace-sifting functions.
pub trait WhitespaceSifterBytes: AsRef<[u8]> {
    /// This removes duplicate whitespaces from a `Vec<u8>`.
    /// It supports the same whitespace definition as [char::is_ascii_whitespace].
    #[must_use]
    fn sift_bytes(&self) -> String {
        let input = self.as_ref();
        let mut out: String = String::with_capacity(input.len());
        sift_preallocated(input, &mut out);
        out
    }

    /// This removes duplicate whitespaces from a `Vec<u8>`.
    /// It preserves deduplicated newlines.
    #[must_use]
    fn sift_bytes_preserve_newlines(&self) -> String {
        let bytes = self.as_ref();
        let mut out = String::with_capacity(bytes.len());
        let mut ind: usize = 0;

        while ind < bytes.len() {
            sift_preallocated_until_newline(bytes, &mut ind, &mut out);
        }

        if out.ends_with("\r\n") {
            let _ = out.pop();
            let _ = out.pop();
        } else if out.ends_with('\n') {
            let _ = out.pop();
        }

        out
    }
}

impl<T: AsRef<str>> WhitespaceSifter for T {}
impl<T: AsRef<[u8]>> WhitespaceSifterBytes for T {}

/// A custom implementation of `str::trim_start`.
fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
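For context, a minimal usage sketch of the new byte-sifting trait, assuming `extended::sifter` is publicly exported (as the `use` added to `lib.rs` below suggests); the inputs are illustrative only:

```rust
use fast_html2md::extended::sifter::WhitespaceSifterBytes;

fn main() {
    // Runs of ASCII whitespace collapse while the bytes are sifted once.
    let noisy: Vec<u8> = b"hello \t   world".to_vec();
    println!("{}", noisy.sift_bytes());

    // Newlines are kept but deduplicated, so line breaks survive.
    let multiline: &[u8] = b"line one\n\n\nline two";
    println!("{}", multiline.sift_bytes_preserve_newlines());
}
```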
10 changes: 9 additions & 1 deletion fast_html2md/src/lib.rs
@@ -1,3 +1,4 @@
use extended::sifter::WhitespaceSifterBytes;
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
@@ -404,7 +405,14 @@ fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
pub fn clean_markdown(input: &str) -> String {
    input.sift().into()
    input.sift()
}

/// Called after all processing has been finished
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
    input.sift_bytes()
}

/// Intermediate result of HTML -> Markdown conversion.
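A quick sketch of how the new byte-based cleaner sits next to the existing string one (crate path `fast_html2md` assumed; inputs illustrative):

```rust
use fast_html2md::{clean_markdown, clean_markdown_bytes};

fn main() {
    // Existing entry point: sift a &str.
    let s = "hello   world\n\n\n";
    println!("{}", clean_markdown(s));

    // New entry point: sift bytes directly, e.g. a rewriter's raw output,
    // without first materializing an intermediate String.
    let bytes: Vec<u8> = b"hello   world\n\n\n".to_vec();
    println!("{}", clean_markdown_bytes(&bytes));
}
```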
29 changes: 25 additions & 4 deletions fast_html2md/src/rewriter/writer.rs
@@ -4,11 +4,11 @@ use super::images::rewrite_image_element;
use super::lists::handle_list_or_item;
use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_text};
use super::styles::rewrite_style_element;
use crate::{clean_markdown, escape_markdown_base};
use crate::clean_markdown_bytes;
use lol_html::html_content::ContentType::{Html, Text};
use lol_html::html_content::Element;
use lol_html::{doc_comments, text};
use lol_html::{element, rewrite_str, RewriteStrSettings};
use lol_html::{doc_comments, doctype, text};
use lol_html::{element, RewriteStrSettings};
use std::cell::RefCell;
use std::rc::Rc;
use url::Url;
@@ -254,6 +254,10 @@ pub fn get_rewriter_settings(
        document_content_handlers: vec![doc_comments!(|c| {
            c.remove();
            Ok(())
        }),
        doctype!(|c| {
            c.remove();
            Ok(())
        })],
        element_content_handlers,
        ..RewriteStrSettings::default()
@@ -270,7 +274,24 @@ pub(crate) fn convert_html_to_markdown(
    let settings = get_rewriter_settings(commonmark, custom, url.clone());

    match rewrite_str(&Box::new(html), settings) {
        Ok(markdown) => Ok(clean_markdown(&markdown)),
        Ok(markdown) => Ok(clean_markdown_bytes(&markdown)),
        Err(e) => Err(e.into()),
    }
}

/// Shortcut to rewrite string and encode correctly
pub fn rewrite_str<'h, 's, H: lol_html::HandlerTypes>(
    html: &str,
    settings: impl Into<lol_html::Settings<'h, 's, H>>,
) -> Result<Vec<u8>, lol_html::errors::RewritingError> {
    let mut output = vec![];

    let mut rewriter = lol_html::HtmlRewriter::new(settings.into(), |c: &[u8]| {
        output.extend_from_slice(c);
    });

    rewriter.write(html.as_bytes())?;
    rewriter.end()?;

    Ok(output)
}
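Taken together, this is the "one pass convert": the local `rewrite_str` collects lol_html's output chunks into a `Vec<u8>`, and `clean_markdown_bytes` sifts that buffer straight into the final Markdown string. A hedged end-to-end sketch through the crate's public entry point (crate path assumed; `rewrite_html(html, commonmark)` is how the integration test below invokes the conversion):

```rust
use fast_html2md::rewrite_html;

fn main() {
    let html = "<!doctype html><html><body><h1>Title</h1><p>Some   spaced   text.</p></body></html>";
    // Rewrites to bytes internally, then sifts the buffer once into Markdown.
    let markdown = rewrite_html(html, false);
    println!("{markdown}");
}
```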
12 changes: 12 additions & 0 deletions fast_html2md/tests/integration.rs
@@ -74,6 +74,18 @@ fn test_real_world_ja() {
    assert!(!result.is_empty());
}

#[test]
#[ignore]
fn test_real_spider() {
    let mut html = String::new();
    let mut html_file: File = File::open("../test-samples/spider-cloud.html").unwrap();
    html_file
        .read_to_string(&mut html)
        .expect("File must be readable");
    let result = rewrite_html(&html, false);
assert!(result == r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)\n# The Web Crawler for AI Agents and LLMs\nSpider offers the finest data collecting solution. Engineered for speed and scalability, it\nallows you to elevate your AI projects.\n[Get Started](https://spider.cloud/credits/new)View Preview\n* Basic\n* Streaming\nExample request\nPython\nJSONL\nCopy\n```\n`import requests, os, json\nheaders = {\n&#x27;&#x27;Authorization &#x27;&#x27;: f &#x27;&#x27;Bearer {os.getenv(&quot;&quot;SPIDER\\_API\\_KEY &quot;&quot;)}&#x27;&#x27;,\n&#x27;&#x27;Content-Type &#x27;&#x27;: &#x27;&#x27;application/jsonl &#x27;&#x27;,\n}\njson\\_data = {&quot;&quot;limit &quot;&quot;:50,&quot;&quot;metadata &quot;&quot;:True,&quot;&quot;url &quot;&quot;:&quot;&quot;https://spider.cloud &quot;&quot;}\nresponse = requests.post(&#x27;&#x27;https://api.spider.cloud/crawl &#x27;&#x27;, headers=headers, json=json\\_data, stream=True)\nwith response as r:\nr.raise\\_for\\_status()\nfor chunk in r.iter\\_lines(\nchunk\\_size=None, decode\\_unicode=True\n):\ndata = json.loads(chunk)\nprint(data)`\n```\n[Free Trial](https://spider.cloud/credits/new?free-trial=1)\nExample Response\n## Built with the need for**Speed**\nExperience the power of**Spider**, built fully in**Rust**for\nnext-generation scalability.\n### 2.4secs\nTo crawl over 20,000 pages\n### 500-1000x\nFaster than alternatives\n### 500x\nCheaper than traditional scraping services\nBenchmarks displaying performance between Spider API request modes.\nSpider API Request Modes &middot;Benchmarked tailwindcss.com &middot;06/16/2024\n[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)\n### Seamless Integrations\nSeamlessly integrate Spider with a wide range of platforms, ensuring data curation\nperfectly aligned with your requirements. Compatible with all major AI tools.\n[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider)\n### Concurrent Streaming\nSave time and money without having to worry about bandwidth concerns by effectively\nstreaming all the results concurrently. The latency cost that is saved becomes drastic as\nyou crawl more websites.\n### Warp Speed\nPowered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme\nworkloads. 
We ensure continuous maintenance and improvement for top-tier performance.\n## Kickstart Your Data Collecting Projects Today\nJumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.\n### Performance Tuned\nSpider is written in Rust and runs in full concurrency to achieve crawling thousands of\npages in secs.\n### Multiple response formats\nGet clean and formatted markdown, HTML, or text content for fine-tuning or training AI\nmodels.\n### Caching\nFurther boost speed by caching repeated web page crawls to minimize expenses while\nbuilding.\n### Smart Mode\nSpider dynamically switches to Headless Chrome when it needs to quick.\nBeta\n### Scrape with AI\nDo custom browser scripting and data extraction using the latest AI models with no cost\nstep caching.\n### The crawler for LLMs\nDon't let crawling and scraping be the highest latency in your LLM & AI agent stack.\n### Scrape with no headaches\n* Auto Proxy rotations\n* Agent headers\n* Anti-bot detections\n* Headless chrome\n* Markdown responses\n### The Fastest Web Crawler\n* Powered by[spider-rs](https://github.com/spider-rs/spider)\n* 100,000 pages/seconds\n* Unlimited concurrency\n* Simple API\n* 50,000 RPM\n### Do more with AI\n* Browser scripting\n* Advanced extraction\n* Data pipelines\n* Ideal for LLMs and AI Agents\n* Accurate labeling\n## Achieve more with these new API features\nOur API is set to stream so you can act in realtime.\n![A user interface with a search bar containing the text &#34;Latest sports news,&#34; a green &#34;Submit&#34; button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp)\n### Search\nGet access to search engine results from anywhere and easily crawl and transform pages to\nLLM-ready markdown.\n[Explore SearchRight Arrow](https://spider.cloud/docs/api#search)\n![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp)\n### Transform\nConvert raw HTML into markdown easily by using this API. Transform thousands of html pages\nin seconds.\n[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform)\n## Join the community\nBacked by a network of early advocates, contributors, and supporters.\n[GitHub discussions\nChat Icon\n](https://github.com/orgs/spider-rs/discussions)[Discord\nChat Icon\n](https://discord.spider.cloud)\n[\n![iammerrick's avatar](/img/external/iammerrick_twitter.webp)\n@iammerrick\nRust based crawler Spider is next level for crawling &amp;scraping sites. So fast.\nTheir cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider\n](https://twitter.com/iammerrick/status/1787873425446572462)\n[\n![WilliamEspegren's avatar](/img/external/william_twitter.webp)\n@WilliamEspegren\nWeb crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor\nName a reason for me to use anything else?\ngit.luolix.top/spider-rs/spid…\n](https://twitter.com/WilliamEspegren/status/1789419820821184764)\n[\n![gasa's avatar](/img/external/gaza_twitter.webp)\n@gasa\n@gasathenaper\nis the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. 
Spider does it in an instant\n](https://x.com/gasathenaper/status/1810612492596383948)\n[\n![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp)\n@Ashpreet Bedi\n@ashpreetbedi\nis THE best crawler out there, give it a try\n](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)\n[\n![Troyusrex's avatar](/img/external/troy_twitter.webp)\n@Troyusrex\nI found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.\n](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)\n[\n![Dify.AI's avatar](/img/external/difyai.webp)\n@Dify.AI\n🕷\u{fe0f}Spider @spider\\_rust\ncan be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.\n](https://x.com/dify_ai/status/1818226971056243089)\n## FAQ\nFrequently asked questions about Spider.\n### What is Spider?\nSpider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.\n### Why is my website not crawling?\nYour crawl may fail if it requires JavaScript rendering. Try setting your request to &#x27;chrome &#x27;to solve this issue.\n### Can you crawl all pages?\nYes, Spider accurately crawls all necessary content without needing a sitemap.\n### What formats can Spider convert web data into?\nSpider outputs HTML, raw, text, and various markdown formats. It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.\n### Is Spider suitable for large scraping projects?\nAbsolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.\n### How can I try Spider?\nPurchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.\n### Does it respect robots.txt?\nYes, compliance with robots.txt is default, but you can disable this if necessary.\n### Unable to get dynamic content?\nIf you are having trouble getting dynamic pages, try setting the request parameter to &quot;&quot;chrome &quot;&quot;or &quot;&quot;smart.&quot;&quot;You may also need to set `disable\\_intercept` to allow third-party or external scripts to run.\n### Why is my crawl going slow?\nIf you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. 
The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.\n### Do you offer a Free Trial?\nYes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).\n## Comprehensive Data Curation for Everyone\nTrusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.\nOuter Labs\n[Zapier LogoZapier](https://zapier.com/apps/spider/integrations)\nElementus Logo\nSuper AI Logo\nLayerX Logo\nSwiss Re\nWrite Sonic Logo\nAlioth Logo\n### Next generation data for AI, scale to millions\n[Start now](https://spider.cloud/credits/new)\n### Company\n* [About](https://spider.cloud/about)\n* [Privacy](https://spider.cloud/privacy)\n* [Terms](https://spider.cloud/eula)\n* [FAQ](https://spider.cloud/faq)\n### Resources\n* [API](https://spider.cloud/docs/api)\n* [Docs](https://spider.cloud/docs/overview)\n* [Guides](https://spider.cloud/guides)\n* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)\n### Services\n* [Pricing](https://spider.cloud/credits/new)\n* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)\n[All systems normal.](https://spidercloud.statuspage.io/)\n[\nGithub LogoGitHub\n](https://github.com/spider-rs/spider)[\nDiscord LogoDiscord\n](https://discord.spider.cloud)[\nTwitter LogoTwitter\n](https://twitter.com/spider_rust)"#);
}

#[test]
#[ignore]
fn test_cheatsheet() {
