chore(transformations): add root selector across formats
j-mendez committed Oct 21, 2024
1 parent 43a0ddc commit 9f55927
Showing 9 changed files with 188 additions and 217 deletions.
191 changes: 94 additions & 97 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.9.14"
version = "2.9.15"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.9.14"
version = "2.9.15"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.9.14"
version = "2.9.15"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.9.14"
version = "2.9.15"
authors = [
"j-mendez <jeff@spider.cloud>"
]
181 changes: 67 additions & 114 deletions spider_transformations/src/transformation/content.rs
@@ -228,6 +228,25 @@ fn get_html(res: &Page, encoding: &Option<String>) -> String {
}
}

/// get the html with the root selector
fn get_html_with_selector(
res: &Page,
encoding: &Option<String>,
root_selector: Option<&String>,
) -> String {
let html = get_html(&res, &encoding);
if let Some(selector) = root_selector {
if let Ok(parsed_selector) = Selector::parse(selector) {
let fragment = Html::parse_fragment(&html);
let root_element = fragment.select(&parsed_selector).next();
if let Some(root_node) = root_element {
return root_node.html();
}
}
};
html
}

/// Transform format the content.
pub fn transform_content(
res: &Page,
@@ -238,29 +257,30 @@ pub fn transform_content(
let return_format = c.return_format;
let filter_images = c.filter_images;
let url_parsed = res.get_url_parsed().as_ref();
let base_html = get_html_with_selector(res, encoding, root_selector.as_ref());

match return_format {
ReturnFormat::Raw | ReturnFormat::Bytes => {
if c.readability {
match llm_readability::extractor::extract(
&mut res.get_html_bytes_u8(),
&mut base_html.as_bytes(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
},
&None,
) {
Ok(product) => product.content,
_ => get_html(res, &encoding),
_ => base_html,
}
} else {
get_html(res, &encoding)
base_html
}
}
ReturnFormat::CommonMark => {
let mut html = if c.readability && !res.is_empty() {
match llm_readability::extractor::extract(
&mut res.get_html_bytes_u8(),
&mut base_html.as_bytes(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
@@ -269,15 +289,15 @@
) {
Ok(product) => {
if product.content.is_empty() {
get_html(res, &encoding)
base_html
} else {
product.content
}
}
_ => get_html(res, &encoding),
_ => base_html,
}
} else {
get_html(res, &encoding)
base_html
};

let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> =
@@ -304,7 +324,7 @@
ReturnFormat::Markdown => {
let mut html = if c.readability {
match llm_readability::extractor::extract(
&mut res.get_html_bytes_u8(),
&mut base_html.as_bytes(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
@@ -313,15 +333,15 @@
) {
Ok(product) => {
if product.content.is_empty() {
get_html(res, encoding)
base_html
} else {
product.content
}
}
_ => get_html(res, encoding),
_ => base_html,
}
} else {
get_html(res, encoding)
base_html
};

let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> =
@@ -350,91 +370,39 @@ pub fn transform_content(

html
}
ReturnFormat::Html2Text => match encoding {
Some(ref e) => {
let b = res.get_html_encoded(e);
let b = if c.readability {
match llm_readability::extractor::extract(
&mut b.as_bytes(),
match res.get_url_parsed() {
Some(u) => u,
_ => &EXAMPLE_URL,
},
&None,
) {
Ok(product) => {
if product.content.is_empty() {
get_html(res, &encoding)
} else {
product.content
}
}
_ => b,
}
} else {
b
};

if b.len() > 0 {
crate::html2text::from_read(&b.as_bytes()[..], b.len())
} else {
Default::default()
}
}
_ => {
if c.readability {
match llm_readability::extractor::extract(
&mut res.get_html_bytes_u8(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
},
&None,
) {
Ok(product) => {
let b = {
if product.content.is_empty() {
res.get_html_bytes_u8()
} else {
product.content.as_bytes()
}
};

if b.len() > 0 {
crate::html2text::from_read(&b[..], b.len())
} else {
Default::default()
}
}
_ => match res.get_bytes() {
Some(b) => {
if b.len() > 0 {
crate::html2text::from_read(&b[..], b.len())
} else {
Default::default()
}
}
_ => Default::default(),
},
}
} else {
match res.get_bytes() {
Some(b) => {
if b.len() > 0 {
crate::html2text::from_read(&b[..], b.len())
} else {
Default::default()
}
ReturnFormat::Html2Text => {
let b = if c.readability {
match llm_readability::extractor::extract(
&mut base_html.as_bytes(),
match res.get_url_parsed() {
Some(u) => u,
_ => &EXAMPLE_URL,
},
&None,
) {
Ok(product) => {
if product.content.is_empty() {
base_html
} else {
product.content
}
_ => Default::default(),
}
_ => base_html,
}
} else {
base_html
};

if b.len() > 0 {
crate::html2text::from_read(&b.as_bytes()[..], b.len())
} else {
Default::default()
}
},
}
ReturnFormat::Text => {
let b = if c.readability {
match llm_readability::extractor::extract(
&mut res.get_html_bytes_u8(),
&mut base_html.as_bytes(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
@@ -443,36 +411,23 @@
) {
Ok(product) => {
if product.content.is_empty() {
get_html(res, encoding)
base_html
} else {
product.content
}
}
_ => get_html(res, encoding),
_ => base_html,
}
} else {
get_html(res, encoding)
base_html
};

let fragment = Html::parse_document(&b);

let d = if root_selector.is_some() {
let selector = &match root_selector {
Some(ref root_selector) => match Selector::parse(root_selector) {
Ok(qs) => qs,
_ => SELECTOR.as_ref().clone(),
},
_ => SELECTOR.as_ref().clone(),
};
fragment
.select(&selector)
.filter_map(|c| ElementRef::wrap(*c))
.collect::<Vec<_>>()
} else {
fragment
.select(SELECTOR.as_ref())
.filter_map(|c| ElementRef::wrap(*c))
.collect::<Vec<_>>()
};
let d = fragment
.select(SELECTOR.as_ref())
.filter_map(|c| ElementRef::wrap(*c))
.collect::<Vec<_>>();

super::text_extract::extract_text(&d)
}
Expand All @@ -484,7 +439,7 @@ pub fn transform_content(

if c.readability {
match llm_readability::extractor::extract(
&mut res.get_html_bytes_u8(),
&mut base_html.as_bytes(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
@@ -511,9 +466,7 @@
}
}
} else {
if let Ok(xml) =
convert_html_to_xml(&get_html(res, &encoding), &target_url, &encoding)
{
if let Ok(xml) = convert_html_to_xml(&base_html, &target_url, &encoding) {
xml
} else {
Default::default()
21 changes: 21 additions & 0 deletions spider_transformations/src/transformation/mod.rs
@@ -112,4 +112,25 @@ mod tests {
"The tranform to xml is invalid"
);
}

#[test]
fn test_transformations_root_selector() {
let markup = template().into_string();
let url = "https://spider.cloud";

let mut conf = content::TransformConfig::default();
let mut page_response = PageResponse::default();

page_response.content = Some(Bytes::from(markup));
let page = build(url, page_response);

conf.return_format = ReturnFormat::Markdown;

let content = content::transform_content(&page, &conf, &None, &Some("pre".into()));

assert!(
content.contains(&"The content is ready"),
"The tranform to markdown is invalid"
);
}
}
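
For reference, a minimal usage sketch of the new root_selector argument, modeled on the test above. The import paths, example URL, and markup below are assumptions for illustration and are not part of this commit.

// Sketch only: demonstrates the new fourth argument of transform_content,
// the optional CSS root selector added in this commit.
// Import paths are assumed from the crate layout shown in the diff.
use bytes::Bytes;
use spider::page::build;
use spider::utils::PageResponse;
use spider_transformations::transformation::content::{self, ReturnFormat, TransformConfig};

fn main() {
    let html = "<html><body><nav>menu</nav><article><h1>Post</h1><p>Body text.</p></article></body></html>";

    let mut response = PageResponse::default();
    response.content = Some(Bytes::from(html));
    let page = build("https://example.com", response);

    let mut conf = TransformConfig::default();
    conf.return_format = ReturnFormat::Markdown;

    // Passing Some("article") restricts the transformation to the first element
    // matching the selector; None keeps the previous whole-document behavior.
    let markdown = content::transform_content(&page, &conf, &None, &Some("article".into()));
    println!("{markdown}");
}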
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.9.14"
version = "2.9.15"
authors = [
"j-mendez <jeff@spider.cloud>"
]
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.9.14"
version = "2.9.15"
authors = [
"j-mendez <jeff@spider.cloud>"
]
