From 1fda654ad9bc9620c897eac66a5fcef3b1478a2d Mon Sep 17 00:00:00 2001 From: Michalis Kargakis Date: Thu, 30 Jan 2025 13:39:15 +0100 Subject: [PATCH] Simplify code by leveraging automatic extension detection --- Makefile | 2 +- src/chain/evm.rs | 91 +++++++--------------------------------------- src/chain/tezos.rs | 2 - src/content/mod.rs | 78 ++++++++++++++++----------------------- src/url/mod.rs | 90 +++++++-------------------------------------- 5 files changed, 59 insertions(+), 204 deletions(-) diff --git a/Makefile b/Makefile index dced0ea..5561fef 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ clippy: .PHONY: test test: - cargo test -- $(filter-out $@,$(MAKECMDGOALS)) + cargo test .PHONY: check check: diff --git a/src/chain/evm.rs b/src/chain/evm.rs index 0734149..e4ecfa4 100644 --- a/src/chain/evm.rs +++ b/src/chain/evm.rs @@ -11,7 +11,7 @@ use std::time::Duration; use std::{future::Future, path::Path}; use tokio::fs; use tokio::time::sleep; -use tracing::{debug, error, warn}; +use tracing::{debug, error}; use crate::content::{ extensions::fetch_and_save_additional_content, fetch_and_save_content, Options, @@ -22,11 +22,7 @@ pub struct NFTMetadata { pub name: Option, pub description: Option, pub image: Option, - #[serde(default)] - pub image_details: Option
, pub animation_url: Option, - #[serde(default)] - pub animation_details: Option
, pub external_url: Option, pub attributes: Option>, pub media: Option, @@ -34,13 +30,6 @@ pub struct NFTMetadata { pub assets: Option, } -#[derive(Debug, Serialize, Deserialize)] -#[serde(untagged)] -pub enum Details { - Structured { format: String }, - Raw(String), -} - #[derive(Debug, Serialize, Deserialize)] pub struct NFTAttribute { pub trait_type: String, @@ -50,11 +39,6 @@ pub struct NFTAttribute { #[derive(Debug, Serialize, Deserialize)] pub struct Media { pub uri: String, - pub dimensions: Option, - pub size: Option, - #[serde(rename = "mimeType")] - pub mime_type: Option, - pub mime: Option, } #[derive(Debug, Serialize, Deserialize)] @@ -138,77 +122,35 @@ async fn get_token_uri( Ok(uri) } -fn get_extension_from_mime(mime: &str) -> Option { - match mime.split('/').last() { - Some("gltf-binary") => Some("glb".to_string()), - Some("octet-stream") => None, - Some(ext) => Some(ext.to_string()), - None => None, - } -} - -fn get_extension_from_media(media: &Media) -> Option { - if let Some(mime_type) = &media.mime_type { - return get_extension_from_mime(mime_type); - } - if let Some(mime) = &media.mime { - return get_extension_from_mime(mime); - } - None -} - -fn parse_details(details: &Details) -> String { - match details { - Details::Structured { format } => format.to_lowercase(), - // Ugly but apparently some metadata is not structured properly - // eg. 
AMC's Oración - Details::Raw(raw_string) => serde_json::from_str::(raw_string) - .unwrap() - .get("format") - .unwrap() - .as_str() - .unwrap() - .to_string(), - } -} - -fn get_uri_and_extension_from_media(media: &Media, fallback_uri: &str) -> (String, Option) { +fn get_uri_from_media(media: &Media, fallback_uri: &str) -> String { let mut uri = media.uri.to_string(); if uri.is_empty() { uri = fallback_uri.to_string(); } - (uri, get_extension_from_media(media)) + uri } -fn get_uri_and_extension_from_metadata( +fn get_uri_from_metadata( metadata: &NFTMetadata, fallback_uri: &str, check_image_details: bool, check_animation_details: bool, -) -> (String, Option) { +) -> String { if !check_image_details && !check_animation_details { panic!("Need to check the extension of either an image or animation"); } if let Some(media) = &metadata.media { - return get_uri_and_extension_from_media(media, fallback_uri); + return get_uri_from_media(media, fallback_uri); } if let Some(content) = &metadata.content { - return get_uri_and_extension_from_media(content, fallback_uri); - } - if check_image_details && metadata.image_details.is_some() { - let format = parse_details(metadata.image_details.as_ref().unwrap()); - return (fallback_uri.to_string(), Some(format)); - } - if check_animation_details && metadata.animation_details.is_some() { - let format = parse_details(metadata.animation_details.as_ref().unwrap()); - return (fallback_uri.to_string(), Some(format)); + return get_uri_from_media(content, fallback_uri); } if let Some(assets) = &metadata.assets { if let Some(glb) = &assets.glb { - return (glb.to_string(), Some("glb".to_string())); + return glb.to_string(); } } - (fallback_uri.to_string(), None) + fallback_uri.to_string() } pub async fn process_nfts( @@ -236,7 +178,7 @@ pub async fn process_nfts( let contract_addr = match contract.address.parse::
() { Ok(addr) => addr, Err(e) => { - warn!("Failed to parse contract address on {}: {}", chain_name, e); + error!("Failed to parse contract address on {}: {}", chain_name, e); continue; } }; @@ -245,7 +187,7 @@ pub async fn process_nfts( let token_id = match U256::from_str_radix(&contract.token_id, 10) { Ok(id) => id, Err(e) => { - warn!("Failed to parse token ID: {}", e); + error!("Failed to parse token ID: {}", e); continue; } }; @@ -254,7 +196,7 @@ pub async fn process_nfts( let token_uri = match get_token_uri(contract_addr, provider.clone(), token_id).await { Ok(uri) => uri, Err(e) => { - error!("Failed to get token URI: {}, skipping token", e); + error!("Failed to get token URI: {}", e); continue; } }; @@ -272,7 +214,6 @@ pub async fn process_nfts( Options { overriden_filename: Some("metadata.json".to_string()), fallback_filename: None, - fallback_extension: None, }, ) .await @@ -289,8 +230,7 @@ pub async fn process_nfts( // Save linked content if let Some(image_url) = &metadata.image { - let (image_url, extension) = - get_uri_and_extension_from_metadata(&metadata, image_url, true, false); + let image_url = get_uri_from_metadata(&metadata, image_url, true, false); debug!("Downloading image from {}", image_url); fetch_and_save_content( &image_url, @@ -301,15 +241,13 @@ pub async fn process_nfts( Options { overriden_filename: None, fallback_filename: Some("image".to_string()), - fallback_extension: extension, }, ) .await?; } if let Some(animation_url) = &metadata.animation_url { - let (animation_url, extension) = - get_uri_and_extension_from_metadata(&metadata, animation_url, false, true); + let animation_url = get_uri_from_metadata(&metadata, animation_url, false, true); debug!("Downloading animation from {}", animation_url); fetch_and_save_content( &animation_url, @@ -320,7 +258,6 @@ pub async fn process_nfts( Options { overriden_filename: None, fallback_filename: Some("animation".to_string()), - fallback_extension: extension, }, ) .await?; diff --git 
a/src/chain/tezos.rs b/src/chain/tezos.rs index 9d5e98b..2391c29 100644 --- a/src/chain/tezos.rs +++ b/src/chain/tezos.rs @@ -127,7 +127,6 @@ pub async fn process_nfts( Options { overriden_filename: Some("metadata.json".to_string()), fallback_filename: None, - fallback_extension: None, }, ); let metadata_content_str = fs::read_to_string(metadata_content.await?).await?; @@ -192,7 +191,6 @@ pub async fn process_nfts( Options { overriden_filename: Some(file_name), fallback_filename: None, - fallback_extension: None, }, ) .await?; diff --git a/src/content/mod.rs b/src/content/mod.rs index a3e0b27..e068ea4 100644 --- a/src/content/mod.rs +++ b/src/content/mod.rs @@ -1,7 +1,6 @@ -use crate::url::{ - get_data_url_content, get_data_url_mime_type, get_last_path_segment, get_url, is_data_url, -}; +use crate::url::{get_data_url, get_last_path_segment, get_url, is_data_url}; use anyhow::Result; +use serde_json::Value; use std::path::{Path, PathBuf}; use tokio::fs; use tracing::{debug, info}; @@ -9,7 +8,7 @@ use tracing::{debug, info}; pub mod extensions; pub mod html; -async fn fetch_http_content(url: &str) -> Result<(Vec, String)> { +async fn fetch_http_content(url: &str) -> Result> { let client = reqwest::Client::new(); let response = client.get(url).send().await?; @@ -22,22 +21,14 @@ async fn fetch_http_content(url: &str) -> Result<(Vec, String)> { )); } - let content_type = response - .headers() - .get(reqwest::header::CONTENT_TYPE) - .and_then(|h| h.to_str().ok()) - .unwrap_or("") - .to_string(); - let content = response.bytes().await?.to_vec(); - Ok((content, content_type)) + Ok(content) } pub struct Options { pub overriden_filename: Option, pub fallback_filename: Option, - pub fallback_extension: Option, } async fn get_filename( @@ -54,16 +45,10 @@ async fn get_filename( .join(token_id); // Determine filename - let mut filename = if let Some(name) = options.overriden_filename { + let filename = if let Some(name) = options.overriden_filename { name.to_string() } else if 
is_data_url(url) { - // For data URLs, use content type as filename - let mime_type = get_data_url_mime_type(url); - format!( - "{}.{}", - options.fallback_filename.unwrap_or("content".to_string()), - mime_type - ) + options.fallback_filename.unwrap_or("content".to_string()) } else { // For regular URLs, try to extract filename from path get_last_path_segment( @@ -75,12 +60,6 @@ async fn get_filename( ) }; - if let Some(extension) = options.fallback_extension { - if !filename.contains('.') { - filename = format!("{}.{}", filename, extension); - } - } - let file_path = dir_path.join(&filename); Ok(file_path) @@ -101,6 +80,16 @@ fn detect_media_extension(content: &[u8]) -> Option<&'static str> { [0x00, 0x00, 0x00, _, 0x66, 0x74, 0x79, 0x70, 0x6D, 0x70, 0x34, 0x32, ..] => Some("mp4"), // QuickTime MOV [0x00, 0x00, 0x00, 0x14, 0x66, 0x74, 0x79, 0x70, 0x71, 0x74, 0x20, 0x20, ..] => Some("mov"), + // HTML + [b'<', b'h', b't', b'm', b'l', ..] => Some("html"), + // HTML starting with { + Some("html") + } + // JSON + [b'{', ..] => Some("json"), + // GLB + [0x47, 0x4C, 0x42, 0x0D, 0x0A, 0x1A, 0x0A, ..] => Some("glb"), _ => None, } } @@ -123,8 +112,8 @@ pub async fn fetch_and_save_content( } // Get content based on URL type - let (mut content, content_type) = if is_data_url(url) { - get_data_url_content(url)? + let mut content = if is_data_url(url) { + get_data_url(url).unwrap() } else { let content_url = get_url(url); // TODO: Rotate IPFS gateways to handle rate limits @@ -134,24 +123,7 @@ pub async fn fetch_and_save_content( // Create directory and save content fs::create_dir_all(file_path.parent().unwrap()).await?; - if content_type.contains("text/html") || content_type.contains("application/xhtml") { - if !file_path.to_string_lossy().ends_with(".html") { - file_path = file_path.with_extension("html"); - } - debug!("Downloading HTML content from {}. 
The saved files may be incomplete as they may have more dependencies.", url); - let content_str = String::from_utf8_lossy(&content); - html::download_html_resources(&content_str, url, file_path.parent().unwrap()).await?; - } else if content_type.contains("application/json") { - // Try to parse and format JSON content - if let Ok(content_str) = String::from_utf8(content.clone()) { - if let Ok(json_value) = serde_json::from_str::(&content_str) { - content = serde_json::to_string_pretty(&json_value)?.into(); - } - } - } - - // After the HTML/JSON handling block, add media extension detection: - // Check for media files if no extension detected + // Detect media extension if no extension is present if file_path.extension().is_none() { if let Some(ext) = detect_media_extension(&content) { file_path = file_path.with_extension(ext); @@ -159,6 +131,18 @@ pub async fn fetch_and_save_content( } } + match file_path.extension().unwrap_or_default().to_str() { + Some("json") => { + let json_value: Value = serde_json::from_slice(&content)?; + content = serde_json::to_string_pretty(&json_value)?.into(); + } + Some("html") => { + let content_str = String::from_utf8_lossy(&content); + html::download_html_resources(&content_str, url, file_path.parent().unwrap()).await?; + } + _ => {} + } + // Check if file exists again before downloading if fs::try_exists(&file_path).await? 
{ debug!("File already exists at {}", file_path.display()); diff --git a/src/url/mod.rs b/src/url/mod.rs index c0b3a82..0718da0 100644 --- a/src/url/mod.rs +++ b/src/url/mod.rs @@ -1,22 +1,12 @@ use ::url::Url; -use anyhow::Result; use base64::Engine; pub fn is_data_url(url: &str) -> bool { url.starts_with("data:") } -pub fn get_data_url_mime_type(url: &str) -> String { - url.trim_start_matches("data:") - .split(';') - .next() - .and_then(|s| s.split('/').last()) - .unwrap_or("bin") - .to_string() -} - /// Extract content from a data URL -fn get_data_url(url: &str) -> Option<(String, Vec)> { +pub fn get_data_url(url: &str) -> Option> { if !is_data_url(url) { return None; } @@ -26,20 +16,11 @@ fn get_data_url(url: &str) -> Option<(String, Vec)> { return None; } - let mime_part = parts[0].trim_start_matches("data:").trim_end_matches(";"); let data = base64::engine::general_purpose::STANDARD .decode(parts[1]) .ok()?; - Some((mime_part.to_string(), data)) -} - -/// Get content from a data URL, returns the content and suggested file extension -pub fn get_data_url_content(url: &str) -> Result<(Vec, String)> { - let (mime_type, content) = - get_data_url(url).ok_or_else(|| anyhow::anyhow!("Invalid data URL format"))?; - - Ok((content, mime_type)) + Some(data) } /// Converts IPFS URLs to use a gateway, otherwise returns the original URL @@ -83,23 +64,20 @@ mod tests { } #[test] - fn test_get_data_url_content_json() { + fn test_get_data_url_json() { let data_url = "data:application/json;base64,eyAidGVzdCI6IDEyMyB9"; // base64 encoded '{ "test": 123 }' - let (content, mime_type) = get_data_url_content(data_url).unwrap(); + let content = get_data_url(data_url).unwrap(); assert_eq!(String::from_utf8_lossy(&content), "{ \"test\": 123 }"); - assert_eq!(mime_type, "application/json"); } #[test] - fn test_get_data_url_content_invalid() { - let result = get_data_url_content("data:text/plain;base64,invalid@@base64"); - assert!(result.is_err()); + fn test_get_data_url_invalid() { + 
assert!(get_data_url("data:text/plain;base64,invalid@@base64").is_none()); } #[test] - fn test_get_data_url_content_not_data_url() { - let result = get_data_url_content("https://example.com/image.png"); - assert!(result.is_err()); + fn test_get_data_url_not_data_url() { + assert!(get_data_url("https://example.com/image.png").is_none()); } #[test] @@ -113,42 +91,15 @@ mod tests { } #[test] - fn test_get_data_url_mime_type() { - assert_eq!( - get_data_url_mime_type("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA"), - "png" - ); - assert_eq!( - get_data_url_mime_type("data:text/plain;charset=UTF-8;base64,SGVsbG8="), - "plain" - ); - assert_eq!( - get_data_url_mime_type("data:application/vnd.custom+json;base64,eyJhIjogMn0="), - "vnd.custom+json" - ); - assert_eq!( - get_data_url_mime_type("data:invalidmime;base64,SGVsbG8="), - "invalidmime" - ); - } - - #[test] - fn test_get_data_url() { - // Valid base64 URL - let (mime, data) = get_data_url("data:text/plain;base64,SGVsbG8gd29ybGQ=").unwrap(); - assert_eq!(mime, "text/plain"); - assert_eq!(data, b"Hello world"); + fn test_get_data_url_base64() { + let content = get_data_url("data:text/plain;base64,SGVsbG8gd29ybGQ=").unwrap(); + assert_eq!(content, b"Hello world"); - // URL without base64 marker assert!(get_data_url("data:image/png,rawdata").is_none()); - - // Invalid base64 data assert!(get_data_url("data:text/plain;base64,Invalid@Base64!").is_none()); - // Empty data - let (mime, data) = get_data_url("data:text/plain;base64,").unwrap(); - assert_eq!(mime, "text/plain"); - assert!(data.is_empty()); + let empty = get_data_url("data:text/plain;base64,").unwrap(); + assert!(empty.is_empty()); } #[test] @@ -195,19 +146,4 @@ mod tests { "docs" ); } - - #[test] - fn test_get_data_url_content_non_base64() { - // URL with missing base64 marker - let result = get_data_url_content("data:text/plain,HelloWorld"); - assert!(result.is_err()); - } - - #[test] - fn test_get_data_url_content_empty_mime() { - let data_url = 
"data:;base64,dGVzdCBjb250ZW50"; // base64 encoded 'test content' - let (content, mime_type) = get_data_url_content(data_url).unwrap(); - assert_eq!(String::from_utf8_lossy(&content), "test content"); - assert_eq!(mime_type, ""); - } }