From 1fda654ad9bc9620c897eac66a5fcef3b1478a2d Mon Sep 17 00:00:00 2001
From: Michalis Kargakis <kargakis@protonmail.com>
Date: Thu, 30 Jan 2025 13:39:15 +0100
Subject: [PATCH] Simplify code by leveraging automatic extension detection

---
 Makefile           |  2 +-
 src/chain/evm.rs   | 91 +++++++---------------------------------------
 src/chain/tezos.rs |  2 -
 src/content/mod.rs | 78 ++++++++++++++++-----------------------
 src/url/mod.rs     | 90 +++++++--------------------------------------
 5 files changed, 59 insertions(+), 204 deletions(-)
diff --git a/Makefile b/Makefile
index dced0ea..5561fef 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ clippy:
 
 .PHONY: test
 test:
-	cargo test -- $(filter-out $@,$(MAKECMDGOALS))
+	cargo test
 
 .PHONY: check
 check:
diff --git a/src/chain/evm.rs b/src/chain/evm.rs
index 0734149..e4ecfa4 100644
--- a/src/chain/evm.rs
+++ b/src/chain/evm.rs
@@ -11,7 +11,7 @@ use std::time::Duration;
 use std::{future::Future, path::Path};
 use tokio::fs;
 use tokio::time::sleep;
-use tracing::{debug, error, warn};
+use tracing::{debug, error};
 
 use crate::content::{
     extensions::fetch_and_save_additional_content, fetch_and_save_content, Options,
@@ -22,11 +22,7 @@ pub struct NFTMetadata {
     pub name: Option<String>,
     pub description: Option<String>,
     pub image: Option<String>,
-    #[serde(default)]
-    pub image_details: Option<Details>,
     pub animation_url: Option<String>,
-    #[serde(default)]
-    pub animation_details: Option<Details>,
     pub external_url: Option<String>,
     pub attributes: Option<Vec<NFTAttribute>>,
     pub media: Option<Media>,
@@ -34,13 +30,6 @@ pub struct NFTMetadata {
     pub assets: Option<Assets>,
 }
 
-#[derive(Debug, Serialize, Deserialize)]
-#[serde(untagged)]
-pub enum Details {
-    Structured { format: String },
-    Raw(String),
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct NFTAttribute {
     pub trait_type: String,
@@ -50,11 +39,6 @@ pub struct NFTAttribute {
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Media {
     pub uri: String,
-    pub dimensions: Option<String>,
-    pub size: Option<String>,
-    #[serde(rename = "mimeType")]
-    pub mime_type: Option<String>,
-    pub mime: Option<String>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -138,77 +122,35 @@ async fn get_token_uri(
     Ok(uri)
 }
 
-fn get_extension_from_mime(mime: &str) -> Option<String> {
-    match mime.split('/').last() {
-        Some("gltf-binary") => Some("glb".to_string()),
-        Some("octet-stream") => None,
-        Some(ext) => Some(ext.to_string()),
-        None => None,
-    }
-}
-
-fn get_extension_from_media(media: &Media) -> Option<String> {
-    if let Some(mime_type) = &media.mime_type {
-        return get_extension_from_mime(mime_type);
-    }
-    if let Some(mime) = &media.mime {
-        return get_extension_from_mime(mime);
-    }
-    None
-}
-
-fn parse_details(details: &Details) -> String {
-    match details {
-        Details::Structured { format } => format.to_lowercase(),
-        // Ugly but apparently some metadata is not structured properly
-        // eg. AMC's Oración
-        Details::Raw(raw_string) => serde_json::from_str::<serde_json::Value>(raw_string)
-            .unwrap()
-            .get("format")
-            .unwrap()
-            .as_str()
-            .unwrap()
-            .to_string(),
-    }
-}
-
-fn get_uri_and_extension_from_media(media: &Media, fallback_uri: &str) -> (String, Option<String>) {
+fn get_uri_from_media(media: &Media, fallback_uri: &str) -> String {
     let mut uri = media.uri.to_string();
     if uri.is_empty() {
         uri = fallback_uri.to_string();
     }
-    (uri, get_extension_from_media(media))
+    uri
 }
 
-fn get_uri_and_extension_from_metadata(
+fn get_uri_from_metadata(
     metadata: &NFTMetadata,
     fallback_uri: &str,
     check_image_details: bool,
     check_animation_details: bool,
-) -> (String, Option<String>) {
+) -> String {
     if !check_image_details && !check_animation_details {
         panic!("Need to check the extension of either an image or animation");
     }
     if let Some(media) = &metadata.media {
-        return get_uri_and_extension_from_media(media, fallback_uri);
+        return get_uri_from_media(media, fallback_uri);
     }
     if let Some(content) = &metadata.content {
-        return get_uri_and_extension_from_media(content, fallback_uri);
-    }
-    if check_image_details && metadata.image_details.is_some() {
-        let format = parse_details(metadata.image_details.as_ref().unwrap());
-        return (fallback_uri.to_string(), Some(format));
-    }
-    if check_animation_details && metadata.animation_details.is_some() {
-        let format = parse_details(metadata.animation_details.as_ref().unwrap());
-        return (fallback_uri.to_string(), Some(format));
+        return get_uri_from_media(content, fallback_uri);
     }
     if let Some(assets) = &metadata.assets {
         if let Some(glb) = &assets.glb {
-            return (glb.to_string(), Some("glb".to_string()));
+            return glb.to_string();
         }
     }
-    (fallback_uri.to_string(), None)
+    fallback_uri.to_string()
 }
 
 pub async fn process_nfts(
@@ -236,7 +178,7 @@ pub async fn process_nfts(
         let contract_addr = match contract.address.parse::<Address>() {
             Ok(addr) => addr,
             Err(e) => {
-                warn!("Failed to parse contract address on {}: {}", chain_name, e);
+                error!("Failed to parse contract address on {}: {}", chain_name, e);
                 continue;
             }
         };
@@ -245,7 +187,7 @@ pub async fn process_nfts(
         let token_id = match U256::from_str_radix(&contract.token_id, 10) {
             Ok(id) => id,
             Err(e) => {
-                warn!("Failed to parse token ID: {}", e);
+                error!("Failed to parse token ID: {}", e);
                 continue;
             }
         };
@@ -254,7 +196,7 @@ pub async fn process_nfts(
         let token_uri = match get_token_uri(contract_addr, provider.clone(), token_id).await {
             Ok(uri) => uri,
             Err(e) => {
-                error!("Failed to get token URI: {}, skipping token", e);
+                error!("Failed to get token URI: {}", e);
                 continue;
             }
         };
@@ -272,7 +214,6 @@ pub async fn process_nfts(
             Options {
                 overriden_filename: Some("metadata.json".to_string()),
                 fallback_filename: None,
-                fallback_extension: None,
             },
         )
         .await
@@ -289,8 +230,7 @@ pub async fn process_nfts(
 
         // Save linked content
         if let Some(image_url) = &metadata.image {
-            let (image_url, extension) =
-                get_uri_and_extension_from_metadata(&metadata, image_url, true, false);
+            let image_url = get_uri_from_metadata(&metadata, image_url, true, false);
             debug!("Downloading image from {}", image_url);
             fetch_and_save_content(
                 &image_url,
@@ -301,15 +241,13 @@ pub async fn process_nfts(
                 Options {
                     overriden_filename: None,
                     fallback_filename: Some("image".to_string()),
-                    fallback_extension: extension,
                 },
             )
             .await?;
         }
 
         if let Some(animation_url) = &metadata.animation_url {
-            let (animation_url, extension) =
-                get_uri_and_extension_from_metadata(&metadata, animation_url, false, true);
+            let animation_url = get_uri_from_metadata(&metadata, animation_url, false, true);
             debug!("Downloading animation from {}", animation_url);
             fetch_and_save_content(
                 &animation_url,
@@ -320,7 +258,6 @@ pub async fn process_nfts(
                 Options {
                     overriden_filename: None,
                     fallback_filename: Some("animation".to_string()),
-                    fallback_extension: extension,
                 },
             )
             .await?;
diff --git a/src/chain/tezos.rs b/src/chain/tezos.rs
index 9d5e98b..2391c29 100644
--- a/src/chain/tezos.rs
+++ b/src/chain/tezos.rs
@@ -127,7 +127,6 @@ pub async fn process_nfts(
                 Options {
                     overriden_filename: Some("metadata.json".to_string()),
                     fallback_filename: None,
-                    fallback_extension: None,
                 },
             );
             let metadata_content_str = fs::read_to_string(metadata_content.await?).await?;
@@ -192,7 +191,6 @@ pub async fn process_nfts(
                     Options {
                         overriden_filename: Some(file_name),
                         fallback_filename: None,
-                        fallback_extension: None,
                     },
                 )
                 .await?;
diff --git a/src/content/mod.rs b/src/content/mod.rs
index a3e0b27..e068ea4 100644
--- a/src/content/mod.rs
+++ b/src/content/mod.rs
@@ -1,7 +1,6 @@
-use crate::url::{
-    get_data_url_content, get_data_url_mime_type, get_last_path_segment, get_url, is_data_url,
-};
+use crate::url::{get_data_url, get_last_path_segment, get_url, is_data_url};
 use anyhow::Result;
+use serde_json::Value;
 use std::path::{Path, PathBuf};
 use tokio::fs;
 use tracing::{debug, info};
@@ -9,7 +8,7 @@ use tracing::{debug, info};
 pub mod extensions;
 pub mod html;
 
-async fn fetch_http_content(url: &str) -> Result<(Vec<u8>, String)> {
+async fn fetch_http_content(url: &str) -> Result<Vec<u8>> {
     let client = reqwest::Client::new();
     let response = client.get(url).send().await?;
 
@@ -22,22 +21,14 @@ async fn fetch_http_content(url: &str) -> Result<(Vec<u8>, String)> {
         ));
     }
 
-    let content_type = response
-        .headers()
-        .get(reqwest::header::CONTENT_TYPE)
-        .and_then(|h| h.to_str().ok())
-        .unwrap_or("")
-        .to_string();
-
     let content = response.bytes().await?.to_vec();
 
-    Ok((content, content_type))
+    Ok(content)
 }
 
 pub struct Options {
     pub overriden_filename: Option<String>,
     pub fallback_filename: Option<String>,
-    pub fallback_extension: Option<String>,
 }
 
 async fn get_filename(
@@ -54,16 +45,10 @@ async fn get_filename(
         .join(token_id);
 
     // Determine filename
-    let mut filename = if let Some(name) = options.overriden_filename {
+    let filename = if let Some(name) = options.overriden_filename {
         name.to_string()
     } else if is_data_url(url) {
-        // For data URLs, use content type as filename
-        let mime_type = get_data_url_mime_type(url);
-        format!(
-            "{}.{}",
-            options.fallback_filename.unwrap_or("content".to_string()),
-            mime_type
-        )
+        options.fallback_filename.unwrap_or("content".to_string())
     } else {
         // For regular URLs, try to extract filename from path
         get_last_path_segment(
@@ -75,12 +60,6 @@ async fn get_filename(
         )
     };
 
-    if let Some(extension) = options.fallback_extension {
-        if !filename.contains('.') {
-            filename = format!("{}.{}", filename, extension);
-        }
-    }
-
     let file_path = dir_path.join(&filename);
 
     Ok(file_path)
@@ -101,6 +80,16 @@ fn detect_media_extension(content: &[u8]) -> Option<&'static str> {
         [0x00, 0x00, 0x00, _, 0x66, 0x74, 0x79, 0x70, 0x6D, 0x70, 0x34, 0x32, ..] => Some("mp4"),
         // QuickTime MOV
         [0x00, 0x00, 0x00, 0x14, 0x66, 0x74, 0x79, 0x70, 0x71, 0x74, 0x20, 0x20, ..] => Some("mov"),
+        // HTML document that opens directly with a lowercase `<html` tag
+        [b'<', b'h', b't', b'm', b'l', ..] => Some("html"),
+        // HTML starting with <!DOCTYPE html
+        [0x3C, 0x21, 0x44, 0x4F, 0x43, 0x54, 0x59, 0x50, 0x45, 0x20, 0x68, 0x74, 0x6D, 0x6C, ..] => {
+            Some("html")
+        }
+        // JSON object (heuristic: any payload whose first byte is `{`)
+        [b'{', ..] => Some("json"),
+        // GLB (binary glTF): the header magic is the ASCII bytes "glTF"
+        [0x67, 0x6C, 0x54, 0x46, ..] => Some("glb"),
         _ => None,
     }
 }
@@ -123,8 +112,8 @@ pub async fn fetch_and_save_content(
     }
 
     // Get content based on URL type
-    let (mut content, content_type) = if is_data_url(url) {
-        get_data_url_content(url)?
+    let mut content = if is_data_url(url) {
+        get_data_url(url).ok_or_else(|| anyhow::anyhow!("Invalid data URL format"))?
     } else {
         let content_url = get_url(url);
         // TODO: Rotate IPFS gateways to handle rate limits
     // Create directory and save content
     fs::create_dir_all(file_path.parent().unwrap()).await?;
 
-    if content_type.contains("text/html") || content_type.contains("application/xhtml") {
-        if !file_path.to_string_lossy().ends_with(".html") {
-            file_path = file_path.with_extension("html");
-        }
-        debug!("Downloading HTML content from {}. The saved files may be incomplete as they may have more dependencies.", url);
-        let content_str = String::from_utf8_lossy(&content);
-        html::download_html_resources(&content_str, url, file_path.parent().unwrap()).await?;
-    } else if content_type.contains("application/json") {
-        // Try to parse and format JSON content
-        if let Ok(content_str) = String::from_utf8(content.clone()) {
-            if let Ok(json_value) = serde_json::from_str::<serde_json::Value>(&content_str) {
-                content = serde_json::to_string_pretty(&json_value)?.into();
-            }
-        }
-    }
-
-    // After the HTML/JSON handling block, add media extension detection:
-    // Check for media files if no extension detected
+    // Detect media extension if no extension is present
     if file_path.extension().is_none() {
         if let Some(ext) = detect_media_extension(&content) {
             file_path = file_path.with_extension(ext);
@@ -159,6 +131,18 @@ pub async fn fetch_and_save_content(
         }
     }
 
+    match file_path.extension().unwrap_or_default().to_str() {
+        Some("json") => match serde_json::from_slice::<Value>(&content) {
+            Ok(json_value) => content = serde_json::to_string_pretty(&json_value)?.into(),
+            Err(e) => debug!("Saving {} without reformatting: invalid JSON ({})", url, e),
+        },
+        Some("html") => {
+            let content_str = String::from_utf8_lossy(&content);
+            html::download_html_resources(&content_str, url, file_path.parent().unwrap()).await?;
+        }
+        _ => {}
+    }
+
     // Check if file exists again before downloading
     if fs::try_exists(&file_path).await? {
         debug!("File already exists at {}", file_path.display());
diff --git a/src/url/mod.rs b/src/url/mod.rs
index c0b3a82..0718da0 100644
--- a/src/url/mod.rs
+++ b/src/url/mod.rs
@@ -1,22 +1,12 @@
 use ::url::Url;
-use anyhow::Result;
 use base64::Engine;
 
 pub fn is_data_url(url: &str) -> bool {
     url.starts_with("data:")
 }
 
-pub fn get_data_url_mime_type(url: &str) -> String {
-    url.trim_start_matches("data:")
-        .split(';')
-        .next()
-        .and_then(|s| s.split('/').last())
-        .unwrap_or("bin")
-        .to_string()
-}
-
 /// Extract content from a data URL
-fn get_data_url(url: &str) -> Option<(String, Vec<u8>)> {
+pub fn get_data_url(url: &str) -> Option<Vec<u8>> {
     if !is_data_url(url) {
         return None;
     }
@@ -26,20 +16,11 @@ fn get_data_url(url: &str) -> Option<(String, Vec<u8>)> {
         return None;
     }
 
-    let mime_part = parts[0].trim_start_matches("data:").trim_end_matches(";");
     let data = base64::engine::general_purpose::STANDARD
         .decode(parts[1])
         .ok()?;
 
-    Some((mime_part.to_string(), data))
-}
-
-/// Get content from a data URL, returns the content and suggested file extension
-pub fn get_data_url_content(url: &str) -> Result<(Vec<u8>, String)> {
-    let (mime_type, content) =
-        get_data_url(url).ok_or_else(|| anyhow::anyhow!("Invalid data URL format"))?;
-
-    Ok((content, mime_type))
+    Some(data)
 }
 
 /// Converts IPFS URLs to use a gateway, otherwise returns the original URL
@@ -83,23 +64,20 @@ mod tests {
     }
 
     #[test]
-    fn test_get_data_url_content_json() {
+    fn test_get_data_url_json() {
         let data_url = "data:application/json;base64,eyAidGVzdCI6IDEyMyB9"; // base64 encoded '{ "test": 123 }'
-        let (content, mime_type) = get_data_url_content(data_url).unwrap();
+        let content = get_data_url(data_url).unwrap();
         assert_eq!(String::from_utf8_lossy(&content), "{ \"test\": 123 }");
-        assert_eq!(mime_type, "application/json");
     }
 
     #[test]
-    fn test_get_data_url_content_invalid() {
-        let result = get_data_url_content("data:text/plain;base64,invalid@@base64");
-        assert!(result.is_err());
+    fn test_get_data_url_invalid() {
+        assert!(get_data_url("data:text/plain;base64,invalid@@base64").is_none());
     }
 
     #[test]
-    fn test_get_data_url_content_not_data_url() {
-        let result = get_data_url_content("https://example.com/image.png");
-        assert!(result.is_err());
+    fn test_get_data_url_not_data_url() {
+        assert!(get_data_url("https://example.com/image.png").is_none());
     }
 
     #[test]
@@ -113,42 +91,15 @@ mod tests {
     }
 
     #[test]
-    fn test_get_data_url_mime_type() {
-        assert_eq!(
-            get_data_url_mime_type("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA"),
-            "png"
-        );
-        assert_eq!(
-            get_data_url_mime_type("data:text/plain;charset=UTF-8;base64,SGVsbG8="),
-            "plain"
-        );
-        assert_eq!(
-            get_data_url_mime_type("data:application/vnd.custom+json;base64,eyJhIjogMn0="),
-            "vnd.custom+json"
-        );
-        assert_eq!(
-            get_data_url_mime_type("data:invalidmime;base64,SGVsbG8="),
-            "invalidmime"
-        );
-    }
-
-    #[test]
-    fn test_get_data_url() {
-        // Valid base64 URL
-        let (mime, data) = get_data_url("data:text/plain;base64,SGVsbG8gd29ybGQ=").unwrap();
-        assert_eq!(mime, "text/plain");
-        assert_eq!(data, b"Hello world");
+    fn test_get_data_url_base64() {
+        let content = get_data_url("data:text/plain;base64,SGVsbG8gd29ybGQ=").unwrap();
+        assert_eq!(content, b"Hello world");
 
-        // URL without base64 marker
         assert!(get_data_url("data:image/png,rawdata").is_none());
-
-        // Invalid base64 data
         assert!(get_data_url("data:text/plain;base64,Invalid@Base64!").is_none());
 
-        // Empty data
-        let (mime, data) = get_data_url("data:text/plain;base64,").unwrap();
-        assert_eq!(mime, "text/plain");
-        assert!(data.is_empty());
+        let empty = get_data_url("data:text/plain;base64,").unwrap();
+        assert!(empty.is_empty());
     }
 
     #[test]
@@ -195,19 +146,4 @@ mod tests {
             "docs"
         );
     }
-
-    #[test]
-    fn test_get_data_url_content_non_base64() {
-        // URL with missing base64 marker
-        let result = get_data_url_content("data:text/plain,HelloWorld");
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_get_data_url_content_empty_mime() {
-        let data_url = "data:;base64,dGVzdCBjb250ZW50"; // base64 encoded 'test content'
-        let (content, mime_type) = get_data_url_content(data_url).unwrap();
-        assert_eq!(String::from_utf8_lossy(&content), "test content");
-        assert_eq!(mime_type, "");
-    }
 }