feat: started with dedup actions
0xCCF4 committed Nov 30, 2024
1 parent 9d4a3f9 commit 00be903
Showing 10 changed files with 401 additions and 44 deletions.
10 changes: 8 additions & 2 deletions src/data/hash.rs
@@ -6,7 +6,7 @@ use const_format::concatcp;
use serde::de::Error;
use serde::{Deserialize, Serialize, Serializer};
use std::fmt;
use std::fmt::Display;
use std::fmt::{Debug, Display};
use std::io::Read;
use std::path::Path;
use std::str::FromStr;
@@ -237,7 +237,7 @@ impl Display for GeneralHashType {
/// * [GeneralHashType] - representation of the different types of hash functions.
/// * [GeneralHasher] - trait for computing hash values.
///
#[derive(Debug, Hash, PartialEq, Eq, Clone, PartialOrd)]
#[derive(Hash, PartialEq, Eq, Clone, PartialOrd)]
pub enum GeneralHash {
#[cfg(feature = "hash-sha2")]
/// A SHA512 hash value.
@@ -318,6 +318,12 @@ impl Display for GeneralHash {
}
}

impl Debug for GeneralHash {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self)
}
}

impl Serialize for GeneralHash {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
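The `Debug` impl added above simply forwards to `Display`, so `{:?}` formatting of a `GeneralHash` now yields the same compact hex digest as `{}` instead of a derived enum dump. A minimal, self-contained sketch of that forwarding pattern, using a hypothetical `HexId` type rather than the crate's own API:

```rust
use std::fmt;

/// Hypothetical wrapper type, used only to illustrate the pattern.
struct HexId(Vec<u8>);

impl fmt::Display for HexId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Render the bytes as lowercase hex.
        for byte in &self.0 {
            write!(f, "{:02x}", byte)?;
        }
        Ok(())
    }
}

/// Forward Debug to Display, the same trick the commit applies to GeneralHash.
impl fmt::Debug for HexId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self)
    }
}

fn main() {
    let id = HexId(vec![0xde, 0xad, 0xbe, 0xef]);
    assert_eq!(format!("{:?}", id), "deadbeef");
}
```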
2 changes: 2 additions & 0 deletions src/lib.rs
@@ -18,6 +18,8 @@ pub mod stages {
pub mod build;
/// Contains the implementation of the clean command.
pub mod clean;
/// Contains the implementation of the dedup command.
pub mod dedup;
}

mod data {
106 changes: 95 additions & 11 deletions src/main.rs
@@ -2,12 +2,14 @@ use backup_deduplicator::hash::GeneralHashType;
use backup_deduplicator::stages::analyze::cmd::AnalysisSettings;
use backup_deduplicator::stages::build::cmd::BuildSettings;
use backup_deduplicator::stages::clean::cmd::CleanSettings;
use backup_deduplicator::stages::{analyze, build, clean};
use backup_deduplicator::stages::{analyze, build, clean, dedup};
use backup_deduplicator::utils;
use clap::{arg, Parser, Subcommand};
use log::{debug, info, trace, LevelFilter};
use std::env;
use std::path::PathBuf;
use std::str::FromStr;
use backup_deduplicator::stages::dedup::golden_model::cmd::DedupGoldenModelSettings;

/// A simple command line tool to deduplicate backups.
#[derive(Parser, Debug)]
@@ -35,7 +37,7 @@ enum Command {
Build {
/// The directory to analyze
#[arg()]
directory: String,
directory: Vec<String>,
/// Traverse into archives
#[arg(short, long)]
archives: bool,
@@ -89,12 +91,46 @@ enum Command {
#[arg(short, long, default_value = "hash_tree.bdd")]
input: String,
/// Output file for the analysis result
#[arg(short, long, default_value = "analysis.json")]
#[arg(short, long, default_value = "analysis.bda")]
output: String,
/// Overwrite the output file
#[arg(long = "overwrite", default_value = "false")]
overwrite: bool,
},
/// Compile a list of actions to deduplicate the file tree
Dedup {
/// The input analysis file to generate actions for.
#[arg(short, long, default_value = "analysis.bda")]
input: String,
/// The output actions file to write the actions to.
#[arg(short, long, default_value = "actions.bdc")]
output: String,
/// Overwrite the output file if it already exists
#[arg(long = "overwrite", default_value = "false")]
overwrite: bool,
/// Deduplication mode and settings
#[command(subcommand)]
mode: DedupMode,
}
}

#[derive(Subcommand, Debug)]
enum DedupMode {
/// In golden model mode, a directory is declared that serves as the reference model.
/// Files from within the reference model are not altered. A list of other directories
/// can be given; from within those directories all files that have a duplicate in the reference model
/// are marked for deletion.
///
/// This mode is useful when you have multiple backups of the same data and want to quickly
/// remove files from older backups that are also present in the newer one.
GoldenModel {
/// The reference model directory
#[arg(short, long)]
reference_model: String,
/// The directories to delete files from.
#[arg(short, long)]
directories: Vec<String>,
}
}
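
A rough sketch of the golden-model logic described in the doc comment above: collect the content hashes present in the reference model, then flag every file in the other directories whose hash also occurs in that set. All names below (`FileRecord`, `mark_deletions`, the `String` hash) are illustrative stand-ins, not the crate's actual types:

```rust
use std::collections::HashSet;
use std::path::PathBuf;

/// Illustrative stand-in; the real crate uses GeneralHash and FilePath.
struct FileRecord {
    path: PathBuf,
    hash: String,
}

/// Files inside the reference model are never touched; files elsewhere that
/// duplicate a reference-model hash are returned as deletion candidates.
fn mark_deletions(reference: &[FileRecord], others: &[FileRecord]) -> Vec<PathBuf> {
    let golden: HashSet<&String> = reference.iter().map(|f| &f.hash).collect();
    others
        .iter()
        .filter(|f| golden.contains(&f.hash))
        .map(|f| f.path.clone())
        .collect()
}
```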

fn main() {
@@ -155,10 +191,10 @@ fn main() {

// Convert to paths and check if they exist

let directory = utils::main::parse_path(
let directory = directory.into_iter().map(|directory| utils::main::parse_path(
directory.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
);
)).collect::<Vec<PathBuf>>();
let output = utils::main::parse_path(
output.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
@@ -167,8 +203,8 @@
utils::main::parse_path(w.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting)
});

if !directory.exists() {
eprintln!("Target directory does not exist: {}", directory.display());
if let Some(dir) = directory.iter().find(|dir| !dir.exists()) {
eprintln!("Target directory does not exist: {}", dir.display());
std::process::exit(exitcode::CONFIG);
}

@@ -198,12 +234,12 @@

// Convert paths to relative path to working directory

let directory = directory.strip_prefix(&working_directory).unwrap_or_else(|_| {
let directory = directory.into_iter().map(|dir| dir.strip_prefix(&working_directory).unwrap_or_else(|_| {
eprintln!("IO error, could not resolve target directory relative to working directory");
std::process::exit(exitcode::CONFIG);
});
}).to_path_buf()).collect::<Vec<PathBuf>>();

info!("Target directory: {:?}", directory);
info!("Target directories: {:?}", directory);
// info!("Archives: {:?}", archives);
info!("Follow symlinks: {:?}", follow_symlinks);
info!("Output: {:?}", output);
@@ -213,7 +249,7 @@
// Run the command

match build::cmd::run(BuildSettings {
directory: directory.to_path_buf(),
directory,
into_archives: archives,
follow_symlinks,
output: output.clone(),
@@ -345,5 +381,53 @@
}
}
}
Command::Dedup {
mode,
input,
output,
overwrite,
} => {
let input = utils::main::parse_path(
input.as_str(),
utils::main::ParsePathKind::AbsoluteExisting,
);
let output = utils::main::parse_path(
output.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
);

if !input.exists() {
eprintln!("Input file does not exist: {:?}", input);
std::process::exit(exitcode::CONFIG);
}

if output.exists() && !overwrite {
eprintln!(
"Output file already exists: {:?}. Set --overwrite to override its content",
output
);
std::process::exit(exitcode::CONFIG);
}

match mode {
DedupMode::GoldenModel { reference_model, directories } => {
match dedup::golden_model::cmd::run(DedupGoldenModelSettings {
input,
output,
reference_model,
directories,
}) {
Ok(_) => {
info!("Dedup command completed successfully");
std::process::exit(exitcode::OK);
}
Err(e) => {
eprintln!("Error: {:?}", e);
std::process::exit(exitcode::SOFTWARE);
}
}
}
}
}
}
}
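
The build command now accepts several target directories; each one is parsed, checked for existence, and then resolved relative to the working directory. A simplified sketch of that per-directory normalization (the real code uses `utils::main::parse_path` and exits via `exitcode` on failure; `relativize_all` here is a hypothetical helper):

```rust
use std::path::{Path, PathBuf};

/// Strip the working directory prefix from every target directory,
/// failing with a message if one of them lies outside the working directory.
fn relativize_all(dirs: Vec<PathBuf>, working_dir: &Path) -> Result<Vec<PathBuf>, String> {
    dirs.into_iter()
        .map(|dir| match dir.strip_prefix(working_dir) {
            Ok(rel) => Ok(rel.to_path_buf()),
            Err(_) => Err(format!(
                "{} is not inside {}",
                dir.display(),
                working_dir.display()
            )),
        })
        .collect()
}
```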
55 changes: 52 additions & 3 deletions src/stages/analyze/cmd.rs
@@ -1,7 +1,7 @@
use crate::hash::{GeneralHash, GeneralHashType};
use crate::pool::ThreadPool;
use crate::stages::analyze::intermediary_analysis_data::AnalysisFile;
use crate::stages::analyze::output::DupSetEntryRef;
use crate::stages::analyze::output::{DupSetEntryRef, DupSetFile, DupSetFileVersion};
use crate::stages::analyze::worker::AnalysisIntermediaryFile;
use crate::stages::analyze::worker::{
worker_run, AnalysisJob, AnalysisResult, AnalysisWorkerArgument,
@@ -11,7 +11,7 @@ use anyhow::{anyhow, Result};
use log::{error, info, trace};
use std::collections::HashMap;
use std::fs;
use std::io::Write;
use std::io::{Read, Write};
use std::ops::Deref;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
@@ -63,7 +63,7 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> {
}
};

let output_file = match output_file_options.open(analysis_settings.output) {
let output_file = match output_file_options.open(&analysis_settings.output) {
Ok(file) => file,
Err(err) => {
return Err(anyhow!("Failed to open output file: {}", err));
@@ -185,6 +185,55 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> {
"There are {} GB of duplicated files",
duplicated_bytes / 1024 / 1024 / 1024
);

drop(output_buf_writer);

let output_file_reader = match input_file_options.open(&analysis_settings.output) {
Ok(file) => file,
Err(err) => {
return Err(anyhow!("Failed to open output file readable: {}", err));
}
};

let mut output_buf_reader = std::io::BufReader::new(&output_file_reader);
let mut text = String::new();

if let Err(err) = output_buf_reader.read_to_string(&mut text) {
return Err(anyhow!("Failed to read output file: {}", err));
}

drop(output_buf_reader);

let mut result = DupSetFile {
version: DupSetFileVersion::V1,
entries: Vec::new(),
};
for line in text.lines() {
let entry = match serde_json::from_str(&line) {
Ok(entry) => entry,
Err(err) => {
error!("Failed to parse line in output: {}", err);
continue;
}
};
result.entries.push(entry);
}

let output_file = match output_file_options.open(&analysis_settings.output) {
Ok(file) => file,
Err(err) => {
return Err(anyhow!("Failed to open output file writable: {}", err));
}
};
let mut output_buf_writer = std::io::BufWriter::new(&output_file);

if let Err(err) = serde_json::to_writer(&mut output_buf_writer, &result) {
return Err(anyhow!("Failed to write output file: {}", err));
}

if let Err(err) = output_buf_writer.flush() {
return Err(anyhow!("Failed to flush output file: {}", err));
}

Ok(())
}
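The block added to the analyze command above rewrites the output in place: the per-line JSON entries produced by the workers are read back, parsed, wrapped in a versioned `DupSetFile`, and written out again as one JSON document. A compact sketch of that newline-delimited-JSON-to-single-document step, using simplified stand-in types (`Entry`, `VersionedFile`) rather than the crate's `DupSetEntry`/`DupSetFile`:

```rust
use serde::{Deserialize, Serialize};

// Simplified stand-ins; requires serde (with the derive feature) and serde_json.
#[derive(Serialize, Deserialize)]
struct Entry {
    hash: String,
    conflicting: Vec<String>,
}

#[derive(Serialize, Deserialize)]
struct VersionedFile {
    version: u32,
    entries: Vec<Entry>,
}

/// Parse newline-delimited JSON entries and re-serialize them as one
/// versioned document, skipping blank lines.
fn consolidate(jsonl: &str) -> serde_json::Result<String> {
    let entries = jsonl
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(|line| serde_json::from_str::<Entry>(line))
        .collect::<serde_json::Result<Vec<Entry>>>()?;
    serde_json::to_string(&VersionedFile { version: 1, entries })
}
```

Unlike this sketch, the committed code logs and skips lines that fail to parse instead of aborting.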
70 changes: 69 additions & 1 deletion src/stages/analyze/output/dupset_file.rs
@@ -1,7 +1,7 @@
use crate::hash::GeneralHash;
use crate::path::FilePath;
use crate::stages::build::output::HashTreeFileEntryType;
use serde::Serialize;
use serde::{Deserialize, Serialize};

/// The result of the analysis worker. A duplicate set entry.
///
@@ -21,3 +21,71 @@ pub struct DupSetEntryRef<'a, 'b, 'c> {
/// The conflicting files.
pub conflicting: Vec<&'c FilePath>,
}

/// The result of the analysis worker. A duplicate set entry.
///
/// # Fields
/// * `ftype` - The type of the file.
/// * `size` - The size of the file.
/// * `hash` - The hash of the file content.
/// * `conflicting` - The conflicting files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DupSetEntry {
/// The type of the file.
pub ftype: HashTreeFileEntryType,
/// The size of the file.
pub size: u64,
/// The hash of the file content.
pub hash: GeneralHash,
/// The conflicting files.
pub conflicting: Vec<FilePath>,
}

impl From<&DupSetEntryRef<'_, '_, '_>> for DupSetEntry {
fn from(entry: &DupSetEntryRef) -> Self {
DupSetEntry {
ftype: *entry.ftype,
size: entry.size,
hash: entry.hash.clone(),
conflicting: entry.conflicting.clone().into_iter().cloned().collect::<Vec<FilePath>>(),
}
}
}

/// Deduplication set file version.
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
pub enum DupSetFileVersion {
/// Version 1 of the file format.
V1,
}

/// Deduplication set file.
///
/// # Fields
/// * `version` - The version of the file format.
/// * `entries` - The deduplication set entries.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DupSetFile {
pub version: DupSetFileVersion,
pub entries: Vec<DupSetEntry>,
}

/// Deduplication set file. (Reference version)
///
/// # Fields
/// * `version` - The version of the file format.
/// * `entries` - The deduplication set entries.
#[derive(Debug, Serialize)]
pub struct DupSetFileRef<'a, 'b, 'c> {
pub version: DupSetFileVersion,
pub entries: Vec<DupSetEntryRef<'a, 'b, 'c>>,
}

impl From<&DupSetFileRef<'_, '_, '_>> for DupSetFile {
fn from(value: &DupSetFileRef<'_, '_, '_>) -> Self {
DupSetFile {
version: value.version,
entries: value.entries.iter().map(DupSetEntry::from).collect::<Vec<DupSetEntry>>(),
}
}
}
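
The new types follow a borrowed/owned pairing: `DupSetEntryRef`/`DupSetFileRef` hold references so the analysis results can be serialized without cloning while they are still in memory, and `DupSetEntry`/`DupSetFile` own their data so the file can also be deserialized when it is read back. A generic sketch of that pattern with hypothetical `RecordRef`/`Record` types:

```rust
use serde::{Deserialize, Serialize};

/// Borrowed view used while writing; serialization does not need owned data.
#[derive(Serialize)]
struct RecordRef<'a> {
    name: &'a str,
    size: u64,
}

/// Owned twin used when reading the data back from disk.
#[derive(Serialize, Deserialize)]
struct Record {
    name: String,
    size: u64,
}

impl From<&RecordRef<'_>> for Record {
    fn from(r: &RecordRef<'_>) -> Self {
        Record {
            name: r.name.to_owned(),
            size: r.size,
        }
    }
}
```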