feat: started with dedup actions
0xCCF4 committed Nov 30, 2024
1 parent 9d4a3f9 commit 00be903
Showing 10 changed files with 401 additions and 44 deletions.
10 changes: 8 additions & 2 deletions src/data/hash.rs
@@ -6,7 +6,7 @@ use const_format::concatcp;
use serde::de::Error;
use serde::{Deserialize, Serialize, Serializer};
use std::fmt;
use std::fmt::Display;
use std::fmt::{Debug, Display};
use std::io::Read;
use std::path::Path;
use std::str::FromStr;
@@ -237,7 +237,7 @@ impl Display for GeneralHashType {
/// * [GeneralHashType] - representation of the different types of hash functions.
/// * [GeneralHasher] - trait for computing hash values.
///
#[derive(Debug, Hash, PartialEq, Eq, Clone, PartialOrd)]
#[derive(Hash, PartialEq, Eq, Clone, PartialOrd)]
pub enum GeneralHash {
#[cfg(feature = "hash-sha2")]
/// A SHA512 hash value.
@@ -318,6 +318,12 @@ impl Display for GeneralHash {
}
}

impl Debug for GeneralHash {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self)
}
}

impl Serialize for GeneralHash {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
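The `Debug` impl added above simply forwards to `Display`, so `{:?}` formatting of a `GeneralHash` now yields the same compact hex digest as `{}` instead of a derived enum dump. A minimal, self-contained sketch of that forwarding pattern, using a hypothetical `HexId` type rather than the crate's own API:

```rust
use std::fmt;

/// Hypothetical wrapper type, used only to illustrate the pattern.
struct HexId(Vec<u8>);

impl fmt::Display for HexId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Render the bytes as lowercase hex.
        for byte in &self.0 {
            write!(f, "{:02x}", byte)?;
        }
        Ok(())
    }
}

/// Forward Debug to Display, the same trick the commit applies to GeneralHash.
impl fmt::Debug for HexId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self)
    }
}

fn main() {
    let id = HexId(vec![0xde, 0xad, 0xbe, 0xef]);
    assert_eq!(format!("{:?}", id), "deadbeef");
}
```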
2 changes: 2 additions & 0 deletions src/lib.rs
@@ -18,6 +18,8 @@ pub mod stages {
pub mod build;
/// Contains the implementation of the clean command.
pub mod clean;
/// Contains the implementation of the dedup command.
pub mod dedup;
}

mod data {
106 changes: 95 additions & 11 deletions src/main.rs
@@ -2,12 +2,14 @@ use backup_deduplicator::hash::GeneralHashType;
use backup_deduplicator::stages::analyze::cmd::AnalysisSettings;
use backup_deduplicator::stages::build::cmd::BuildSettings;
use backup_deduplicator::stages::clean::cmd::CleanSettings;
use backup_deduplicator::stages::{analyze, build, clean};
use backup_deduplicator::stages::{analyze, build, clean, dedup};
use backup_deduplicator::utils;
use clap::{arg, Parser, Subcommand};
use log::{debug, info, trace, LevelFilter};
use std::env;
use std::path::PathBuf;
use std::str::FromStr;
use backup_deduplicator::stages::dedup::golden_model::cmd::DedupGoldenModelSettings;

/// A simple command line tool to deduplicate backups.
#[derive(Parser, Debug)]
@@ -35,7 +37,7 @@ enum Command {
Build {
/// The directory to analyze
#[arg()]
directory: String,
directory: Vec<String>,
/// Traverse into archives
#[arg(short, long)]
archives: bool,
@@ -89,12 +91,46 @@ enum Command {
#[arg(short, long, default_value = "hash_tree.bdd")]
input: String,
/// Output file for the analysis result
#[arg(short, long, default_value = "analysis.json")]
#[arg(short, long, default_value = "analysis.bda")]
output: String,
/// Overwrite the output file
#[arg(long = "overwrite", default_value = "false")]
overwrite: bool,
},
/// Compile a list of actions to deduplicate the file tree
Dedup {
/// The input analysis file to generate actions for.
#[arg(short, long, default_value = "analysis.bda")]
input: String,
/// The output actions file to write the actions to.
#[arg(short, long, default_value = "actions.bdc")]
output: String,
/// Overwrite the output file if it already exists
#[arg(long = "overwrite", default_value = "false")]
overwrite: bool,
/// Deduplication mode and settings
#[command(subcommand)]
mode: DedupMode,
}
}

#[derive(Subcommand, Debug)]
enum DedupMode {
/// In golden model mode, a directory is declared that serves as the reference model.
/// Files from within the reference model are not altered. A list of other directories
/// can be given; from within those directories all files that have a duplicate in the reference model
/// are marked for deletion.
///
/// This mode is useful when you have multiple backups of the same data and want to quickly
/// remove files from older backups that are also present in the newer one.
GoldenModel {
/// The reference model directory
#[arg(short, long)]
reference_model: String,
/// The directories to delete files from.
#[arg(short, long)]
directories: Vec<String>,
}
}
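
A rough sketch of the golden-model logic described in the doc comment above: collect the content hashes present in the reference model, then flag every file in the other directories whose hash also occurs in that set. All names below (`FileRecord`, `mark_deletions`, the `String` hash) are illustrative stand-ins, not the crate's actual types:

```rust
use std::collections::HashSet;
use std::path::PathBuf;

/// Illustrative stand-in; the real crate uses GeneralHash and FilePath.
struct FileRecord {
    path: PathBuf,
    hash: String,
}

/// Files inside the reference model are never touched; files elsewhere that
/// duplicate a reference-model hash are returned as deletion candidates.
fn mark_deletions(reference: &[FileRecord], others: &[FileRecord]) -> Vec<PathBuf> {
    let golden: HashSet<&String> = reference.iter().map(|f| &f.hash).collect();
    others
        .iter()
        .filter(|f| golden.contains(&f.hash))
        .map(|f| f.path.clone())
        .collect()
}
```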

fn main() {
@@ -155,10 +191,10 @@ fn main() {

// Convert to paths and check if they exist

let directory = utils::main::parse_path(
let directory = directory.into_iter().map(|directory| utils::main::parse_path(
directory.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
);
)).collect::<Vec<PathBuf>>();
let output = utils::main::parse_path(
output.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
@@ -167,8 +203,8 @@
utils::main::parse_path(w.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting)
});

if !directory.exists() {
eprintln!("Target directory does not exist: {}", directory.display());
if let Some(dir) = directory.iter().find(|dir| !dir.exists()) {
eprintln!("Target directory does not exist: {}", dir.display());
std::process::exit(exitcode::CONFIG);
}

@@ -198,12 +234,12 @@

// Convert paths to relative path to working directory

let directory = directory.strip_prefix(&working_directory).unwrap_or_else(|_| {
let directory = directory.into_iter().map(|dir| dir.strip_prefix(&working_directory).unwrap_or_else(|_| {
eprintln!("IO error, could not resolve target directory relative to working directory");
std::process::exit(exitcode::CONFIG);
});
}).to_path_buf()).collect::<Vec<PathBuf>>();

info!("Target directory: {:?}", directory);
info!("Target directories: {:?}", directory);
// info!("Archives: {:?}", archives);
info!("Follow symlinks: {:?}", follow_symlinks);
info!("Output: {:?}", output);
@@ -213,7 +249,7 @@
// Run the command

match build::cmd::run(BuildSettings {
directory: directory.to_path_buf(),
directory,
into_archives: archives,
follow_symlinks,
output: output.clone(),
@@ -345,5 +381,53 @@
}
}
}
Command::Dedup {
mode,
input,
output,
overwrite,
} => {
let input = utils::main::parse_path(
input.as_str(),
utils::main::ParsePathKind::AbsoluteExisting,
);
let output = utils::main::parse_path(
output.as_str(),
utils::main::ParsePathKind::AbsoluteNonExisting,
);

if !input.exists() {
eprintln!("Input file does not exist: {:?}", input);
std::process::exit(exitcode::CONFIG);
}

if output.exists() && !overwrite {
eprintln!(
"Output file already exists: {:?}. Set --overwrite to override its content",
output
);
std::process::exit(exitcode::CONFIG);
}

match mode {
DedupMode::GoldenModel { reference_model, directories } => {
match dedup::golden_model::cmd::run(DedupGoldenModelSettings {
input,
output,
reference_model,
directories,
}) {
Ok(_) => {
info!("Dedup command completed successfully");
std::process::exit(exitcode::OK);
}
Err(e) => {
eprintln!("Error: {:?}", e);
std::process::exit(exitcode::SOFTWARE);
}
}
}
}
}
}
}
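
The build command now accepts several target directories; each one is parsed, checked for existence, and then resolved relative to the working directory. A simplified sketch of that per-directory normalization (the real code uses `utils::main::parse_path` and exits via `exitcode` on failure; `relativize_all` here is a hypothetical helper):

```rust
use std::path::{Path, PathBuf};

/// Strip the working directory prefix from every target directory,
/// failing with a message if one of them lies outside the working directory.
fn relativize_all(dirs: Vec<PathBuf>, working_dir: &Path) -> Result<Vec<PathBuf>, String> {
    dirs.into_iter()
        .map(|dir| match dir.strip_prefix(working_dir) {
            Ok(rel) => Ok(rel.to_path_buf()),
            Err(_) => Err(format!(
                "{} is not inside {}",
                dir.display(),
                working_dir.display()
            )),
        })
        .collect()
}
```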
55 changes: 52 additions & 3 deletions src/stages/analyze/cmd.rs
@@ -1,7 +1,7 @@
use crate::hash::{GeneralHash, GeneralHashType};
use crate::pool::ThreadPool;
use crate::stages::analyze::intermediary_analysis_data::AnalysisFile;
use crate::stages::analyze::output::DupSetEntryRef;
use crate::stages::analyze::output::{DupSetEntryRef, DupSetFile, DupSetFileVersion};
use crate::stages::analyze::worker::AnalysisIntermediaryFile;
use crate::stages::analyze::worker::{
worker_run, AnalysisJob, AnalysisResult, AnalysisWorkerArgument,
@@ -11,7 +11,7 @@ use anyhow::{anyhow, Result};
use log::{error, info, trace};
use std::collections::HashMap;
use std::fs;
use std::io::Write;
use std::io::{Read, Write};
use std::ops::Deref;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
@@ -63,7 +63,7 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> {
}
};

let output_file = match output_file_options.open(analysis_settings.output) {
let output_file = match output_file_options.open(&analysis_settings.output) {
Ok(file) => file,
Err(err) => {
return Err(anyhow!("Failed to open output file: {}", err));
@@ -185,6 +185,55 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> {
"There are {} GB of duplicated files",
duplicated_bytes / 1024 / 1024 / 1024
);

drop(output_buf_writer);

let output_file_reader = match input_file_options.open(&analysis_settings.output) {
Ok(file) => file,
Err(err) => {
return Err(anyhow!("Failed to open output file readable: {}", err));
}
};

let mut output_buf_reader = std::io::BufReader::new(&output_file_reader);
let mut text = String::new();

if let Err(err) = output_buf_reader.read_to_string(&mut text) {
return Err(anyhow!("Failed to read output file: {}", err));
}

drop(output_buf_reader);

let mut result = DupSetFile {
version: DupSetFileVersion::V1,
entries: Vec::new(),
};
for line in text.lines() {
let entry = match serde_json::from_str(&line) {
Ok(entry) => entry,
Err(err) => {
error!("Failed to parse line in output: {}", err);
continue;
}
};
result.entries.push(entry);
}

let output_file = match output_file_options.open(&analysis_settings.output) {
Ok(file) => file,
Err(err) => {
return Err(anyhow!("Failed to open output file writable: {}", err));
}
};
let mut output_buf_writer = std::io::BufWriter::new(&output_file);

if let Err(err) = serde_json::to_writer(&mut output_buf_writer, &result) {
return Err(anyhow!("Failed to write output file: {}", err));
}

if let Err(err) = output_buf_writer.flush() {
return Err(anyhow!("Failed to flush output file: {}", err));
}

Ok(())
}
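The block added to the analyze command above rewrites the output in place: the per-line JSON entries produced by the workers are read back, parsed, wrapped in a versioned `DupSetFile`, and written out again as one JSON document. A compact sketch of that newline-delimited-JSON-to-single-document step, using simplified stand-in types (`Entry`, `VersionedFile`) rather than the crate's `DupSetEntry`/`DupSetFile`:

```rust
use serde::{Deserialize, Serialize};

// Simplified stand-ins; requires serde (with the derive feature) and serde_json.
#[derive(Serialize, Deserialize)]
struct Entry {
    hash: String,
    conflicting: Vec<String>,
}

#[derive(Serialize, Deserialize)]
struct VersionedFile {
    version: u32,
    entries: Vec<Entry>,
}

/// Parse newline-delimited JSON entries and re-serialize them as one
/// versioned document, skipping blank lines.
fn consolidate(jsonl: &str) -> serde_json::Result<String> {
    let entries = jsonl
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(|line| serde_json::from_str::<Entry>(line))
        .collect::<serde_json::Result<Vec<Entry>>>()?;
    serde_json::to_string(&VersionedFile { version: 1, entries })
}
```

Unlike this sketch, the committed code logs and skips lines that fail to parse instead of aborting.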
70 changes: 69 additions & 1 deletion src/stages/analyze/output/dupset_file.rs
@@ -1,7 +1,7 @@
use crate::hash::GeneralHash;
use crate::path::FilePath;
use crate::stages::build::output::HashTreeFileEntryType;
use serde::Serialize;
use serde::{Deserialize, Serialize};

/// The result of the analysis worker. A duplicate set entry.
///
@@ -21,3 +21,71 @@ pub struct DupSetEntryRef<'a, 'b, 'c> {
/// The conflicting files.
pub conflicting: Vec<&'c FilePath>,
}

/// The result of the analysis worker. A duplicate set entry.
///
/// # Fields
/// * `ftype` - The type of the file.
/// * `size` - The size of the file.
/// * `hash` - The hash of the file content.
/// * `conflicting` - The conflicting files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DupSetEntry {
/// The type of the file.
pub ftype: HashTreeFileEntryType,
/// The size of the file.
pub size: u64,
/// The hash of the file content.
pub hash: GeneralHash,
/// The conflicting files.
pub conflicting: Vec<FilePath>,
}

impl From<&DupSetEntryRef<'_, '_, '_>> for DupSetEntry {
fn from(entry: &DupSetEntryRef) -> Self {
DupSetEntry {
ftype: *entry.ftype,
size: entry.size,
hash: entry.hash.clone(),
conflicting: entry.conflicting.clone().into_iter().cloned().collect::<Vec<FilePath>>(),
}
}
}

/// Deduplication set file version.
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
pub enum DupSetFileVersion {
/// Version 1 of the file format.
V1,
}

/// Deduplication set file.
///
/// # Fields
/// * `version` - The version of the file format.
/// * `entries` - The deduplication set entries.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DupSetFile {
pub version: DupSetFileVersion,
pub entries: Vec<DupSetEntry>,
}

/// Deduplication set file. (Reference version)
///
/// # Fields
/// * `version` - The version of the file format.
/// * `entries` - The deduplication set entries.
#[derive(Debug, Serialize)]
pub struct DupSetFileRef<'a, 'b, 'c> {
pub version: DupSetFileVersion,
pub entries: Vec<DupSetEntryRef<'a, 'b, 'c>>,
}

impl From<&DupSetFileRef<'_, '_, '_>> for DupSetFile {
fn from(value: &DupSetFileRef<'_, '_, '_>) -> Self {
DupSetFile {
version: value.version,
entries: value.entries.iter().map(DupSetEntry::from).collect::<Vec<DupSetEntry>>(),
}
}
}
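
The new types follow a borrowed/owned pairing: `DupSetEntryRef`/`DupSetFileRef` hold references so the analysis results can be serialized without cloning while they are still in memory, and `DupSetEntry`/`DupSetFile` own their data so the file can also be deserialized when it is read back. A generic sketch of that pattern with hypothetical `RecordRef`/`Record` types:

```rust
use serde::{Deserialize, Serialize};

/// Borrowed view used while writing; serialization does not need owned data.
#[derive(Serialize)]
struct RecordRef<'a> {
    name: &'a str,
    size: u64,
}

/// Owned twin used when reading the data back from disk.
#[derive(Serialize, Deserialize)]
struct Record {
    name: String,
    size: u64,
}

impl From<&RecordRef<'_>> for Record {
    fn from(r: &RecordRef<'_>) -> Self {
        Record {
            name: r.name.to_owned(),
            size: r.size,
        }
    }
}
```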