diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..f1b763a --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,5 @@ +# Default owner +* @0xCCF4 + +# Github workflows +/.github/ @0xCCF4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 21c8124..8622ffa 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,14 +14,13 @@ jobs: archive: zip - target: x86_64-unknown-linux-musl archive: tar.gz - - target: x86_64-apple-darwin - archive: zip steps: - uses: actions/checkout@master - uses: actions-rust-lang/setup-rust-toolchain@v1.8.0 with: toolchain: "stable" + target: ${{ matrix.target }} - name: Build run: cargo build --verbose diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7618268..0583e6b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,7 +11,10 @@ env: jobs: test: - + strategy: + matrix: + features: ["", "hash-sha1,hash-sha2,hash-xxh"] + runs-on: ubuntu-latest steps: @@ -21,5 +24,9 @@ jobs: with: toolchain: "stable" - - name: Build with stable toolchain - run: cargo build --verbose + - name: Build feature set ${{ matrix.features }} + run: cargo build --no-default-features --features "${{ matrix.features }}" + + - name: Test feature set ${{ matrix.features }} + run: cargo test --no-default-features --features "${{ matrix.features }}" + diff --git a/Cargo.lock b/Cargo.lock index b802725..8b53470 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] @@ -61,16 +61,17 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.81" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" +checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" [[package]] name = "backup-deduplicator" -version = "0.1.0" +version = "0.3.0" dependencies = [ "anyhow", "clap", + "const_format", "env_logger", "exitcode", "file-id", @@ -81,7 +82,6 @@ dependencies = [ "serde_json", "sha1", "sha2", - "sysinfo", "xxhash-rust", ] @@ -168,45 +168,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7e3352a27098ba6b09546e5f13b15165e6a88b5c2723afecb3ea9576b27e3ea" [[package]] -name = "core-foundation-sys" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" - -[[package]] -name = "cpufeatures" -version = "0.2.12" +name = "const_format" +version = "0.2.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "e3a214c7af3d04997541b18d432afaff4c455e79e2029079647e72fc2bd27673" dependencies = [ - "libc", + "const_format_proc_macros", ] [[package]] -name = "crossbeam-deque" -version = "0.8.5" +name = "const_format_proc_macros" +version = "0.2.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "c7f6ff08fd20f4f299298a28e2dfa8a8ba1036e6cd2460ac1de7b425d76f2500" dependencies 
= [ - "crossbeam-epoch", - "crossbeam-utils", + "proc-macro2", + "quote", + "unicode-xid", ] [[package]] -name = "crossbeam-epoch" -version = "0.9.18" +name = "cpufeatures" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ - "crossbeam-utils", + "libc", ] -[[package]] -name = "crossbeam-utils" -version = "0.8.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" - [[package]] name = "crypto-common" version = "0.1.6" @@ -249,12 +238,6 @@ dependencies = [ "crypto-common 0.2.0-pre.5", ] -[[package]] -name = "either" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" - [[package]] name = "env_filter" version = "0.1.0" @@ -317,9 +300,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" dependencies = [ "cfg-if", "libc", @@ -346,18 +329,18 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hybrid-array" -version = "0.2.0-rc.7" +version = "0.2.0-rc.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87c2311a0adecbffff284aabcf1249b1485193b16e685f9ef171b1ba82979cff" +checksum = "53668f5da5a41d9eaf4bf7064be46d1ebe6a4e1ceed817f387587b18f2b51047" dependencies = [ "typenum", ] [[package]] name = "itoa" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "libc" @@ -373,18 +356,9 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "ntapi" -version = "0.4.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" -dependencies = [ - "winapi", -] +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "num_cpus" @@ -396,12 +370,6 @@ dependencies = [ "libc", ] -[[package]] -name = "once_cell" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" - [[package]] name = "proc-macro2" version = "1.0.79" @@ -413,9 +381,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -429,26 +397,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rayon" -version 
= "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -460,9 +408,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.3" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", @@ -483,9 +431,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "ryu" @@ -548,36 +496,21 @@ dependencies = [ [[package]] name = "strsim" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] -[[package]] -name = "sysinfo" -version = "0.30.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c385888ef380a852a16209afc8cfad22795dd8873d69c9a14d2e2088f118d18" -dependencies = [ - "cfg-if", - "core-foundation-sys", - "libc", - "ntapi", - "once_cell", - "rayon", - "windows", -] - [[package]] name = "typenum" version = "1.17.0" @@ -590,6 +523,12 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + [[package]] name = "utf8parse" version = "0.2.1" @@ -608,47 +547,6 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" -dependencies = [ - "windows-core", - "windows-targets 0.52.4", -] - -[[package]] -name = "windows-core" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" -dependencies = [ - "windows-targets 0.52.4", -] - [[package]] name = "windows-sys" version = "0.48.0" diff --git a/Cargo.toml b/Cargo.toml index 30f4375..452768a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "backup-deduplicator" -version = "0.1.0" +version = "0.3.0" edition = "2021" description = """ A tool to deduplicate backups. It builds a hash tree of all files and folders @@ -11,21 +11,26 @@ keywords = ["archive-management", "file", "deduplication", "cleanup"] license = "GPL-3.0-or-later" homepage = "https://github.com/0xCCF4/BackupDeduplicator" repository = "https://github.com/0xCCF4/BackupDeduplicator" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +documentation = "https://docs.rs/backup-deduplicator" [dependencies] -anyhow = "1.0.80" +anyhow = "1.0.82" clap = { version = "4.5.4", features = ["derive"] } env_logger = "0.11.2" log = "0.4.20" filetime = "0.2.23" exitcode = "1.1.2" -sha2 = "0.10.8" serde = { version = "1.0.197", features = ["derive", "rc"] } serde_json = "1.0.115" -file-id = "0.2.1" -sysinfo = "0.30.5" num_cpus = "1.16.0" -xxhash-rust = { version = "0.8.10", features = ["xxh32", "xxh64"] } -sha1 = "0.11.0-pre.3" +file-id = "0.2.1" +xxhash-rust = { version = "0.8.10", features = ["xxh32", "xxh64"], optional = true } +sha1 = { version = "0.11.0-pre.3", optional = true } +sha2 = { version = "0.10.8", optional = true } +const_format = "0.2.32" + +[features] +hash-sha1 = ["dep:sha1"] +hash-sha2 = ["dep:sha2"] +hash-xxh = ["dep:xxhash-rust"] +default = ["hash-sha1", "hash-sha2", "hash-xxh"] diff --git a/README.md b/README.md index 5e1b24f..3f68892 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,12 @@ The tool is a command line tool. There are two stages: `build` and `analyze`. ### Build Exemplary usage to build a hash tree of a directory: ```bash -backup-deduplicator --threads 16 build -w /parent -o /parent/hash.bdd /parent/target +backup-deduplicator + --threads 16 + build + --working-directory /parent + --output /parent/hash.bdd + /parent/target ``` This will build a hash tree of the directory `/path/to/parent/target` and save it to `hash.bdd` in the parent directory. The tool will use 16 threads to split the hash @@ -62,7 +67,10 @@ calculation work. ### Analyze Exemplary usage to analyze a hash tree: ```bash -backup-deduplicator analyze -o /parent/analysis.bdd /parent/hash.bdd +backup-deduplicator + analyze + --output /parent/analysis.bdd + /parent/hash.bdd ``` This will analyze the hash tree in `hash.bdd` and save the analysis result to `analysis.bdd`. The analysis file will then contain a list of JSON objects (one per line), @@ -76,8 +84,18 @@ The tool is written in Rust, and can be installed using `cargo`: cargo install backup-deduplicator ``` +Precompiled binaries are available for download on the release page +. 
+ +## Features Flags +The tool uses the rust features flags to enable or disable certain features. +The following flags are available: +* `hash-sha1`: Use the [sha1](https://crates.io/crates/sha1) module to enable SHA1 hash function +* `hash-sha2`: Use the [sha2](https://crates.io/crates/sha2) module to enable SHA512, SHA256 hash functions +* `hash-xxh`: Use the [xxhash-rust](https://crates.io/crates/xxhash-rust) module to enable XXH3 (32/64) hash functions + ## Contribution -Contributions to PhotoSort are welcome! If you have a feature request, +Contributions to the project are welcome! If you have a feature request, bug report, or want to contribute to the code, please open an issue or a pull request. diff --git a/src/cmd/analyze/analysis.rs b/src/cmd/analyze/analysis.rs deleted file mode 100644 index cb386af..0000000 --- a/src/cmd/analyze/analysis.rs +++ /dev/null @@ -1,69 +0,0 @@ -use std::sync::Weak; -use std::sync::{Arc, Mutex}; -use serde::{Deserialize, Serialize}; -use crate::data::{FilePath, GeneralHash, SaveFileEntryType}; - -#[derive(Debug, Serialize, Deserialize)] -pub enum AnalysisFile { - File(FileInformation), - Directory(DirectoryInformation), - Symlink(SymlinkInformation), - Other(OtherInformation), -} - -impl AnalysisFile { - pub fn parent(&self) -> &Mutex>> { - match self { - AnalysisFile::File(info) => &info.parent, - AnalysisFile::Directory(info) => &info.parent, - AnalysisFile::Symlink(info) => &info.parent, - AnalysisFile::Other(info) => &info.parent, - } - } - - pub fn path(&self) -> &FilePath { - match self { - AnalysisFile::File(info) => &info.path, - AnalysisFile::Directory(info) => &info.path, - AnalysisFile::Symlink(info) => &info.path, - AnalysisFile::Other(info) => &info.path, - } - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct FileInformation { - pub path: FilePath, - pub content_hash: GeneralHash, - pub parent: Mutex>>, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct DirectoryInformation { - pub path: FilePath, - pub content_hash: GeneralHash, - pub children: Mutex>>, - pub parent: Mutex>>, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct SymlinkInformation { - pub path: FilePath, - pub content_hash: GeneralHash, - pub parent: Mutex>>, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct OtherInformation { - pub path: FilePath, - pub parent: Mutex>>, -} - - -#[derive(Debug, Serialize)] -pub struct ResultEntryRef<'a, 'b, 'c> { - pub ftype: &'a SaveFileEntryType, - pub size: u64, - pub hash: &'b GeneralHash, - pub conflicting: Vec<&'c FilePath>, -} diff --git a/src/cmd/build/worker/other.rs b/src/cmd/build/worker/other.rs deleted file mode 100644 index a6e8d32..0000000 --- a/src/cmd/build/worker/other.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::path::PathBuf; -use std::sync::mpsc::Sender; -use log::trace; -use crate::build::JobResult; -use crate::build::worker::{worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; -use crate::data::{File, Job, OtherInformation, SaveFileEntryType}; - -pub fn worker_run_other(path: PathBuf, modified: u64, size: u64, id: usize, job: Job, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { - trace!("[{}] analyzing other {} > {:?}", id, &job.target_path, path); - - match worker_fetch_savedata(arg, &job.target_path) { - Some(found) => { - if found.file_type == SaveFileEntryType::Other && found.modified == modified && found.size == size { - trace!("Other {:?} is already in save file", path); - worker_publish_result_or_trigger_parent(id, 
true, File::Other(OtherInformation { - path: job.target_path.clone(), - content_size: size, - modified, - }), job, result_publish, job_publish, arg); - return; - } - } - None => {} - } - - let file = File::Other(OtherInformation { - path: job.target_path.clone(), - content_size: size, - modified, - }); - - worker_publish_result_or_trigger_parent(id, false, file, job, result_publish, job_publish, arg); -} \ No newline at end of file diff --git a/src/data/file.rs b/src/data/file.rs deleted file mode 100644 index ac22b58..0000000 --- a/src/data/file.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::path::{PathBuf}; -use serde::{Deserialize, Serialize}; -use crate::data::{FilePath, GeneralHash}; - -// type ResolveNodeFn = fn(&HandleIdentifier) -> Result>>; -// type PathInScopeFn = fn(&Path) -> bool; - - - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FileInformation { - pub path: FilePath, - pub modified: u64, - pub content_hash: GeneralHash, - pub content_size: u64, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DirectoryInformation { - pub path: FilePath, - pub modified: u64, - pub content_hash: GeneralHash, - pub number_of_children: u64, - pub children: Vec, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SymlinkInformation { - pub path: FilePath, - pub modified: u64, - pub content_hash: GeneralHash, // equal to the target file's hash or if not following symlinks, the symlink's path hashed - pub target: PathBuf, - pub content_size: u64, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct OtherInformation { - pub path: FilePath, - pub modified: u64, - pub content_size: u64, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StubInformation { - pub path: FilePath, - pub content_hash: GeneralHash, -} - - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum File { - File(FileInformation), - Directory(DirectoryInformation), - Symlink(SymlinkInformation), - Other(OtherInformation), // for unsupported file types like block devices, character devices, etc., or files without permission - Stub(StubInformation), // for files that are already analyzed -} - -// ---- IMPLEMENTATION ---- - -impl File { - pub fn get_content_hash(&self) -> &GeneralHash { - match self { - File::File(info) => &info.content_hash, - File::Directory(info) => &info.content_hash, - File::Symlink(info) => &info.content_hash, - File::Other(_) => &GeneralHash::NULL, - File::Stub(info) => &info.content_hash, - } - } - - pub fn get_path(&self) -> &FilePath { - match self { - File::File(info) => &info.path, - File::Directory(info) => &info.path, - File::Symlink(info) => &info.path, - File::Other(info) => &info.path, - File::Stub(info) => &info.path, - } - } - - pub fn is_directory(&self) -> bool { - match self { - File::Directory(_) => true, - _ => false, - } - } - - pub fn is_symlink(&self) -> bool { - match self { - File::Symlink(_) => true, - _ => false, - } - } - - pub fn is_file(&self) -> bool { - match self { - File::File(_) => true, - _ => false, - } - } - - pub fn is_other(&self) -> bool { - match self { - File::Other(_) => true, - _ => false, - } - } -} diff --git a/src/data/fileid.rs b/src/data/fileid.rs index ea89264..4414e70 100644 --- a/src/data/fileid.rs +++ b/src/data/fileid.rs @@ -6,41 +6,63 @@ use std::path::Path; use file_id::FileId; use serde::Serialize; +/// Device id type. #[cfg(target_family = "unix")] type DeviceIdType = u64; +/// Device id type. 
#[cfg(target_family = "windows")] type DeviceIdType = u64; // high-res file-id -// file id - +/// File id type #[cfg(target_family = "unix")] type FileIdType = u64; +/// File id type #[cfg(target_family = "windows")] type FileIdType = u128; // high-res file-id -// structs - +/// A file id handle. +/// +/// # Fields +/// * `inode` - The inode of the file. +/// * `drive` - The device id of the file. #[derive(Debug, Clone, PartialEq, Serialize)] pub struct HandleIdentifier { pub inode: FileIdType, pub drive: DeviceIdType, } -pub fn from_path(path: impl AsRef) -> io::Result { - match file_id::get_file_id(path)? { - FileId::Inode { device_id, inode_number } => Ok(HandleIdentifier { - inode: inode_number as FileIdType, - drive: device_id as DeviceIdType, - }), - FileId::LowRes { volume_serial_number, file_index } => Ok(HandleIdentifier { - inode: file_index as FileIdType, - drive: volume_serial_number as DeviceIdType, - }), - FileId::HighRes { volume_serial_number, file_id } => Ok(HandleIdentifier { - inode: file_id as FileIdType, // path windows only -> no downcast will happen - drive: volume_serial_number as DeviceIdType, - }), +impl HandleIdentifier { + /// Create a new handle identifier from a path. + /// + /// # Arguments + /// * `path` - The path to the file. + /// + /// # Returns + /// The handle identifier. + /// + /// # Errors + /// If the file id cannot be retrieved. + pub fn from_path(path: impl AsRef) -> io::Result { + match file_id::get_file_id(path)? { + FileId::Inode { device_id, inode_number } => Ok(HandleIdentifier { + // unix + inode: inode_number as FileIdType, + drive: device_id as DeviceIdType, + }), + FileId::LowRes { volume_serial_number, file_index } => Ok(HandleIdentifier { + // windows + inode: file_index as FileIdType, + drive: volume_serial_number as DeviceIdType, + }), + FileId::HighRes { volume_serial_number, file_id } => Ok(HandleIdentifier { + // windows + inode: file_id as FileIdType, + drive: volume_serial_number as DeviceIdType, + }), + } } } + + diff --git a/src/data/hash.rs b/src/data/hash.rs index 62103e7..63fad7a 100644 --- a/src/data/hash.rs +++ b/src/data/hash.rs @@ -1,79 +1,251 @@ use std::fmt; use std::fmt::Display; +use std::path::Path; use std::str::FromStr; use serde::{Deserialize, Serialize, Serializer}; use serde::de::Error; +use const_format::concatcp; +use crate::stages::build::intermediary_build_data::BuildFile; +use crate::path::FilePath; +#[cfg(any(feature = "hash-sha2", feature = "hash-sha1", feature = "hash-xxh"))] use crate::utils; + +/// `GeneralHashType` is an enum that represents the different types of hash functions that can be used. +/// +/// The following hash functions are supported: SHA512, SHA256, SHA1, XXH64, XXH32, and NULL. +/// +/// The `hasher` method returns a new instance of a `GeneralHasher` trait object that corresponds to the hash type. +/// The `hasher` can then be used to compute a hash of that kind. +/// +/// # Traits +/// * `FromStr` - to allow parsing a string into a `GeneralHashType`. +/// * `Display` - to allow formatting a `GeneralHashType` into a string. 
+/// +/// # Examples +/// ``` +/// use std::str::FromStr; +/// use backup_deduplicator::hash::GeneralHashType; +/// +/// #[cfg(feature = "hash-sha2")] +/// { +/// let hash_type = GeneralHashType::from_str("SHA256").unwrap(); +/// let mut hasher = hash_type.hasher(); +/// hasher.update(b"Hello, world!".as_slice()); +/// +/// assert_eq!(hash_type.to_string(), "SHA256"); +/// +/// let hash = hasher.finalize(); +/// assert_eq!(hash.to_string(), "SHA256:315f5bdb76d078c43b8ac0064e4a0164612b1fce77c869345bfc94c75894edd3"); +/// assert_eq!(hash_type, GeneralHashType::SHA256); +/// } +/// +/// ``` +/// +/// # See also +/// * [GeneralHash] - representation of a hash value. +/// * [GeneralHasher] - trait for computing hash values. +/// +/// # Features +/// * `hash-sha2` - enables the SHA512 and SHA256 hash functions. +/// * `hash-sha1` - enables the SHA1 hash function. +/// * `hash-xxh` - enables the XXH64 and XXH32 hash functions. #[derive(Debug, Hash, PartialEq, Clone, Copy, Serialize, Deserialize)] pub enum GeneralHashType { + #[cfg(feature = "hash-sha2")] SHA512, + #[cfg(feature = "hash-sha2")] SHA256, + #[cfg(feature = "hash-sha1")] SHA1, + #[cfg(feature = "hash-xxh")] XXH64, + #[cfg(feature = "hash-xxh")] XXH32, NULL, } impl GeneralHashType { + /// Returns a new instance of a `GeneralHasher` trait object that corresponds to the hash type. + /// The `hasher` can then be used to compute a hash of that kind. + /// + /// # Returns + /// A new instance of a `GeneralHasher` trait object. + /// + /// # Examples + /// See the example in the `GeneralHashType` documentation. + /// + /// # Features + /// * `hash-sha2` - enables the SHA512 and SHA256 hash functions. + /// * `hash-sha1` - enables the SHA1 hash function. + /// * `hash-xxh` - enables the XXH64 and XXH32 hash functions. pub fn hasher(&self) -> Box { match self { + #[cfg(feature = "hash-sha2")] GeneralHashType::SHA512 => Box::new(sha2::Sha512Hasher::new()), + #[cfg(feature = "hash-sha2")] GeneralHashType::SHA256 => Box::new(sha2::Sha256Hasher::new()), + #[cfg(feature = "hash-sha1")] GeneralHashType::SHA1 => Box::new(sha1::Sha1Hasher::new()), + #[cfg(feature = "hash-xxh")] GeneralHashType::XXH64 => Box::new(xxh::Xxh64Hasher::new()), + #[cfg(feature = "hash-xxh")] GeneralHashType::XXH32 => Box::new(xxh::Xxh32Hasher::new()), GeneralHashType::NULL => Box::new(null::NullHasher::new()), } } } +impl GeneralHashType { + /// Returns the available hash types as a string. + /// + /// # Returns + /// The available hash types as a string. + /// + /// # Examples + /// ``` + /// use backup_deduplicator::hash::GeneralHashType; + /// + /// let supported = GeneralHashType::supported_algorithms(); + /// println!("Supported algorithms: {}", supported); + /// ``` + pub const fn supported_algorithms() -> &'static str { + const SHA2: &'static str = if cfg!(feature = "hash-sha2") { "SHA512, SHA256, " } else { "" }; + const SHA1: &'static str = if cfg!(feature = "hash-sha1") { "SHA1, " } else { "" }; + const XXH: &'static str = if cfg!(feature = "hash-xxh") { "XXH64, XXH32, " } else { "" }; + const NULL: &'static str = "NULL"; + + concatcp!(SHA2, SHA1, XXH, NULL) + } +} + impl FromStr for GeneralHashType { + /// Error type for parsing a `GeneralHashType` from a string. type Err = &'static str; + /// Parses a string into a `GeneralHashType`. + /// + /// # Arguments + /// * `s` - The string to parse. + /// + /// # Returns + /// The `GeneralHashType` that corresponds to the string or an error. 
+ /// + /// # Errors + /// Returns an error if the string does not correspond to a `GeneralHashType`. + /// Returns the available hash types in the error message. fn from_str(s: &str) -> Result { match s.to_uppercase().as_str() { + #[cfg(feature = "hash-sha2")] "SHA512" => Ok(GeneralHashType::SHA512), + #[cfg(feature = "hash-sha2")] "SHA256" => Ok(GeneralHashType::SHA256), + #[cfg(feature = "hash-sha1")] "SHA1" => Ok(GeneralHashType::SHA1), + #[cfg(feature = "hash-xxh")] "XXH64" => Ok(GeneralHashType::XXH64), + #[cfg(feature = "hash-xxh")] "XXH32" => Ok(GeneralHashType::XXH32), "NULL" => Ok(GeneralHashType::NULL), - _ => Err("SHA512, SHA256, SHA1, XXH64, XXH32, NULL"), + _ => Err(GeneralHashType::supported_algorithms()), } } } -impl fmt::Display for GeneralHashType { +impl Display for GeneralHashType { + /// Converts a `GeneralHashType` into a string. + /// + /// # Arguments + /// * `f` - The formatter to write to. + /// + /// # Returns + /// A result indicating whether the operation was successful. + /// + /// # Errors + /// Never fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + #[cfg(feature = "hash-sha2")] GeneralHashType::SHA512 => write!(f, "SHA512"), + #[cfg(feature = "hash-sha2")] GeneralHashType::SHA256 => write!(f, "SHA256"), + #[cfg(feature = "hash-sha1")] GeneralHashType::SHA1 => write!(f, "SHA1"), + #[cfg(feature = "hash-xxh")] GeneralHashType::XXH64 => write!(f, "XXH64"), + #[cfg(feature = "hash-xxh")] GeneralHashType::XXH32 => write!(f, "XXH32"), GeneralHashType::NULL => write!(f, "NULL"), } } } +/// `GeneralHash` is an enum that represents a hash value. +/// +/// The hash value is stored as a byte array of a fixed size. +/// The size of the byte array depends on the hash function used. +/// +/// The following hash functions are supported: SHA512, SHA256, SHA1, XXH64, XXH32, and NULL. +/// +/// The `hash_type` method returns the type of the hash function used. +/// The `hasher` method returns a new instance of a `GeneralHasher` trait object that corresponds to the hash type. +/// The `hasher` can then be used to compute a hash of that kind. +/// +/// # Traits +/// * `Display` - to allow formatting a `GeneralHash` into a string. +/// * `FromStr` - to allow parsing a string into a `GeneralHash`. +/// * `Serialize` - to allow serializing a `GeneralHash` into a string. +/// * `Deserialize` - to allow deserializing a `GeneralHash` from a string. +/// +/// # Examples +/// ``` +/// use std::str::FromStr; +/// use backup_deduplicator::hash::{GeneralHash, GeneralHashType}; +/// +/// #[cfg(feature = "hash-sha2")] +/// { +/// let hash = GeneralHash::from_str("SHA256:315f5bdb76d078c43b8ac0064e4a0164612b1fce77c869345bfc94c75894edd3").unwrap(); +/// +/// let mut hasher = GeneralHashType::SHA256.hasher(); +/// hasher.update(b"Hello, world!".as_slice()); +/// let new_hash = hasher.finalize(); +/// +/// assert_eq!(hash, new_hash); +/// assert_eq!(hash.to_string(), new_hash.to_string()); +/// } +/// ``` +/// +/// # See also +/// * [GeneralHashType] - representation of the different types of hash functions. +/// * [GeneralHasher] - trait for computing hash values. 
+/// #[derive(Debug, Hash, PartialEq, Eq, Clone, PartialOrd)] pub enum GeneralHash { + #[cfg(feature = "hash-sha2")] SHA512([u8; 64]), + #[cfg(feature = "hash-sha2")] SHA256([u8; 32]), + #[cfg(feature = "hash-sha1")] SHA1([u8; 20]), + #[cfg(feature = "hash-xxh")] XXH64([u8; 8]), + #[cfg(feature = "hash-xxh")] XXH32([u8; 4]), NULL, } impl Display for GeneralHash { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let capacity = match self { + #[cfg(feature = "hash-sha2")] GeneralHash::SHA512(_) => 128, + #[cfg(feature = "hash-sha2")] GeneralHash::SHA256(_) => 64, + #[cfg(feature = "hash-sha1")] GeneralHash::SHA1(_) => 40, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH64(_) => 16, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH32(_) => 8, GeneralHash::NULL => 0, }; @@ -83,18 +255,23 @@ impl Display for GeneralHash { hex.push_str((self.hash_type().to_string() + ":").as_str()); match self { + #[cfg(feature = "hash-sha2")] GeneralHash::SHA512(data) => for byte in data { hex.push_str(&format!("{:02x}", byte)); }, + #[cfg(feature = "hash-sha2")] GeneralHash::SHA256(data) => for byte in data { hex.push_str(&format!("{:02x}", byte)); }, + #[cfg(feature = "hash-sha1")] GeneralHash::SHA1(data) => for byte in data { hex.push_str(&format!("{:02x}", byte)); }, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH64(data) => for byte in data { hex.push_str(&format!("{:02x}", byte)); }, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH32(data) => for byte in data { hex.push_str(&format!("{:02x}", byte)); }, @@ -113,44 +290,69 @@ impl Serialize for GeneralHash { } } -impl<'de> Deserialize<'de> for GeneralHash { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de> { - let hex = String::deserialize(deserializer)?; +impl FromStr for GeneralHash { + // Error type for parsing a `GeneralHash` from a string. + type Err = &'static str; + + /// Parses a string into a `GeneralHash`. + /// + /// # Arguments + /// * `hex` - The string to parse, in the format `hash_type:hash_data (hex)`. + /// + /// # Returns + /// The `GeneralHash` that corresponds to the string or an error. + /// + /// # Errors + /// Returns an error if the string does not correspond to a `GeneralHash`. + /// * If the hash type is not recognized. + /// * If the hash data is not valid (wrong length or non-hex string). + fn from_str(hex: &str) -> Result { let mut iter = hex.split(':'); - let hash_type = GeneralHashType::from_str(iter.next().ok_or_else(|| D::Error::custom("No hash type"))?).map_err(|err| D::Error::custom(format!("Failed to parse hash type: {}", err)))?; - let data = iter.next().ok_or_else(|| D::Error::custom("No hash data"))?; - let data = utils::decode_hex(data).map_err(|err| D::Error::custom(format!("Failed to decode hash data: {}", err)))?; + let hash_type = GeneralHashType::from_str(iter.next().ok_or_else(|| "No hash type")?).map_err(|_| "Failed to parse hash type")?; + + #[cfg(any(feature = "hash-sha2", feature = "hash-sha1", feature = "hash-xxh"))] + let data = match hash_type { + GeneralHashType::NULL => Vec::new(), + _ => { + let data = iter.next().ok_or_else(|| "No hash data")?; + utils::decode_hex(data).map_err(|_| "Failed to decode hash data")? 
+ } + }; + let mut hash = GeneralHash::from_type(hash_type); match &mut hash { + #[cfg(feature = "hash-sha2")] GeneralHash::SHA512(target_data) => { if data.len() != 64 { - return Err(D::Error::custom("Invalid data length")); + return Err("Invalid data length"); } target_data.copy_from_slice(&data); }, + #[cfg(feature = "hash-sha2")] GeneralHash::SHA256(target_data) => { if data.len() != 32 { - return Err(D::Error::custom("Invalid data length")); + return Err("Invalid data length"); } target_data.copy_from_slice(&data); }, + #[cfg(feature = "hash-sha1")] GeneralHash::SHA1(target_data) => { if data.len() != 20 { - return Err(D::Error::custom("Invalid data length")); + return Err("Invalid data length"); } target_data.copy_from_slice(&data); }, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH64(target_data) => { if data.len() != 8 { - return Err(D::Error::custom("Invalid data length")); + return Err("Invalid data length"); } target_data.copy_from_slice(&data); }, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH32(target_data) => { if data.len() != 4 { - return Err(D::Error::custom("Invalid data length")); + return Err("Invalid data length"); } target_data.copy_from_slice(&data); }, @@ -160,58 +362,281 @@ impl<'de> Deserialize<'de> for GeneralHash { } } +impl<'de> Deserialize<'de> for GeneralHash { + /// Deserializes a `GeneralHash` from a string. + /// + /// # Arguments + /// * `deserializer` - The deserializer to use. + /// + /// # Returns + /// The deserialized `GeneralHash`. + /// + /// # Errors + /// If the string could not be deserialized. + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de> { + let hex = String::deserialize(deserializer)?; + GeneralHash::from_str(hex.as_str()).map_err(D::Error::custom) + } +} + impl GeneralHash { + /// Returns the hash value as a byte array. + /// + /// # Returns + /// A reference to the byte array that represents the hash value. pub fn as_bytes(&self) -> &[u8] { match self { + #[cfg(feature = "hash-sha2")] GeneralHash::SHA512(data) => data, + #[cfg(feature = "hash-sha2")] GeneralHash::SHA256(data) => data, + #[cfg(feature = "hash-sha1")] GeneralHash::SHA1(data) => data, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH64(data) => data, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH32(data) => data, GeneralHash::NULL => &[0; 0], } } + #[cfg(feature = "hash-sha2")] + /// Returns a new instance of a SHA512 hash value. pub fn new_sha512() -> Self { Self::from_type(GeneralHashType::SHA512) } + + #[cfg(feature = "hash-sha2")] + /// Returns a new instance of a SHA256 hash value. pub fn new_sha256() -> Self { Self::from_type(GeneralHashType::SHA256) } + + #[cfg(feature = "hash-sha1")] + /// Returns a new instance of a SHA1 hash value. pub fn new_sha1() -> Self { Self::from_type(GeneralHashType::SHA1) } + + #[cfg(feature = "hash-xxh")] + /// Returns a new instance of a XXH64 hash value. pub fn new_xxh64() -> Self { Self::from_type(GeneralHashType::XXH64) } + + #[cfg(feature = "hash-xxh")] + /// Returns a new instance of a XXH32 hash value. pub fn new_xxh32() -> Self { Self::from_type(GeneralHashType::XXH32) } + /// Returns the type of the hash function used. + /// + /// # Returns + /// The type of the hash function used. 
+ /// + /// # Examples + /// ``` + /// use backup_deduplicator::hash::{GeneralHash, GeneralHashType}; + /// + /// #[cfg(feature = "hash-sha2")] + /// { + /// let hash = GeneralHash::new_sha256(); + // + // let m = match hash.hash_type() { + // GeneralHashType::SHA256 => true, + // _ => false, + // }; + // + // assert!(m); + /// } + /// ``` pub fn hash_type(&self) -> GeneralHashType { match self { + #[cfg(feature = "hash-sha2")] GeneralHash::SHA512(_) => GeneralHashType::SHA512, + #[cfg(feature = "hash-sha2")] GeneralHash::SHA256(_) => GeneralHashType::SHA256, + #[cfg(feature = "hash-sha1")] GeneralHash::SHA1(_) => GeneralHashType::SHA1, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH64(_) => GeneralHashType::XXH64, + #[cfg(feature = "hash-xxh")] GeneralHash::XXH32(_) => GeneralHashType::XXH32, GeneralHash::NULL => GeneralHashType::NULL, } } - + + /// Returns a new instance of a `GeneralHash` with the specified hash type. + /// + /// # Arguments + /// * `hash_type` - The type of the hash function to use. + /// + /// # Returns + /// A new instance of a `GeneralHash` with the specified hash type. pub fn from_type(hash_type: GeneralHashType) -> Self { match hash_type { + #[cfg(feature = "hash-sha2")] GeneralHashType::SHA512 => GeneralHash::SHA512([0; 64]), + #[cfg(feature = "hash-sha2")] GeneralHashType::SHA256 => GeneralHash::SHA256([0; 32]), + #[cfg(feature = "hash-sha1")] GeneralHashType::SHA1 => GeneralHash::SHA1([0; 20]), + #[cfg(feature = "hash-xxh")] GeneralHashType::XXH64 => GeneralHash::XXH64([0; 8]), + #[cfg(feature = "hash-xxh")] GeneralHashType::XXH32 => GeneralHash::XXH32([0; 4]), GeneralHashType::NULL => GeneralHash::NULL, } } + /// Returns a new instance of a `GeneralHash` with the specified hash type. + /// + /// # Arguments + /// * `hash_type` - The type of the hash function to use. + /// + /// # Returns + /// A new instance of a `GeneralHash` with the specified hash type. + /// + /// # See also + /// * [GeneralHashType] - representation of the different types of hash functions. pub fn hasher(&self) -> Box { self.hash_type().hasher() } + + /// Computes the hash value of the specified data. + /// + /// # Arguments + /// * `reader` - The data to hash (supplied as `std::io::Read`). + /// + /// # Returns + /// The size of the data that was hashed. + /// + /// # Errors + /// Returns an error if the data could not be read. + pub fn hash_file(&mut self, mut reader: T) -> anyhow::Result + where T: std::io::Read { + + let mut hasher = self.hasher(); + let mut buffer = [0; 4096]; + let mut content_size = 0; + + loop { + let bytes_read = reader.read(&mut buffer)?; + content_size += bytes_read as u64; + if bytes_read == 0 { + break; + } + hasher.update(&buffer[..bytes_read]); + } + + *self = hasher.finalize(); + + Ok(content_size) + } + + /// Computes the hash value of file iterator/directory. + /// + /// # Arguments + /// * `children` - The iterator of files to hash. + /// + /// # Returns + /// The count of files that were hashed. + /// + /// # Errors + /// Does not return an error. Might return an error in the future. + pub fn hash_directory<'a>(&mut self, children: impl Iterator) -> anyhow::Result { + let mut hasher = self.hasher(); + + let mut content_size = 0; + + for child in children { + content_size += 1; + hasher.update(child.get_content_hash().as_bytes()); + } + + *self = hasher.finalize(); + + Ok(content_size) + } + + /// Computes the hash value of the specified path. + /// + /// # Arguments + /// * `path` - The path to hash. + /// + /// # Returns + /// Does not return a value. 
+ /// + /// # Errors + /// Does not return an error. Might return an error in the future. + pub fn hash_path(&mut self, path: &Path) -> anyhow::Result<()> { + let mut hasher = self.hasher(); + + hasher.update(path.as_os_str().as_encoded_bytes()); + + *self = hasher.finalize(); + + Ok(()) + } + + /// Computes the hash value of the specified file path. + /// + /// # Arguments + /// * `path` - The file path to hash. + /// + /// # Returns + /// Does not return a value. + /// + /// # Errors + /// Does not return an error. Might return an error in the future. + pub fn hash_filepath(&mut self, path: &FilePath) -> anyhow::Result<()> { + let mut hasher = self.hasher(); + + for component in &path.path { + hasher.update(component.path.as_os_str().as_encoded_bytes()); + } + + *self = hasher.finalize(); + + Ok(()) + } } +/// `GeneralHasher` is a trait for computing hash values. +/// +/// # Methods +/// * `new` - creates a new instance of a `GeneralHasher`. +/// * `update` - updates the hash value with the specified data. +/// * `finalize` - finalizes the hash value and returns the result. +/// +/// # Examples +/// See the example in the `GeneralHash` documentation. +/// +/// # See also +/// * [GeneralHash] - representation of a hash value. +/// * [GeneralHashType] - representation of the different types of hash functions. pub trait GeneralHasher { + /// Creates a new instance of a `GeneralHasher`. + /// + /// # Returns + /// A new instance of a `GeneralHasher`. fn new() -> Self where Self: Sized; + + /// Updates the hash value with the specified data. + /// + /// # Arguments + /// * `data` - The data to hash. fn update(&mut self, data: &[u8]); + + /// Finalizes the hash value and returns the result. + /// Consumes the `GeneralHasher` instance. + /// + /// # Returns + /// The hash value. 
fn finalize(self: Box) -> GeneralHash; } +#[cfg(feature = "hash-sha1")] +/// `GeneralHasher` implementation for the SHA1 crate mod sha1; +#[cfg(feature = "hash-sha2")] +/// `GeneralHasher` implementation for the SHA2 crate mod sha2; +#[cfg(feature = "hash-xxh")] +/// `GeneralHasher` implementation for the XXH crate mod xxh; +/// `GeneralHasher` implementation for the NULL hash function mod null; diff --git a/src/data/hash/null.rs b/src/data/hash/null.rs index 2088528..5a427cc 100644 --- a/src/data/hash/null.rs +++ b/src/data/hash/null.rs @@ -1,4 +1,4 @@ -use crate::data::{GeneralHash, GeneralHasher}; +use crate::hash::{GeneralHash, GeneralHasher}; pub struct NullHasher { diff --git a/src/data/hash/sha1.rs b/src/data/hash/sha1.rs index 670bcb6..0f95caf 100644 --- a/src/data/hash/sha1.rs +++ b/src/data/hash/sha1.rs @@ -1,5 +1,5 @@ use sha1::Digest; -use crate::data::{GeneralHash, GeneralHasher}; +use crate::hash::{GeneralHash, GeneralHasher}; pub struct Sha1Hasher { hasher: sha1::Sha1 diff --git a/src/data/hash/sha2.rs b/src/data/hash/sha2.rs index bea1126..ca98309 100644 --- a/src/data/hash/sha2.rs +++ b/src/data/hash/sha2.rs @@ -1,5 +1,5 @@ use sha2::Digest; -use crate::data::{GeneralHash, GeneralHasher}; +use crate::hash::{GeneralHash, GeneralHasher}; pub struct Sha512Hasher { hasher: sha2::Sha512 diff --git a/src/data/hash/xxh.rs b/src/data/hash/xxh.rs index b97dbe4..dbc37a8 100644 --- a/src/data/hash/xxh.rs +++ b/src/data/hash/xxh.rs @@ -1,5 +1,5 @@ use xxhash_rust::{xxh32, xxh64}; -use crate::data::{GeneralHash, GeneralHasher}; +use crate::hash::{GeneralHash, GeneralHasher}; pub struct Xxh64Hasher { hasher: xxh64::Xxh64 diff --git a/src/data/hashtree_save_file.rs b/src/data/hashtree_save_file.rs deleted file mode 100644 index eeda980..0000000 --- a/src/data/hashtree_save_file.rs +++ /dev/null @@ -1,218 +0,0 @@ -use std::cell::RefCell; -use std::collections::HashMap; -use std::io::{BufRead, Write}; -use std::ops::DerefMut; -use std::sync::Arc; - -use anyhow::Result; -use log::{info, trace, warn}; -use serde::{Deserialize, Serialize}; - -use crate::data::{FilePath, GeneralHash, GeneralHashType}; -use crate::utils; - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub enum SaveFileVersion { - V1, -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct SaveFileHeaders { - pub version: SaveFileVersion, - pub hash_type: GeneralHashType, - pub creation_date: u64, -} - -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Hash, Eq)] -pub enum SaveFileEntryTypeV1 { - File, - Directory, - Symlink, - Other, -} -pub use SaveFileEntryTypeV1 as SaveFileEntryType; - -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] -pub struct SaveFileEntryV1 { - pub file_type: SaveFileEntryTypeV1, - pub modified: u64, - pub size: u64, - pub hash: GeneralHash, - pub path: FilePath, - pub children: Vec, -} -pub use SaveFileEntryV1 as SaveFileEntry; - -#[derive(Debug, Serialize)] -pub struct SaveFileEntryV1Ref<'a> { - pub file_type: &'a SaveFileEntryTypeV1, - pub modified: &'a u64, - pub size: &'a u64, - pub hash: &'a GeneralHash, - pub path: &'a FilePath, - pub children: Vec<&'a GeneralHash>, -} -pub type SaveFileEntryRef<'a> = SaveFileEntryV1Ref<'a>; - -pub mod converter; - -pub struct SaveFile<'a, W, R> where W: Write, R: BufRead { - pub header: SaveFileHeaders, - pub file_by_hash: HashMap>>, - pub file_by_path: HashMap>, - pub all_entries: Vec>, - - enable_file_by_hash: bool, - enable_file_by_path: bool, - enable_all_entry_list: bool, - - writer: RefCell<&'a mut W>, - written_bytes: 
RefCell, - reader: RefCell<&'a mut R>, -} - -impl<'a, W: Write, R: BufRead> SaveFile<'a, W, R> { - pub fn new(writer: &'a mut W, reader: &'a mut R, enable_file_by_hash: bool, enable_file_by_path: bool, enable_all_entry_list: bool) -> Self { - let time = utils::get_time(); - SaveFile { - header: SaveFileHeaders { - version: SaveFileVersion::V1, - hash_type: GeneralHashType::SHA256, - creation_date: time, - }, - file_by_hash: HashMap::new(), - file_by_path: HashMap::new(), - all_entries: Vec::new(), - enable_file_by_hash, - enable_file_by_path, - enable_all_entry_list, - writer: RefCell::new(writer), - reader: RefCell::new(reader), - written_bytes: RefCell::new(0), - } - } - - pub fn save_header(&self) -> Result<()> { - let header_str = serde_json::to_string(&self.header)?; - *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(header_str.as_bytes())?; - *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(b"\n")?; - - Ok(()) - } - - pub fn load_header(&mut self) -> Result<()> { - let mut header_str = String::new(); - self.reader.borrow_mut().deref_mut().read_line(&mut header_str)?; - - let header: SaveFileHeaders = serde_json::from_str(header_str.as_str())?; - self.header = header; - - Ok(()) - } - - pub fn load_entry_no_filter(&mut self) -> Result>> { - self.load_entry(|_| true) - } - - pub fn load_entry bool>(&mut self, filter: F) -> Result>> { - loop { - let mut entry_str = String::new(); - let count = self.reader.borrow_mut().deref_mut().read_line(&mut entry_str)?; - - if count == 0 { - return Ok(None); - } - - if count == 1 { - continue; - } - - let entry: SaveFileEntry = serde_json::from_str(entry_str.as_str())?; - - if entry.hash.hash_type() != self.header.hash_type && !(entry.file_type == SaveFileEntryType::Other && entry.hash.hash_type() == GeneralHashType::NULL) { - warn!("Hash type mismatch ignoring entry: {:?}", entry.path); - continue; - } - - if !filter(&entry) { - trace!("Entry filtered: {:?}", entry.path); - continue; - } - - let shared_entry = Arc::new(entry); - - if self.enable_file_by_hash { - self.file_by_hash.entry(shared_entry.hash.clone()).or_insert_with(Vec::new).push(Arc::clone(&shared_entry)); - } - - if self.enable_file_by_path { - match self.file_by_path.insert(shared_entry.path.clone(), Arc::clone(&shared_entry)) { - None => {} - Some(old) => { - // this happens if analysis was canceled and continued - // and an already analysed file changed - info!("Duplicate entry for path: {:?}", &old.path); - if self.enable_all_entry_list { - self.all_entries.retain(|x| x != &old); - } - } - } - } - - if self.enable_all_entry_list { - self.all_entries.push(Arc::clone(&shared_entry)); - } - - return Ok(Some(shared_entry)) - } - } - - pub fn load_all_entries bool>(&mut self, filter: F) -> Result<()> { - while let Some(_) = self.load_entry(&filter)? 
{} - - Ok(()) - } - - pub fn load_all_entries_no_filter(&mut self) -> Result<()> { - self.load_all_entries(|_| true) - } - - pub fn write_entry(&self, result: &SaveFileEntryV1) -> Result<()> { - let string = serde_json::to_string(result)?; - *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(string.as_bytes())?; - *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write("\n".as_bytes())?; - self.writer.borrow_mut().deref_mut().flush()?; - Ok(()) - } - - pub fn write_entry_ref(&self, result: &SaveFileEntryV1Ref) -> Result<()> { - let string = serde_json::to_string(result)?; - *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(string.as_bytes())?; - *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write("\n".as_bytes())?; - self.writer.borrow_mut().deref_mut().flush()?; - Ok(()) - } - - pub fn empty_file_by_hash(&mut self) { - self.file_by_hash.clear(); - self.file_by_hash.shrink_to_fit(); - } - - pub fn empty_file_by_path(&mut self) { - self.file_by_path.clear(); - self.file_by_path.shrink_to_fit(); - } - - pub fn empty_entry_list(&mut self) { - self.all_entries.clear(); - self.all_entries.shrink_to_fit(); - } - - pub fn get_written_bytes(&self) -> usize { - *self.written_bytes.borrow() - } - - pub fn flush(&self) -> std::io::Result<()> { - self.writer.borrow_mut().deref_mut().flush() - } -} diff --git a/src/data/hashtree_save_file/converter.rs b/src/data/hashtree_save_file/converter.rs deleted file mode 100644 index 05b268d..0000000 --- a/src/data/hashtree_save_file/converter.rs +++ /dev/null @@ -1,176 +0,0 @@ -use crate::data::{DirectoryInformation, File, FileInformation, GeneralHash, OtherInformation, SaveFileEntryTypeV1, SaveFileEntryV1, SaveFileEntryV1Ref, StubInformation, SymlinkInformation}; - -impl From for SaveFileEntryV1 { - fn from(value: FileInformation) -> Self { - Self { - file_type: SaveFileEntryTypeV1::File, - modified: value.modified, - size: value.content_size, - hash: value.content_hash, - path: value.path, - children: Vec::with_capacity(0), - } - } -} - -impl From for SaveFileEntryV1 { - fn from(value: SymlinkInformation) -> Self { - Self { - file_type: SaveFileEntryTypeV1::Symlink, - modified: value.modified, - size: value.content_size, - hash: value.content_hash, - path: value.path, - children: Vec::with_capacity(0), - } - } -} - -impl From for SaveFileEntryV1 { - fn from(value: DirectoryInformation) -> Self { - let mut result = Self { - file_type: SaveFileEntryTypeV1::Directory, - modified: value.modified, - size: value.number_of_children, - hash: value.content_hash, - path: value.path, - children: Vec::with_capacity(value.children.len()), - }; - for child in value.children { - result.children.push(child.get_content_hash().clone()); - } - result - } -} - -impl From for SaveFileEntryV1 { - fn from(value: OtherInformation) -> Self { - Self { - file_type: SaveFileEntryTypeV1::Other, - modified: value.modified, - size: value.content_size, - hash: GeneralHash::NULL, - path: value.path, - children: Vec::with_capacity(0), - } - } -} - -impl From for SaveFileEntryV1 { - fn from(value: StubInformation) -> Self { - Self { - file_type: SaveFileEntryTypeV1::Other, - modified: 0, - size: 0, - hash: value.content_hash, - path: value.path, - children: Vec::with_capacity(0), - } - } -} - -impl<'a> From<&'a FileInformation> for SaveFileEntryV1Ref<'a> { - fn from(value: &'a FileInformation) -> Self { - Self { - file_type: &SaveFileEntryTypeV1::File, - modified: &value.modified, - hash: 
&value.content_hash, - path: &value.path, - size: &value.content_size, - children: Vec::with_capacity(0), - } - } -} - -impl<'a> From<&'a SymlinkInformation> for SaveFileEntryV1Ref<'a> { - fn from(value: &'a SymlinkInformation) -> Self { - Self { - file_type: &SaveFileEntryTypeV1::Symlink, - modified: &value.modified, - hash: &value.content_hash, - path: &value.path, - size: &value.content_size, - children: Vec::with_capacity(0), - } - } -} - -impl<'a> From<&'a DirectoryInformation> for SaveFileEntryV1Ref<'a> { - fn from(value: &'a DirectoryInformation) -> Self { - let mut result = Self { - file_type: &SaveFileEntryTypeV1::Directory, - modified: &value.modified, - hash: &value.content_hash, - path: &value.path, - size: &value.number_of_children, - children: Vec::with_capacity(value.children.len()), - }; - for child in &value.children { - result.children.push(child.get_content_hash()); - } - result - } -} - -impl<'a> From<&'a OtherInformation> for SaveFileEntryV1Ref<'a> { - fn from(value: &'a OtherInformation) -> Self { - Self { - file_type: &SaveFileEntryTypeV1::Other, - modified: &0, - hash: &GeneralHash::NULL, - path: &value.path, - size: &value.content_size, - children: Vec::with_capacity(0), - } - } -} - -impl<'a> From<&'a StubInformation> for SaveFileEntryV1Ref<'a> { - fn from(value: &'a StubInformation) -> Self { - Self { - file_type: &SaveFileEntryTypeV1::Other, - modified: &0, - hash: &value.content_hash, - path: &value.path, - size: &0, - children: Vec::with_capacity(0), - } - } -} - -impl From for SaveFileEntryV1 { - fn from(value: File) -> Self { - match value { - File::File(info) => info.into(), - File::Directory(info) => info.into(), - File::Symlink(info) => info.into(), - File::Other(info) => info.into(), - File::Stub(info) => info.into(), - } - } -} - -impl<'a> From<&'a File> for SaveFileEntryV1Ref<'a> { - fn from(value: &'a File) -> Self { - match value { - File::File(info) => info.into(), - File::Directory(info) => info.into(), - File::Symlink(info) => info.into(), - File::Other(info) => info.into(), - File::Stub(info) => info.into(), - } - } -} - -impl<'a> From<&'a SaveFileEntryV1> for SaveFileEntryV1Ref<'a> { - fn from(value: &'a SaveFileEntryV1) -> Self { - Self { - file_type: &value.file_type, - modified: &value.modified, - hash: &value.hash, - path: &value.path, - size: &value.size, - children: Vec::with_capacity(0), - } - } -} diff --git a/src/data/job.rs b/src/data/job.rs deleted file mode 100644 index 471a336..0000000 --- a/src/data/job.rs +++ /dev/null @@ -1,61 +0,0 @@ -use std::sync::{Arc, Mutex}; -use crate::data::{File, FilePath}; - -pub type SharedJob = Arc; - -static JOB_COUNTER: Mutex = Mutex::new(0); - -fn new_job_counter_id() -> usize { - let mut counter = JOB_COUNTER.lock().expect("Failed to lock job counter"); - *counter += 1; - (*counter).clone() -} - -#[derive(Debug, Clone, PartialEq, Copy)] -pub enum JobState { - NotProcessed, - Analyzed, -} - -#[derive(Debug)] -pub struct Job { - id: usize, - pub parent: Option, - pub finished_children: Mutex>, - pub target_path: FilePath, - pub state: JobState, -} - -impl Job { - pub fn new(parent: Option, target_path: FilePath) -> Self { - Job { - id: new_job_counter_id(), - parent, - target_path, - state: JobState::NotProcessed, - finished_children: Mutex::new(Vec::new()), - } - } - - pub fn job_id(&self) -> usize { - self.id - } - - pub(crate) fn new_job_id(mut self) -> Self { - self.id = new_job_counter_id(); - self - } -} - -impl JobTrait for Job { - fn job_id(&self) -> usize { - Job::job_id(self) - } -} - - 
-pub trait JobTrait { - fn job_id(&self) -> usize; -} - -pub trait ResultTrait {} diff --git a/src/data/path.rs b/src/data/path.rs index f0ceb80..ba63475 100644 --- a/src/data/path.rs +++ b/src/data/path.rs @@ -4,50 +4,100 @@ use std::path::PathBuf; use anyhow::{Result}; use serde::{Deserialize, Serialize}; +/// The type of archive. #[derive(Debug, Clone, Serialize, Deserialize, Hash)] pub enum ArchiveType { Tar, Zip, } +/// The target of a path. +/// +/// # Fields +/// * `File` - The path points to a file. +/// * `Archive` - The path points to an archive. That is further traversed. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] pub enum PathTarget { File, // Archive(ArchiveType), } +/// A path component. A path points to a file or an archive. +/// +/// # Fields +/// * `path` - The path. +/// * `target` - The target of the path. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] pub struct PathComponent { pub path: PathBuf, pub target: PathTarget, } +/// A file path. A file path specifies a target file. It may consist of multiple path components. +/// Imagine the following file structure: +/// +/// ```text +/// DIR stuff +/// \-- DIR more_stuff +/// \-- FILE archive.tar.gz +/// \-- FILE file_in_archive.txt +/// ``` +/// +/// The file path to `file_in_archive.txt` would consist of the following path components: +/// - `stuff/more_stuff/archive.tar.gz` (target: Archive) +/// - `file_in_archive.txt` (target: File) +/// +/// The file path to `archive.tar.gz` would consist of the following path components: +/// - `stuff/more_stuff/archive.tar.gz` (target: File) +/// +/// # Fields +/// * `path` - The path components. +/// +/// # Examples +/// ``` +/// use std::path::PathBuf; +/// use backup_deduplicator::path::FilePath; +/// +/// let path = FilePath::from_realpath(PathBuf::from("test.txt")); +/// +/// ``` #[derive(Debug, Clone, Serialize, Deserialize, Hash)] pub struct FilePath { pub path: Vec } impl FilePath { - pub fn from_vec(path: Vec) -> Self { + /// Creates a new file path from path components. + /// + /// # Arguments + /// * `path` - The path components. + /// + /// # Returns + /// The file path. + pub fn from_pathcomponents(path: Vec) -> Self { FilePath { path } } - - pub fn from_path(path: PathBuf, target: PathTarget) -> Self { + + /// Creates a new file path from a real path. + /// + /// # Arguments + /// * `path` - The real path. + /// + /// # Returns + /// The file path. + pub fn from_realpath(path: PathBuf) -> Self { FilePath { path: vec![PathComponent { path, - target + target: PathTarget::File }] } } - pub fn join(&mut self, path: PathBuf, target: PathTarget) { - self.path.push(PathComponent { - path, - target - }); + pub fn join_realpath(&mut self, _path: PathBuf) { + todo!("implement") } pub fn extract_parent(&self, _temp_directory: &PathBuf) { @@ -57,7 +107,14 @@ impl FilePath { pub fn delete_parent(&self, _temp_directory: &PathBuf) { todo!("implement") } - + + /// Resolves the file path to a single file. + /// + /// # Returns + /// The resolved file path. + /// + /// # Errors + /// Never pub fn resolve_file(&self) -> Result { if self.path.len() == 1 { match self.path[0].target { @@ -68,12 +125,42 @@ impl FilePath { } } - pub fn child_real(&self, child_name: OsString) -> FilePath { + /// Gets the child of where the file path points to. + /// + /// # Arguments + /// * `child_name` - The name of the child. + /// + /// # Returns + /// The child file path. 
+ /// + /// # Example + /// ``` + /// use std::path::PathBuf; + /// use backup_deduplicator::path::FilePath; + /// + /// let path = FilePath::from_realpath(PathBuf::from("test/")); + /// let child = path.child("child.txt"); + /// + /// assert_eq!(child.path[0].path, PathBuf::from("test/child.txt")); + /// assert_eq!(child.path.len(), 1); + /// ``` + /// + /// ``` + /// use std::path::PathBuf; + /// use backup_deduplicator::path::FilePath; + /// + /// let path = FilePath::from_realpath(PathBuf::from("test/")); + /// let subpath = path.child("subdir").child("abc.txt"); + /// + /// assert_eq!(subpath.path[0].path, PathBuf::from("test/subdir/abc.txt")); + /// assert_eq!(subpath.path.len(), 1); + /// ``` + pub fn child>(&self, child_name: Str) -> FilePath { let mut result = FilePath { path: self.path.clone() }; - let component = PathBuf::from(child_name); + let component = PathBuf::from(child_name.into()); match result.path.last_mut() { Some(last) => { @@ -89,7 +176,27 @@ impl FilePath { return result; } - + + /// Gets the parent of the file path. + /// + /// # Returns + /// The parent file path. None if the file path has no parent. + /// + /// # Example + /// ``` + /// use std::path::PathBuf; + /// use backup_deduplicator::path::FilePath; + /// + /// let path = FilePath::from_realpath(PathBuf::from("test/abc/def.txt")); + /// let parent = path.parent().unwrap(); + /// + /// assert_eq!(parent.path[0].path, PathBuf::from("test/abc")); + /// + /// // test/abc test/ "" None + /// let root = path.parent().unwrap().parent().unwrap().parent().unwrap().parent(); + /// + /// assert_eq!(root, None); + /// ``` pub fn parent(&self) -> Option { let last = self.path.last(); @@ -126,6 +233,13 @@ impl FilePath { } impl PartialEq for FilePath { + /// Compares two file paths. + /// + /// # Arguments + /// * `other` - The other file path. + /// + /// # Returns + /// Whether the file paths are equal. fn eq(&self, other: &Self) -> bool { self.path.len() == other.path.len() && self.path.iter().zip(other.path.iter()).all(|(a, b)| a == b) } @@ -134,6 +248,7 @@ impl PartialEq for FilePath { impl Eq for FilePath {} impl std::fmt::Display for FilePath { + /// Formats the file path to a string. fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let mut result = String::new(); diff --git a/src/lib.md b/src/lib.md new file mode 100644 index 0000000..f778ae1 --- /dev/null +++ b/src/lib.md @@ -0,0 +1,107 @@ +# Inner-workings +The tool is run in four stages: +```plain + Input Execution Output +┌───────────┐ ┌┐ +│ HashTree ◄─────────┼┼────────┐ +│ │ ││ │ +│(optional) ├──┐ ┌────▼▼────┐ ┌┴────────────────┐ +└───────────┘ └─► │ │ │ + │ Build ├──► HashTree │ +┌───────────┐ ┌─► │ │ │ +│ Folder ├──┘ └────┬┬────┘ └┬────────────────┘ +│ -file │ ││ │ +│ -file │ ┌───────┼┼────────┘ +└───┬────┬──┘ │ ││ + │ │ │ ┌────▼▼────┐ ┌─────────────────┐ + │ │ │ │ │ │ │ + │ │ └──► Analyze ├──► Duplicate Sets │ + │ │ │ │ │ │ + │ │ └────┬┬────┘ └┬────────────────┘ + │ │ ││ │ Basic functionality complete +----│----│----┌───────┼┼────────┘---------------------------------- + │ │ │ ││ Implementation in progress + │ │ │ ┌────▼▼────┐ ┌─────────────────┐ + │ │ └──► │ │ │ + │ │ │ Dedup ├──► Change commands │ + │ └───────► │ │ │ + │ └────┬┬────┘ └┬────────────────┘ + │ ││ │ + │ ┌───────┼┼────────┘ + │ │ ││ + │ │ ┌────▼▼────┐ + │ └──► │ + │ │ Execute ├──►Deduplicated files + └────────────► │ + └──────────┘ +``` +1. **Build**: The tools reads a folder and builds a hash tree of all files in it. +2. 
**Analyze**: The tool analyzes the hash tree and finds duplicate files. +3. **Dedup**: The tool determines which steps to take to deduplicate the files. +This can be done in a semi-automatic or manual way. +4. **Execute**: The tool executes the deduplication steps (Deleting/Hardlinking/...). + +**Dedup** and **Execute** are in development and currently not (fully) implemented. + +## Build +* Input: Folder with files, Hashtree (optional) to update or continue from. +* Output: HashTree +* Execution: Fully automatic, no user interaction required, multithreaded. + +### HashTree file format +The HashTree is stored in a file with the following format: +```plain +HEADER [newline] +ENTRY [newline] +ENTRY [newline] +... +``` +See `HashTreeFileEntry` for the exact format of an entry. In short, it contains +all the information about an analyzed file or directory that is needed for later +stages (JSON): +* File path +* File type +* Last modified time +* File size +* Hash of the file +* Children hashes (if it is a directory) + +While analyzing, entries are only appended to the file. After the analysis is +done, the file is fed into the `clean` command, which removes all entries that +are outdated or no longer exist, rewriting the entire file (only ever shrinking it). + +The `clean` command can also be run manually. + +## Analyze +* Input: HashTree +* Output: Duplicate sets +* Execution: Fully automatic, no user interaction required, multithreaded file parsing, + single-threaded duplication detection. + +### Analysis results +The analysis results are stored in a file with the following format: +```plain +[ENTRY] [newline] +[ENTRY] [newline] +... +``` +See `ResultEntry` for the exact format of an entry. In short, it contains (JSON): +* File type +* Hash +* Size (0 if it is a directory, else the file size of one of the files) +* Conflicting Set (a set of all files that are duplicates of each other) + +## Dedup +* Input: Duplicate sets +* Output: Set of commands to execute to deduplicate the files +* Execution: Manual or semi-automatic, user interaction required. + +Implementation in progress. At present, the duplicate sets +must be processed manually. + +## Execute +* Input: Set of commands +* Output: Deduplicated files +* Execution: Fully automatic, user interaction only on errors. + +Implementation in progress. 
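To make the line-oriented formats described in `lib.md` concrete, here is a minimal sketch of streaming a hash tree file: one header line followed by one JSON entry per line. It is not part of this change set; the struct name, its field names, and the file name are illustrative assumptions only, since the canonical layout is whatever `HashTreeFileEntry` serializes to.

```rust
// Hypothetical reader for the hash tree format sketched above.
// Assumes serde/serde_json/anyhow, which the crate already depends on;
// `EntrySketch` and "backup.hashtree" are made-up names for illustration.
use std::fs::File;
use std::io::{BufRead, BufReader};

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct EntrySketch {
    path: serde_json::Value,          // file path (shape defined by `FilePath`)
    file_type: String,                // file type
    modified: u64,                    // last modified time
    size: u64,                        // file size
    hash: serde_json::Value,          // hash of the file content
    #[serde(default)]
    children: Vec<serde_json::Value>, // children hashes (directories only)
}

fn main() -> anyhow::Result<()> {
    let reader = BufReader::new(File::open("backup.hashtree")?);
    let mut lines = reader.lines();

    // First line: the header written by the build stage.
    if let Some(header) = lines.next().transpose()? {
        println!("header: {header}");
    }

    // Every following line is one appended JSON entry.
    for line in lines {
        let entry: EntrySketch = serde_json::from_str(&line?)?;
        println!("{} ({} bytes)", entry.file_type, entry.size);
    }
    Ok(())
}
```

The same pattern applies to the duplicate-set output of the analyze stage, which likewise writes one JSON entry per line (serialized from `DupSetEntryRef` on the writing side).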
diff --git a/src/lib.rs b/src/lib.rs index 707de38..cc836c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,32 +1,22 @@ +#![doc = include_str!("../README.md")] +#![doc = include_str!("lib.md")] + extern crate num_cpus; pub mod utils; -mod cmd { +pub mod pool; + +pub mod stages { pub mod build; - pub mod clean; pub mod analyze; -} -pub use cmd::*; - -pub mod data { - mod file; - pub use file::*; - mod fileid; - pub use fileid::*; - mod job; - pub use job::*; - mod path; - pub use path::*; - mod hash; - pub use hash::*; - - mod hashtree_save_file; - pub use hashtree_save_file::*; + pub mod clean; } -pub mod main { - pub mod utils; +mod data { + pub mod path; + pub mod hash; + pub mod fileid; } -pub mod threadpool; +pub use data::*; diff --git a/src/main.rs b/src/main.rs index d4433e4..5a290e0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,11 +2,12 @@ use std::{env}; use std::str::FromStr; use clap::{arg, Parser, Subcommand}; use log::{debug, info, LevelFilter, trace}; -use backup_deduplicator::build::BuildSettings; -use backup_deduplicator::{analyze, clean, main}; -use backup_deduplicator::analyze::AnalysisSettings; -use backup_deduplicator::clean::CleanSettings; -use backup_deduplicator::data::GeneralHashType; +use backup_deduplicator::hash::GeneralHashType; +use backup_deduplicator::stages::analyze::cmd::AnalysisSettings; +use backup_deduplicator::stages::{analyze, build, clean}; +use backup_deduplicator::stages::build::cmd::BuildSettings; +use backup_deduplicator::stages::clean::cmd::CleanSettings; +use backup_deduplicator::utils; /// A simple command line tool to deduplicate backups. #[derive(Parser, Debug)] @@ -46,15 +47,15 @@ enum Command { output: String, /// Absolute paths, if set, the tool will output absolute paths in the hash tree. /// If not set, the tool will output relative paths to the current working directory. - #[arg(long)] - absolute_paths: bool, + // #[arg(long)] + // absolute_paths: bool, /// Working directory, if set, the tool will use the current working directory as the base for relative paths. #[arg(short, long)] working_directory: Option, /// Force overwrite, if set, the tool will overwrite the output file if it exists. 
If not set, the tool will continue an existing analysis #[arg(long="overwrite", default_value = "false")] recreate_output: bool, - /// Hash algorithm to use (values: sha256, sha512, sha1, xxh64, xxh32) + /// Hash algorithm to use #[arg(long="hash", default_value = "sha256")] hash_type: String, /// Disable database clean after run, if set the tool will not clean the database after the creation @@ -130,7 +131,7 @@ fn main() { // archives, follow_symlinks, output, - absolute_paths, + // absolute_paths, working_directory, recreate_output, hash_type, @@ -150,9 +151,9 @@ fn main() { // Convert to paths and check if they exist - let directory = main::utils::parse_path(directory.as_str(), main::utils::ParsePathKind::AbsoluteNonExisting); - let output = main::utils::parse_path(output.as_str(), main::utils::ParsePathKind::AbsoluteNonExisting); - let working_directory = working_directory.map(|w| main::utils::parse_path(w.as_str(), main::utils::ParsePathKind::AbsoluteNonExisting)); + let directory = utils::main::parse_path(directory.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting); + let output = utils::main::parse_path(output.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting); + let working_directory = working_directory.map(|w| utils::main::parse_path(w.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting)); if !directory.exists() { eprintln!("Target directory does not exist: {}", directory.display()); @@ -175,7 +176,7 @@ fn main() { // Change working directory trace!("Changing working directory"); - let working_directory = main::utils::change_working_directory(working_directory); + let working_directory = utils::main::change_working_directory(working_directory); // Convert paths to relative path to working directory @@ -188,17 +189,17 @@ fn main() { // info!("Archives: {:?}", archives); info!("Follow symlinks: {:?}", follow_symlinks); info!("Output: {:?}", output); - info!("Absolute paths: {:?}", absolute_paths); + // info!("Absolute paths: {:?}", absolute_paths); info!("Working directory: {:?}", working_directory); // Run the command - match backup_deduplicator::build::run(BuildSettings { + match build::cmd::run(BuildSettings { directory: directory.to_path_buf(), //into_archives: archives, follow_symlinks, output: output.clone(), - absolute_paths, + // absolute_paths, threads: args.threads, continue_file: !recreate_output, hash_type @@ -208,7 +209,7 @@ fn main() { if !no_clean { info!("Executing clean command"); - match clean::run(CleanSettings { + match clean::cmd::run(CleanSettings { input: output.clone(), output: output, root: None, @@ -239,13 +240,13 @@ fn main() { working_directory, follow_symlinks } => { - let input = main::utils::parse_path(input.as_str(), main::utils::ParsePathKind::AbsoluteNonExisting); - let output = main::utils::parse_path(output.as_str(), main::utils::ParsePathKind::AbsoluteNonExisting); + let input = utils::main::parse_path(input.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting); + let output = utils::main::parse_path(output.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting); // Change working directory trace!("Changing working directory"); - main::utils::change_working_directory(working_directory.map(|w| main::utils::parse_path(w.as_str(), main::utils::ParsePathKind::AbsoluteNonExisting))); + utils::main::change_working_directory(working_directory.map(|w| utils::main::parse_path(w.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting))); if !input.exists() { eprintln!("Input file does not exist: {:?}", input); @@ -257,7 
+258,7 @@ fn main() { std::process::exit(exitcode::CONFIG); } - match clean::run(CleanSettings { + match clean::cmd::run(CleanSettings { input, output, root, @@ -278,8 +279,8 @@ fn main() { output, overwrite } => { - let input = main::utils::parse_path(input.as_str(), main::utils::ParsePathKind::AbsoluteExisting); - let output = main::utils::parse_path(output.as_str(), main::utils::ParsePathKind::AbsoluteNonExisting); + let input = utils::main::parse_path(input.as_str(), utils::main::ParsePathKind::AbsoluteExisting); + let output = utils::main::parse_path(output.as_str(), utils::main::ParsePathKind::AbsoluteNonExisting); if !input.exists() { eprintln!("Input file does not exist: {:?}", input); @@ -291,7 +292,7 @@ fn main() { std::process::exit(exitcode::CONFIG); } - match analyze::run(AnalysisSettings { + match analyze::cmd::run(AnalysisSettings { input, output, threads: args.threads, diff --git a/src/main/utils.rs b/src/main/utils.rs deleted file mode 100644 index 5da5302..0000000 --- a/src/main/utils.rs +++ /dev/null @@ -1,61 +0,0 @@ -use std::env; -use std::path::PathBuf; -use crate::utils::LexicalAbsolute; - -pub fn change_working_directory(working_directory: Option) -> PathBuf { - match working_directory { - None => {}, - Some(working_directory) => { - env::set_current_dir(&working_directory).unwrap_or_else(|_| { - eprintln!("IO error, could not change working directory: {}", working_directory.display()); - std::process::exit(exitcode::CONFIG); - }); - } - } - - env::current_dir().unwrap_or_else(|_| { - eprintln!("IO error, could not resolve working directory"); - std::process::exit(exitcode::CONFIG); - }).canonicalize().unwrap_or_else(|_| { - eprintln!("IO error, could not resolve working directory"); - std::process::exit(exitcode::CONFIG); - }) -} - -#[derive(Debug, Clone, Copy)] -pub enum ParsePathKind { - Direct, - AbsoluteExisting, - AbsoluteNonExisting, -} - -pub fn parse_path(path: &str, kind: ParsePathKind) -> PathBuf { - let path = std::path::Path::new(path); - - let path = path.to_path_buf(); - - let path = match kind { - ParsePathKind::Direct => path, - ParsePathKind::AbsoluteExisting => to_lexical_absolute(path, true), - ParsePathKind::AbsoluteNonExisting => to_lexical_absolute(path, false), - }; - - path -} - -pub fn to_lexical_absolute(path: PathBuf, exists: bool) -> PathBuf { - let path = match exists { - true => path.canonicalize(), - false => path.to_lexical_absolute(), - }; - - let path = match path{ - Ok(out) => out, - Err(e) => { - eprintln!("IO error, could not resolve output file: {:?}", e); - std::process::exit(exitcode::CONFIG); - } - }; - - path -} diff --git a/src/pool.rs b/src/pool.rs new file mode 100644 index 0000000..68f4750 --- /dev/null +++ b/src/pool.rs @@ -0,0 +1,287 @@ +use std::sync::{Arc, mpsc, Mutex}; +use std::sync::mpsc::{Receiver, RecvTimeoutError, Sender}; +use std::thread; +use std::time::Duration; +use log::{debug, error, trace, warn}; + +/// A trait that must be implemented by a job type to be processed by the pool. +pub trait JobTrait { + /// Get the job id. + /// + /// # Returns + /// * `usize` - The job id. + fn job_id(&self) -> usize; +} + +/// A trait that must be implemented by a result type to be returned by the pool. +pub trait ResultTrait {} + +/// Worker entry function signature +/// The worker entry function is called by the worker thread to process a job. +/// A custom worker must supply a function of this type to the thread pool to process jobs. +/// +/// # Arguments +/// * `usize` - The current worker id. 
+/// * `Job` - The job received that should be processed. +/// * `&Sender` - A sender to publish job results. +/// * `&Sender` - A sender to publish new jobs to the thread pool. +/// * `&mut Argument` - A mutable reference to the arguments passed to the worker thread via the thread pool creation. +/// +/// # Returns +/// * `()` - The worker entry function should not return a value but instead should send the result via the `Sender` back to the main thread. +type WorkerEntry = fn(usize, Job, &Sender, &Sender, &mut Argument); + +/// Internal worker struct to manage the worker thread via the thread pool. +/// +/// # Fields +/// * `id` - The worker id. +/// * `thread` - The worker thread handle. +struct Worker +{ + id: usize, + thread: Option>, +} + +impl Worker { + /// Create a new worker thread. Starts the worker thread and returns the worker struct. + /// + /// # Arguments + /// * `id` - The worker id. + /// * `job_receive` - A receiver to receive jobs from the thread pool. + /// * `result_publish` - A sender to publish job results. + /// * `job_publish` - A sender to publish new jobs to the thread pool. + /// * `func` - The worker entry function to process jobs. + /// * `arg` - The arguments passed to the worker thread via the thread pool creation. + /// + /// # Returns + /// * `Worker` - The worker struct with the worker thread handle. + fn new(id: usize, job_receive: Arc>>, result_publish: Sender, job_publish: Sender, func: WorkerEntry, arg: Argument) -> Worker { + let thread = thread::spawn(move || { + Worker::worker_entry(id, job_receive, result_publish, job_publish, func, arg); + }); + + Worker { id, thread: Some(thread) } + } + + /// Function executed by the worker thread. Does exit when the job receiver is closed/the thread pool is shutting down. + /// + /// # Arguments + /// * `id` - The worker id. + /// * `job_receive` - A receiver to receive jobs from the thread pool. + /// * `result_publish` - A sender to publish job results. + /// * `job_publish` - A sender to publish new jobs to the thread pool. + /// * `func` - The worker entry function to process jobs. + /// * `arg` - The arguments passed to the worker thread via the thread pool creation. + fn worker_entry(id: usize, job_receive: Arc>>, result_publish: Sender, job_publish: Sender, func: WorkerEntry, mut arg: Argument) { + loop { + // Acquire the job lock + let job = job_receive.lock(); + + let job = match job { + Err(e) => { + error!("Worker {} shutting down {}", id, e); + break; + } + Ok(job) => { + job.recv() // receive new job + } + }; + + match job { + Err(_) => { + trace!("Worker {} shutting down", id); + break; + } + Ok(job) => { + trace!("Worker {} received job {}", id, job.job_id()); + // Call the user function to process the job + func(id, job, &result_publish, &job_publish, &mut arg); + } + } + } + } +} + +/// A thread pool to manage the distribution of jobs to worker threads. +/// +/// # Template Parameters +/// * `Job` - The job type that should be processed by the worker threads. +/// * `Result` - The result type that should be returned by the worker threads. +/// +/// Both `Job` and `Result` must implement the `Send` trait. +pub struct ThreadPool +where + Job: Send, + Result: Send, +{ + workers: Vec, + thread: Option>, + job_publish: Arc>>>, + result_receive: Receiver, +} + +impl ThreadPool { + /// Create a new thread pool with a given number of worker threads (args.len()). + /// Each worker thread will receive an argument from the args vector. 
When a new job + /// is published to the thread pool, the thread pool will distribute the job to the worker threads + /// and execute the `func` function within a worker thread. + /// + /// # Arguments + /// * `args` - A vector of arguments that should be passed to the worker threads. + /// * `func` - The worker entry function to process jobs. + /// + /// # Returns + /// * `ThreadPool` - The thread pool struct with the worker threads. + /// + /// # Template Parameters + /// * `Argument` - The argument type that should be passed to the worker threads. + /// The argument type must implement the `Send` trait. + pub fn new(mut args: Vec, func: WorkerEntry) -> ThreadPool { + assert!(args.len() > 0); + + let mut workers = Vec::with_capacity(args.len()); + + let (job_publish, job_receive) = mpsc::channel(); + + let job_receive = Arc::new(Mutex::new(job_receive)); + let (result_publish, result_receive) = mpsc::channel(); + let (thread_publish_job, thread_receive_job) = mpsc::channel(); + + let mut id = 0; + while let Some(arg) = args.pop() { + workers.push(Worker::new(id, Arc::clone(&job_receive), result_publish.clone(), thread_publish_job.clone(), func, arg)); + id += 1; + } + + let job_publish = Arc::new(Mutex::new(Some(job_publish))); + let job_publish_clone = Arc::clone(&job_publish); + + let thread = thread::spawn(move || { + ThreadPool::::pool_entry(job_publish_clone, thread_receive_job); + }); + + ThreadPool { + workers, + job_publish, + result_receive, + thread: Some(thread), + } + } + + /// Publish a new job to the thread pool. The job will be distributed to a worker thread. + /// + /// # Arguments + /// * `job` - The job that should be processed by a worker thread. + pub fn publish(&self, job: Job) { + let job_publish = self.job_publish.lock(); + match job_publish { + Err(e) => { + error!("ThreadPool is shutting down. Cannot publish job. {}", e); + } + Ok(job_publish) => { + match job_publish.as_ref() { + None => { + error!("ThreadPool is shutting down. Cannot publish job."); + } + Some(job_publish) => { + match job_publish.send(job) { + Err(e) => { + error!("Failed to publish job on thread pool. {}", e); + } + Ok(_) => {} + } + } + } + } + } + } + + /// Internal function that is run in a separate thread. It feeds back jobs from the worker threads to the input of the thread pool. + /// + /// # Arguments + /// * `job_publish` - A sender to publish new jobs to the thread pool. + /// * `job_receive` - A receiver to receive jobs from the worker threads. + fn pool_entry(job_publish: Arc>>>, job_receive: Receiver) { + loop { + let job = job_receive.recv(); + + match job { + Err(_) => { + trace!("Pool worker shutting down"); + break; + } + Ok(job) => { + match job_publish.lock() { + Err(e) => { + error!("Pool worker shutting down: {}", e); + break; + } + Ok(job_publish) => { + if let Some(job_publish) = job_publish.as_ref() { + job_publish.send(job).expect("Pool worker failed to send job. This should never fail."); + } + } + } + } + } + } + } + + /// Receive a result from the worker threads. This function will block until a result is available. + /// + /// # Returns + /// * `Result` - The result of a job processed by a worker thread. + /// + /// # Errors + /// * If all worker threads panicked, therefore the pipe is closed + pub fn receive(&self) -> std::result::Result { + self.result_receive.recv() + } + + /// Receive a result from the worker threads. This function will block until a result is available or a timeout occurs. 
+ /// + /// # Arguments + /// * `timeout` - The maximum time to wait for a result. + /// + /// # Returns + /// * `Result` - The result of a job processed by a worker thread. + /// + /// # Errors + /// * If all worker threads panicked, therefore the pipe is closed + /// * If the timeout occurs before a result is available + pub fn receive_timeout(&self, timeout: Duration) -> std::result::Result { + self.result_receive.recv_timeout(timeout) + } +} + +impl Drop for ThreadPool { + fn drop(&mut self) { + drop(self.job_publish.lock().expect("This should not break").take()); + + for worker in &mut self.workers { + debug!("Shutting down worker {}", worker.id); + + if let Some(thread) = worker.thread.take() { + match thread.join() { + Ok(_) => { + trace!("Worker {} shut down", worker.id); + } + Err(_) => { + warn!("Worker {} panicked", worker.id); + } + } + } + } + + if let Some(thread) = self.thread.take() { + match thread.join() { + Ok(_) => { + trace!("ThreadPool shut down"); + } + Err(_) => { + warn!("ThreadPool worker panicked"); + } + } + } + } +} diff --git a/src/stages/analyze.rs b/src/stages/analyze.rs new file mode 100644 index 0000000..462034e --- /dev/null +++ b/src/stages/analyze.rs @@ -0,0 +1,11 @@ + +pub mod output { + mod dupset_file; + + pub use dupset_file::*; +} + +pub mod cmd; +mod worker; + +pub mod intermediary_analysis_data; diff --git a/src/cmd/analyze.rs b/src/stages/analyze/cmd.rs similarity index 75% rename from src/cmd/analyze.rs rename to src/stages/analyze/cmd.rs index 8a374ad..4106bd1 100644 --- a/src/cmd/analyze.rs +++ b/src/stages/analyze/cmd.rs @@ -1,3 +1,4 @@ +use crate::stages::analyze::worker::AnalysisIntermediaryFile; use std::collections::HashMap; use std::fs; use std::io::Write; @@ -7,17 +8,40 @@ use std::sync::{Arc, Mutex}; use std::time::Duration; use anyhow::{anyhow, Result}; use log::{error, info, trace}; -use crate::analyze::analysis::{AnalysisFile, ResultEntryRef}; -use crate::analyze::worker::{AnalysisJob, AnalysisResult, MarkedIntermediaryFile, WorkerArgument}; -use crate::data::{GeneralHash, SaveFile, SaveFileEntry, SaveFileEntryType}; -use crate::threadpool::ThreadPool; +use crate::hash::{GeneralHash, GeneralHashType}; +use crate::pool::ThreadPool; +use crate::stages::analyze::intermediary_analysis_data::AnalysisFile; +use crate::stages::analyze::output::{DupSetEntryRef}; +use crate::stages::analyze::worker::{AnalysisJob, AnalysisResult, worker_run, AnalysisWorkerArgument}; +use crate::stages::build::output::{HashTreeFile, HashTreeFileEntry, HashTreeFileEntryType}; +use crate::utils::NullWriter; +/// The settings for the analysis cmd. +/// +/// # Fields +/// * `input` - The input file to analyze. +/// * `output` - The output file to write the results to. +/// * `threads` - The number of threads to use for the analysis. If None, the number of threads is equal to the number of CPUs. pub struct AnalysisSettings { pub input: PathBuf, pub output: PathBuf, pub threads: Option, } +/// Run the analysis cmd. +/// +/// # Arguments +/// * `analysis_settings` - The settings for the analysis cmd. +/// +/// # Returns +/// Nothing +/// +/// # Errors +/// * If the input file cannot be opened. +/// * If the output file cannot be opened. +/// * If the header of the input file cannot be loaded. +/// * If an error occurs while loading entries from the input file. +/// * If writing to the output file fails. 
pub fn run(analysis_settings: AnalysisSettings) -> Result<()> { let mut input_file_options = fs::File::options(); input_file_options.read(true); @@ -43,9 +67,10 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> { }; let mut input_buf_reader = std::io::BufReader::new(&input_file); + let mut null_out_writer = NullWriter::new(); let mut output_buf_writer = std::io::BufWriter::new(&output_file); - let mut save_file = SaveFile::new(&mut output_buf_writer, &mut input_buf_reader, true, true, true); + let mut save_file = HashTreeFile::new(&mut null_out_writer, &mut input_buf_reader, GeneralHashType::NULL, true, true, true); save_file.load_header()?; save_file.load_all_entries_no_filter()?; @@ -56,7 +81,7 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> { let mut all_files = save_file.all_entries; for (path, entry) in file_by_path.iter_mut() { - file_by_path_marked.insert(path.clone(), MarkedIntermediaryFile { + file_by_path_marked.insert(path.clone(), AnalysisIntermediaryFile { saved_file_entry: Arc::clone(entry), file: Arc::new(Mutex::new(None)), }); @@ -82,12 +107,12 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> { let mut args = Vec::with_capacity(analysis_settings.threads.unwrap_or_else(|| num_cpus::get())); for _ in 0..args.capacity() { - args.push(WorkerArgument { + args.push(AnalysisWorkerArgument { file_by_path: Arc::clone(&file_by_path) }); } - let pool: ThreadPool = ThreadPool::new(args, crate::cmd::analyze::worker::worker_run); + let pool: ThreadPool = ThreadPool::new(args, worker_run); for entry in &all_files { pool.publish(AnalysisJob::new(Arc::clone(entry))); @@ -171,12 +196,17 @@ pub fn run(analysis_settings: AnalysisSettings) -> Result<()> { Ok(()) } +/// Used to find duplicates of entries in the hash tree file. #[derive(Debug, PartialEq, Hash, Eq)] struct SetKey<'a> { size: u64, - ftype: &'a SaveFileEntryType, + ftype: &'a HashTreeFileEntryType, + children: &'a Vec, } -fn write_result_entry(file: &AnalysisFile, file_by_hash: &HashMap>>, output_buf_writer: &mut std::io::BufWriter<&fs::File>) -> u64 { +/// Write the result entry to the output file. Find all duplicates of the file and write them to the output file. +/// If called for every file, it will write all duplicates to the output file. +/// Writing each file only once +fn write_result_entry(file: &AnalysisFile, file_by_hash: &HashMap>>, output_buf_writer: &mut std::io::BufWriter<&fs::File>) -> u64 { let hash = match file { AnalysisFile::File(info) => &info.content_hash, AnalysisFile::Directory(info) => &info.content_hash, @@ -186,12 +216,13 @@ fn write_result_entry(file: &AnalysisFile, file_by_hash: &HashMap> = HashMap::new(); + let mut sets: HashMap> = HashMap::new(); for file in file_by_hash.get(hash).unwrap() { sets.entry(SetKey { size: file.size, - ftype: &file.file_type + ftype: &file.file_type, + children: &file.children, }).or_insert(Vec::new()).push(file); } @@ -212,7 +243,7 @@ fn write_result_entry(file: &AnalysisFile, file_by_hash: &HashMap &Mutex>> { + match self { + AnalysisFile::File(info) => &info.parent, + AnalysisFile::Directory(info) => &info.parent, + AnalysisFile::Symlink(info) => &info.parent, + AnalysisFile::Other(info) => &info.parent, + } + } + + /// Get the path of the file. + /// + /// # Returns + /// The path of the file. 
+ pub fn path(&self) -> &FilePath { + match self { + AnalysisFile::File(info) => &info.path, + AnalysisFile::Directory(info) => &info.path, + AnalysisFile::Symlink(info) => &info.path, + AnalysisFile::Other(info) => &info.path, + } + } +} + +/// File information part of [AnalysisFile]. +/// +/// # Fields +/// * `path` - The path of the file. +/// * `content_hash` - The hash of the file content. +/// * `parent` - The parent of the file. +#[derive(Debug, Serialize, Deserialize)] +pub struct AnalysisFileInformation { + pub path: FilePath, + pub content_hash: GeneralHash, + pub parent: Mutex>>, +} + +/// Directory information part of [AnalysisFile]. +/// +/// # Fields +/// * `path` - The path of the directory. +/// * `content_hash` - The hash of the directory content. +/// * `children` - The children of the directory. +/// * `parent` - The parent of the directory. +#[derive(Debug, Serialize, Deserialize)] +pub struct AnalysisDirectoryInformation { + pub path: FilePath, + pub content_hash: GeneralHash, + pub children: Mutex>>, + pub parent: Mutex>>, +} + +/// Symlink information part of [AnalysisFile]. +/// +/// # Fields +/// * `path` - The path of the symlink. +/// * `content_hash` - The hash of the symlink content. +/// * `parent` - The parent of the symlink. +#[derive(Debug, Serialize, Deserialize)] +pub struct AnalysisSymlinkInformation { + pub path: FilePath, + pub content_hash: GeneralHash, + pub parent: Mutex>>, +} + +/// Other information part of [AnalysisFile]. +/// +/// # Fields +/// * `path` - The path of the file. +/// * `parent` - The parent of the file. +#[derive(Debug, Serialize, Deserialize)] +pub struct AnalysisOtherInformation { + pub path: FilePath, + pub parent: Mutex>>, +} diff --git a/src/stages/analyze/output/dupset_file.rs b/src/stages/analyze/output/dupset_file.rs new file mode 100644 index 0000000..9cb0da6 --- /dev/null +++ b/src/stages/analyze/output/dupset_file.rs @@ -0,0 +1,19 @@ +use serde::{Serialize}; +use crate::hash::GeneralHash; +use crate::path::FilePath; +use crate::stages::build::output::HashTreeFileEntryType; + +/// The result of the analysis worker. A duplicate set entry. +/// +/// # Fields +/// * `ftype` - The type of the file. +/// * `size` - The size of the file. +/// * `hash` - The hash of the file content. +/// * `conflicting` - The conflicting files. +#[derive(Debug, Serialize)] +pub struct DupSetEntryRef<'a, 'b, 'c> { + pub ftype: &'a HashTreeFileEntryType, + pub size: u64, + pub hash: &'b GeneralHash, + pub conflicting: Vec<&'c FilePath>, +} diff --git a/src/cmd/analyze/worker.rs b/src/stages/analyze/worker.rs similarity index 54% rename from src/cmd/analyze/worker.rs rename to src/stages/analyze/worker.rs index 9a3432e..3d6fbc3 100644 --- a/src/cmd/analyze/worker.rs +++ b/src/stages/analyze/worker.rs @@ -3,27 +3,51 @@ use std::ops::Deref; use std::sync::{Arc, Mutex}; use std::sync::mpsc::Sender; use log::error; -use crate::data::{FilePath, JobTrait, ResultTrait, SaveFileEntry, SaveFileEntryType}; -use super::analysis::{DirectoryInformation, AnalysisFile, FileInformation, OtherInformation, SymlinkInformation}; +use crate::path::FilePath; +use crate::pool::{JobTrait, ResultTrait}; +use crate::stages::analyze::intermediary_analysis_data::{AnalysisFile, AnalysisDirectoryInformation, AnalysisFileInformation, AnalysisOtherInformation, AnalysisSymlinkInformation}; +use crate::stages::build::output::{HashTreeFileEntry, HashTreeFileEntryType}; +/// The intermediary file for the analysis worker. 
+/// +/// # Fields +/// * `saved_file_entry` - A saved file entry from the hash tree file. +/// * `file` - Analysis result of the file. Processed by a worker. #[derive(Debug)] -pub struct MarkedIntermediaryFile { - pub saved_file_entry: Arc, +pub struct AnalysisIntermediaryFile { + pub saved_file_entry: Arc, pub file: Arc>>>, } -pub struct WorkerArgument { - pub file_by_path: Arc>, +/// The argument for the analysis worker main thread. +/// Files from the hash tree file are stored in a hash map. +/// +/// # Fields +/// * `file_by_path` - A hash map of [FilePath] -> [AnalysisIntermediaryFile]. +pub struct AnalysisWorkerArgument { + pub file_by_path: Arc>, } +/// The job for the analysis worker. +/// +/// # Fields +/// * `id` - The id of the job. +/// * `file` - The file to analyze. #[derive(Debug)] pub struct AnalysisJob { id: usize, - pub file: Arc, + pub file: Arc, } impl AnalysisJob { - pub fn new(file: Arc) -> Self { + /// Create a new analysis job. + /// + /// # Arguments + /// * `file` - The file to analyze. + /// + /// # Returns + /// The analysis job. + pub fn new(file: Arc) -> Self { Self { id: new_job_counter_id(), file, @@ -31,6 +55,16 @@ impl AnalysisJob { } } +impl JobTrait for AnalysisJob { + /// Get the job id. + /// + /// # Returns + /// The job id. + fn job_id(&self) -> usize { + self.id + } +} + static JOB_COUNTER: Mutex = Mutex::new(0); fn new_job_counter_id() -> usize { @@ -39,24 +73,24 @@ fn new_job_counter_id() -> usize { (*counter).clone() } -impl JobTrait for AnalysisJob { - fn job_id(&self) -> usize { - self.id - } -} +/// The result for the analysis worker. #[derive(Debug)] -pub struct AnalysisResult { - -} - -impl ResultTrait for AnalysisResult { - -} +pub struct AnalysisResult {} +impl ResultTrait for AnalysisResult {} -fn parent_file<'a, 'b>(file: &'b MarkedIntermediaryFile, arg: &'a WorkerArgument) -> Option<(&'a Arc>>>, FilePath)> { +/// Get the parent file of a file. Searches the arg.cache for the parent file. +/// +/// # Arguments +/// * `file` - The file to get the parent of. +/// * `arg` - The argument for the worker thread. +/// +/// # Returns +/// The parent file and the parent path. +/// If the parent file is not present, return None. +fn parent_file<'a, 'b>(file: &'b AnalysisIntermediaryFile, arg: &'a AnalysisWorkerArgument) -> Option<(&'a Arc>>>, FilePath)> { match file.saved_file_entry.path.parent() { None => None, Some(parent_path) => { @@ -73,35 +107,42 @@ fn parent_file<'a, 'b>(file: &'b MarkedIntermediaryFile, arg: &'a WorkerArgument } } -fn recursive_process_file(path: &FilePath, arg: &WorkerArgument) { +/// Recursively process a file. Iterates over the file and its parent files until +/// the parent file is present or the root is reached. +/// +/// # Arguments +/// * `id` - The id of the worker. +/// * `path` - The path of the file to process. +/// * `arg` - The argument for the worker thread. 
+fn recursive_process_file(id: usize, path: &FilePath, arg: &AnalysisWorkerArgument) { let marked_file = arg.file_by_path.get(path); let mut attach_parent = None; if let Some(file) = marked_file { let result = match file.saved_file_entry.file_type { - SaveFileEntryType::File => { - AnalysisFile::File(FileInformation { + HashTreeFileEntryType::File => { + AnalysisFile::File(AnalysisFileInformation { path: file.saved_file_entry.path.clone(), content_hash: file.saved_file_entry.hash.clone(), parent: Mutex::new(None), }) }, - SaveFileEntryType::Symlink => { - AnalysisFile::Symlink(SymlinkInformation { + HashTreeFileEntryType::Symlink => { + AnalysisFile::Symlink(AnalysisSymlinkInformation { path: file.saved_file_entry.path.clone(), content_hash: file.saved_file_entry.hash.clone(), parent: Mutex::new(None), }) }, - SaveFileEntryType::Other => { - AnalysisFile::Other(OtherInformation { + HashTreeFileEntryType::Other => { + AnalysisFile::Other(AnalysisOtherInformation { path: file.saved_file_entry.path.clone(), parent: Mutex::new(None), }) }, - SaveFileEntryType::Directory => { - AnalysisFile::Directory(DirectoryInformation { + HashTreeFileEntryType::Directory => { + AnalysisFile::Directory(AnalysisDirectoryInformation { path: file.saved_file_entry.path.clone(), content_hash: file.saved_file_entry.hash.clone(), children: Mutex::new(Vec::new()), @@ -120,7 +161,7 @@ fn recursive_process_file(path: &FilePath, arg: &WorkerArgument) { } }, Err(err) => { - panic!("Failed to lock file: {}", err); + panic!("[{}] Failed to lock file: {}", id, err); } } @@ -130,17 +171,17 @@ fn recursive_process_file(path: &FilePath, arg: &WorkerArgument) { } if let Some((result, parent, parent_path)) = attach_parent { - match add_to_parent_as_child(parent, &result) { + match add_to_parent_as_child(id, parent, &result) { AddToParentResult::Ok => { return; }, AddToParentResult::ParentDoesNotExist => { // parent does not exist // create it - recursive_process_file(&parent_path, arg); + recursive_process_file(id, &parent_path, arg); // try to read to parent again - match add_to_parent_as_child(parent, &result) { + match add_to_parent_as_child(id, parent, &result) { AddToParentResult::Ok => { return; }, AddToParentResult::ParentDoesNotExist => { - error!("Parent still does not exist"); + error!("[{}] Parent still does not exist", id); return; }, AddToParentResult::Error => { @@ -155,13 +196,28 @@ fn recursive_process_file(path: &FilePath, arg: &WorkerArgument) { } } +/// The result of adding a file to a parent as child, see [add_to_parent_as_child] +/// +/// # Variants +/// * `Ok` - The operation was successful. +/// * `ParentDoesNotExist` - The parent does not exist. +/// * `Error` - An error occurred during the operation enum AddToParentResult { Ok, ParentDoesNotExist, Error, } -fn add_to_parent_as_child(parent: &Arc>>>, child: &Arc) -> AddToParentResult { +/// Add a file to a parent as a child. +/// +/// # Arguments +/// * `id` - The id of the worker. +/// * `parent` - The parent file. +/// * `child` - The child file. +/// +/// # Returns +/// The result of the operation. 
+fn add_to_parent_as_child(id: usize, parent: &Arc>>>, child: &Arc) -> AddToParentResult { match parent.lock() { Ok(guard) => { // exclusive access to parent file @@ -175,7 +231,7 @@ fn add_to_parent_as_child(parent: &Arc>>>, child: *guard = Some(Arc::downgrade(parent)); }, Err(err) => { - error!("Failed to lock parent: {}", err); + error!("[{}] Failed to lock parent: {}", id, err); return AddToParentResult::Error; } } @@ -189,13 +245,13 @@ fn add_to_parent_as_child(parent: &Arc>>>, child: AddToParentResult::Ok }, Err(err) => { - error!("Failed to lock children: {}", err); + error!("[{}] Failed to lock children: {}", id, err); AddToParentResult::Error } } }, _ => { - error!("Parent is not a directory"); + error!("[{}] Parent is not a directory", id); AddToParentResult::Error } } @@ -207,12 +263,15 @@ fn add_to_parent_as_child(parent: &Arc>>>, child: } }, Err(err) => { - error!("Failed to lock file: {}", err); + error!("[{}] Failed to lock file: {}", id, err); AddToParentResult::Error } } } -pub fn worker_run(_id: usize, job: AnalysisJob, _result_publish: &Sender, _job_publish: &Sender, arg: &mut WorkerArgument) { - recursive_process_file(&job.file.path, arg); +/// The main function for the analysis worker. +/// +/// # Arguments +pub fn worker_run(id: usize, job: AnalysisJob, _result_publish: &Sender, _job_publish: &Sender, arg: &mut AnalysisWorkerArgument) { + recursive_process_file(id, &job.file.path, arg); } diff --git a/src/stages/build.rs b/src/stages/build.rs new file mode 100644 index 0000000..00862f9 --- /dev/null +++ b/src/stages/build.rs @@ -0,0 +1,17 @@ + +pub mod output { + pub mod converter; + mod hashtreefile; + + pub use hashtreefile::*; +} + +pub mod cmd { + mod cmd; + pub mod job; + pub mod worker; + + pub use cmd::*; +} + +pub mod intermediary_build_data; diff --git a/src/cmd/build.rs b/src/stages/build/cmd/cmd.rs similarity index 63% rename from src/cmd/build.rs rename to src/stages/build/cmd/cmd.rs index e62b6f8..bba9f5e 100644 --- a/src/cmd/build.rs +++ b/src/stages/build/cmd/cmd.rs @@ -3,41 +3,46 @@ use std::fs; use std::path::{PathBuf}; use std::sync::Arc; use anyhow::{anyhow, Result}; -use serde::Serialize; -use crate::build::worker::{worker_run, WorkerArgument}; -use crate::data::{FilePath, GeneralHashType, Job, PathTarget, ResultTrait, File, SaveFile, SaveFileEntryRef, SaveFileEntry}; -use crate::threadpool::ThreadPool; - -mod worker; +use crate::hash::GeneralHashType; +use crate::path::{FilePath}; +use crate::pool::ThreadPool; +use crate::stages::build::cmd::job::{BuildJob, JobResult}; +use crate::stages::build::cmd::worker::{worker_run, WorkerArgument}; +use crate::stages::build::output::{HashTreeFile, HashTreeFileEntry, HashTreeFileEntryRef}; +/// The settings for the build command. +/// +/// # Fields +/// * `directory` - The directory to build. +/// * `follow_symlinks` - Whether to follow symlinks when traversing the file system. +/// * `output` - The output file to write the hash tree to. +/// * `threads` - The number of threads to use for building the hash tree. None = number of logical CPUs. +/// * `hash_type` - The hash algorithm to use for hashing files. +/// * `continue_file` - Whether to continue an existing hash tree file. 
pub struct BuildSettings { pub directory: PathBuf, // pub into_archives: bool, pub follow_symlinks: bool, pub output: PathBuf, - pub absolute_paths: bool, + // pub absolute_paths: bool, pub threads: Option, pub hash_type: GeneralHashType, pub continue_file: bool, } -#[derive(Debug, Serialize, Clone)] -struct JobResultContent { - already_cached: bool, - content: File, -} - -#[derive(Debug, Serialize, Clone)] -enum JobResult { - Final(JobResultContent), - Intermediate(JobResultContent), -} - -impl ResultTrait for JobResult { - -} - +/// Runs the build command. Hashes a directory and produces a hash tree file. +/// +/// # Arguments +/// * `build_settings` - The settings for the build command. +/// +/// # Returns +/// Nothing +/// +/// # Errors +/// * If the output file cannot be opened. +/// * If the header cannot be loaded from the output file (if the file is continued). +/// * If the output file cannot be written to. pub fn run( build_settings: BuildSettings, ) -> Result<()> { @@ -63,7 +68,7 @@ pub fn run( let mut result_in = std::io::BufReader::new(&result_file); let mut result_out = std::io::BufWriter::new(&result_file); - let mut save_file = SaveFile::new(&mut result_out, &mut result_in, false, true, false); + let mut save_file = HashTreeFile::new(&mut result_out, &mut result_in, build_settings.hash_type, false, true, false); match save_file.load_header() { Ok(_) => {}, Err(err) => { @@ -75,6 +80,7 @@ pub fn run( } } + // load all existing entries from the hash tree file match save_file.load_all_entries_no_filter() { Ok(_) => {}, Err(err) => { @@ -86,7 +92,7 @@ pub fn run( save_file.empty_file_by_hash(); save_file.empty_entry_list(); - let mut file_by_hash: HashMap = HashMap::with_capacity(save_file.file_by_hash.len()); + let mut file_by_hash: HashMap = HashMap::with_capacity(save_file.file_by_hash.len()); save_file.file_by_path.drain().for_each(|(k, v)| { file_by_hash.insert(k, Arc::into_inner(v).expect("There should be no further references to the entry")); }); @@ -103,10 +109,10 @@ pub fn run( }); } - let pool: ThreadPool = ThreadPool::new(args, worker_run); + let pool: ThreadPool = ThreadPool::new(args, worker_run); - let root_file = FilePath::from_path(build_settings.directory, PathTarget::File); - let root_job = Job::new(None, root_file); + let root_file = FilePath::from_realpath(build_settings.directory); + let root_job = BuildJob::new(None, root_file); pool.publish(root_job); @@ -124,7 +130,7 @@ pub fn run( }; if !result.already_cached { - let entry = SaveFileEntryRef::from(&result.content); + let entry = HashTreeFileEntryRef::from(&result.content); save_file.write_entry_ref(&entry)?; } diff --git a/src/stages/build/cmd/job.rs b/src/stages/build/cmd/job.rs new file mode 100644 index 0000000..4b9d7ce --- /dev/null +++ b/src/stages/build/cmd/job.rs @@ -0,0 +1,116 @@ +use std::sync::{Arc, Mutex}; +use serde::Serialize; +use crate::stages::build::intermediary_build_data::BuildFile; +use crate::path::FilePath; +use crate::pool::{JobTrait, ResultTrait}; + +pub type SharedBuildJob = Arc; + +static JOB_COUNTER: Mutex = Mutex::new(0); + +fn new_job_counter_id() -> usize { + let mut counter = JOB_COUNTER.lock().expect("Failed to lock job counter"); + *counter += 1; + (*counter).clone() +} + +/// The state of a build job. Used to track the state of a directory process job. +/// +/// # Fields +/// * `NotProcessed` - The job has not been processed yet. +/// * `Analyzed` - The directory has been expanded and can be analyzed further. 
+#[derive(Debug, Clone, PartialEq, Copy)] +pub enum BuildJobState { + NotProcessed, + Analyzed, +} + +/// A build job. Used to issue a job to hash a file/directory. +/// +/// # Fields +/// * `parent` - The parent job of this job. +/// * `finished_children` - The finished children of this job. +/// * `target_path` - The path of the file/directory to hash. +/// * `state` - The state of the job. +#[derive(Debug)] +pub struct BuildJob { + id: usize, + pub parent: Option, + pub finished_children: Mutex>, + pub target_path: FilePath, + pub state: BuildJobState, +} + +impl BuildJob { + /// Create a new build job. + /// + /// # Arguments + /// * `parent` - The parent job of this job. + /// * `target_path` - The path of the file/directory to hash. + /// + /// # Returns + /// The created build job. + pub fn new(parent: Option, target_path: FilePath) -> Self { + BuildJob { + id: new_job_counter_id(), + parent, + target_path, + state: BuildJobState::NotProcessed, + finished_children: Mutex::new(Vec::new()), + } + } + + /// Get the job id. + /// + /// # Returns + /// The job id. + pub fn job_id(&self) -> usize { + self.id + } + + /// Create and assign a new unique job id. + /// + /// # Returns + /// The build job with the new job id. + pub fn new_job_id(mut self) -> Self { + self.id = new_job_counter_id(); + self + } +} + +impl JobTrait for BuildJob { + /// Get the job id. + /// + /// # Returns + /// The job id. + fn job_id(&self) -> usize { + BuildJob::job_id(self) + } +} + +/// The result of a build job. +/// +/// # Fields +/// * `already_cached` - Whether the content was already cached. +/// * `content` - The content of the job result. +#[derive(Debug, Serialize, Clone)] +pub struct JobResultContent { + pub already_cached: bool, + pub content: BuildFile, +} + +/// A job result. +/// +/// # Fields +/// * `Final` - The final result of command. Returned if the job has no parent. +/// * `Intermediate` - An intermediate result of a command. Returned if the job has a parent. 
+#[derive(Debug, Serialize, Clone)] +pub enum JobResult { + Final(JobResultContent), + Intermediate(JobResultContent), +} + +impl ResultTrait for JobResult { + +} + diff --git a/src/cmd/build/worker.rs b/src/stages/build/cmd/worker.rs similarity index 58% rename from src/cmd/build/worker.rs rename to src/stages/build/cmd/worker.rs index ee0df35..e0bbcae 100644 --- a/src/cmd/build/worker.rs +++ b/src/stages/build/cmd/worker.rs @@ -1,3 +1,4 @@ +use crate::stages::build::intermediary_build_data::{BuildFile, BuildOtherInformation, BuildStubInformation}; use std::collections::HashMap; use std::fs; use std::sync::Arc; @@ -5,25 +6,41 @@ use std::sync::mpsc::Sender; use std::time::SystemTime; use anyhow::anyhow; use log::{error, info, trace, warn}; -use crate::build::{JobResult, JobResultContent}; -use crate::build::worker::directory::worker_run_directory; -use crate::build::worker::file::worker_run_file; -use crate::build::worker::other::worker_run_other; -use crate::build::worker::symlink::worker_run_symlink; -use crate::data::{File, FilePath, GeneralHashType, Job, OtherInformation, SaveFileEntry, StubInformation}; +use crate::hash::GeneralHashType; +use crate::path::FilePath; +use crate::stages::build::cmd::job::{BuildJob, JobResult, JobResultContent}; +use crate::stages::build::cmd::worker::directory::worker_run_directory; +use crate::stages::build::cmd::worker::file::worker_run_file; +use crate::stages::build::cmd::worker::other::worker_run_other; +use crate::stages::build::cmd::worker::symlink::worker_run_symlink; +use crate::stages::build::output::HashTreeFileEntry; mod directory; mod file; mod other; mod symlink; +/// The argument for the worker main thread. +/// +/// # Fields +/// * `follow_symlinks` - Whether to follow symlinks when traversing the file system. +/// * `hash_type` - The hash algorithm to use for hashing files. +/// * `save_file_by_path` - A hash map of [FilePath] -> [HashTreeFileEntry]. pub struct WorkerArgument { pub follow_symlinks: bool, pub hash_type: GeneralHashType, - pub save_file_by_path: Arc>, + pub save_file_by_path: Arc>, } -pub fn worker_run(id: usize, job: Job, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { +/// Main function for the worker thread. +/// +/// # Arguments +/// * `id` - The id of the worker. +/// * `job` - The job to process. +/// * `result_publish` - The channel to publish the result to. +/// * `job_publish` - The channel to publish new jobs to. +/// * `arg` - The argument for the worker thread. +pub fn worker_run(id: usize, job: BuildJob, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { let path = job.target_path.resolve_file(); let path = match path { Ok(file) => file, @@ -82,6 +99,11 @@ pub fn worker_run(id: usize, job: Job, result_publish: &Sender, job_p } } +/// Publish a result to the result channel. +/// Processes the error if the result could not be published. +/// +/// # Error +/// Never, issues a warning instead fn worker_publish_result(id: usize, result_publish: &Sender, result: JobResult) { match result_publish.send(result) { Ok(_) => {}, @@ -91,15 +113,34 @@ fn worker_publish_result(id: usize, result_publish: &Sender, result: } } -fn worker_create_error(path: FilePath, modified: u64, size: u64) -> File { - File::Other(OtherInformation { +/// Create a [File::Other] with the given information. +/// Used when an error occurs. +/// +/// # Arguments +/// * `path` - The path of the file. +/// * `modified` - The modified date of the file. +/// * `size` - The size of the file. 
+/// +/// # Returns +/// The created [File::Other]. +fn worker_create_error(path: FilePath, modified: u64, size: u64) -> BuildFile { + BuildFile::Other(BuildOtherInformation { path, modified, content_size: size, }) } -fn worker_publish_new_job(id: usize, job_publish: &Sender, job: Job) { +/// Publish a new job. +/// +/// # Arguments +/// * `id` - The id of the worker. +/// * `job_publish` - The channel to publish the job to. +/// * `job` - The job to publish. +/// +/// # Error +/// Never, issues a warning instead +fn worker_publish_new_job(id: usize, job_publish: &Sender, job: BuildJob) { match job_publish.send(job) { Ok(_) => {}, Err(e) => { @@ -108,7 +149,17 @@ fn worker_publish_new_job(id: usize, job_publish: &Sender, job: Job) { } } -fn worker_publish_result_or_trigger_parent(id: usize, cached: bool, result: File, job: Job, result_publish: &Sender, job_publish: &Sender, _arg: &mut WorkerArgument) { +/// Publish a result and trigger the parent job. +/// +/// # Arguments +/// * `id` - The id of the worker. +/// * `cached` - Whether the file is already cached. +/// * `result` - The result to publish. +/// * `job` - The job that was processed. +/// * `result_publish` - The channel to publish the result to. +/// * `job_publish` - The channel to publish new jobs to. +/// * `arg` - The argument for the worker thread. +fn worker_publish_result_or_trigger_parent(id: usize, cached: bool, result: BuildFile, job: BuildJob, result_publish: &Sender, job_publish: &Sender, _arg: &mut WorkerArgument) { let parent_job; let hash; @@ -127,7 +178,7 @@ fn worker_publish_result_or_trigger_parent(id: usize, cached: bool, result: File match parent_job.finished_children.lock() { Ok(mut finished) => { - finished.push(File::Stub(StubInformation { + finished.push(BuildFile::Stub(BuildStubInformation { path: job.target_path, content_hash: hash, })); @@ -149,6 +200,14 @@ fn worker_publish_result_or_trigger_parent(id: usize, cached: bool, result: File } } -fn worker_fetch_savedata<'a, 'b>(args: &'a WorkerArgument, path: &'b FilePath) -> Option<&'a SaveFileEntry> { +/// Fetch the saved data for a file. +/// +/// # Arguments +/// * `args` - The argument for the worker thread. +/// * `path` - The path of the file to fetch the saved data for. +/// +/// # Returns +/// The saved data for the file if it exists. 
+fn worker_fetch_savedata<'a, 'b>(args: &'a WorkerArgument, path: &'b FilePath) -> Option<&'a HashTreeFileEntry> { args.save_file_by_path.get(path) } diff --git a/src/cmd/build/worker/directory.rs b/src/stages/build/cmd/worker/directory.rs similarity index 75% rename from src/cmd/build/worker/directory.rs rename to src/stages/build/cmd/worker/directory.rs index e0830aa..397845e 100644 --- a/src/cmd/build/worker/directory.rs +++ b/src/stages/build/cmd/worker/directory.rs @@ -5,16 +5,28 @@ use std::path::PathBuf; use std::sync::Arc; use std::sync::mpsc::Sender; use log::{error, trace}; -use crate::build::JobResult; -use crate::build::worker::{worker_create_error, worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; -use crate::data::{DirectoryInformation, File, GeneralHash, Job, JobState, SaveFileEntryType}; -use crate::utils; - -pub fn worker_run_directory(path: PathBuf, modified: u64, size: u64, id: usize, mut job: Job, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { +use crate::stages::build::intermediary_build_data::{BuildDirectoryInformation, BuildFile}; +use crate::hash::GeneralHash; +use crate::stages::build::cmd::job::{BuildJob, BuildJobState, JobResult}; +use crate::stages::build::cmd::worker::{worker_create_error, worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; +use crate::stages::build::output::HashTreeFileEntryType; + +/// Analyze a directory. +/// +/// # Arguments +/// * `path` - The path to the directory. +/// * `modified` - The last modified time of the directory. +/// * `size` - The size of the directory (given by fs::metadata). +/// * `id` - The id of the worker. +/// * `job` - The job to process. +/// * `result_publish` - The channel to publish the result to. +/// * `job_publish` - The channel to publish new jobs to. +/// * `arg` - The argument for the worker thread. 
+pub fn worker_run_directory(path: PathBuf, modified: u64, size: u64, id: usize, mut job: BuildJob, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { trace!("[{}] analyzing directory {} > {:?}", id, &job.target_path, path); match job.state { - JobState::NotProcessed => { + BuildJobState::NotProcessed => { let read_dir = fs::read_dir(&path); let read_dir = match read_dir { Ok(read_dir) => read_dir, @@ -41,17 +53,17 @@ pub fn worker_run_directory(path: PathBuf, modified: u64, size: u64, id: usize, let mut children = Vec::new(); for entry in read_dir { - let child_path = job.target_path.child_real(entry.file_name()); + let child_path = job.target_path.child(entry.file_name()); children.push(child_path); } - job.state = JobState::Analyzed; + job.state = BuildJobState::Analyzed; let parent_job = Arc::new(job); let mut jobs = Vec::with_capacity(children.len()); for child in children { - let job = Job::new(Some(Arc::clone(&parent_job)), child); + let job = BuildJob::new(Some(Arc::clone(&parent_job)), child); jobs.push(job); } @@ -66,7 +78,7 @@ pub fn worker_run_directory(path: PathBuf, modified: u64, size: u64, id: usize, } } }, - JobState::Analyzed => { + BuildJobState::Analyzed => { let mut hash = GeneralHash::from_type(arg.hash_type); let mut children = Vec::new(); @@ -81,14 +93,14 @@ pub fn worker_run_directory(path: PathBuf, modified: u64, size: u64, id: usize, // query cache match worker_fetch_savedata(arg, &job.target_path) { Some(found) => { - if found.file_type == SaveFileEntryType::Directory && found.modified == modified && found.size == finished.len() as u64 { + if found.file_type == HashTreeFileEntryType::Directory && found.modified == modified && found.size == finished.len() as u64 { if found.children.len() == finished.len() && found.children.iter().zip(finished.iter().map(|e| e.get_content_hash())).all(|(a, b)| a == b) { trace!("Directory {:?} is already in save file", path); let mut children = Vec::new(); children.append(finished.deref_mut()); - let file = File::Directory(DirectoryInformation { + let file = BuildFile::Directory(BuildDirectoryInformation { path: job.target_path.clone(), modified, content_hash: found.hash.clone(), @@ -104,7 +116,7 @@ pub fn worker_run_directory(path: PathBuf, modified: u64, size: u64, id: usize, } if cached_entry.is_none() { - match utils::hash_directory(finished.iter(), &mut hash) { + match hash.hash_directory(finished.iter()) { Ok(_) => {}, Err(err) => { error = true; @@ -129,7 +141,7 @@ pub fn worker_run_directory(path: PathBuf, modified: u64, size: u64, id: usize, return; } - let file = File::Directory(DirectoryInformation { + let file = BuildFile::Directory(BuildDirectoryInformation { path: job.target_path.clone(), modified, content_hash: hash, diff --git a/src/cmd/build/worker/file.rs b/src/stages/build/cmd/worker/file.rs similarity index 64% rename from src/cmd/build/worker/file.rs rename to src/stages/build/cmd/worker/file.rs index a0c841c..0e1e777 100644 --- a/src/cmd/build/worker/file.rs +++ b/src/stages/build/cmd/worker/file.rs @@ -1,20 +1,33 @@ +use crate::stages::build::cmd::worker::GeneralHashType; +use crate::hash::GeneralHash; use std::fs; use std::path::PathBuf; use std::sync::mpsc::Sender; use log::{error, trace}; -use crate::build::JobResult; -use crate::build::worker::{worker_create_error, worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; -use crate::data::{GeneralHash, Job, GeneralHashType, File, FileInformation, SaveFileEntryType}; -use crate::utils; +use 
crate::stages::build::intermediary_build_data::{BuildFile, BuildFileInformation}; +use crate::stages::build::cmd::job::{BuildJob, JobResult}; +use crate::stages::build::cmd::worker::{worker_create_error, worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; +use crate::stages::build::output::HashTreeFileEntryType; -pub fn worker_run_file(path: PathBuf, modified: u64, size: u64, id: usize, job: Job, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { +/// Analyze a file. +/// +/// # Arguments +/// * `path` - The path to the file. +/// * `modified` - The last modified time of the file. +/// * `size` - The size of the file (given by fs::metadata). +/// * `id` - The id of the worker. +/// * `job` - The job to process. +/// * `result_publish` - The channel to publish the result to. +/// * `job_publish` - The channel to publish new jobs to. +/// * `arg` - The argument for the worker thread. +pub fn worker_run_file(path: PathBuf, modified: u64, size: u64, id: usize, job: BuildJob, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { trace!("[{}] analyzing file {} > {:?}", id, &job.target_path, path); match worker_fetch_savedata(arg, &job.target_path) { Some(found) => { - if found.file_type == SaveFileEntryType::File && found.modified == modified && found.size == size { + if found.file_type == HashTreeFileEntryType::File && found.modified == modified && found.size == size { trace!("File {:?} is already in save file", path); - worker_publish_result_or_trigger_parent(id, true, File::File(FileInformation { + worker_publish_result_or_trigger_parent(id, true, BuildFile::File(BuildFileInformation { path: job.target_path.clone(), modified, content_hash: found.hash.clone(), @@ -36,7 +49,7 @@ pub fn worker_run_file(path: PathBuf, modified: u64, size: u64, id: usize, job: // dont hash file content_size = fs::metadata(&path).map(|metadata| metadata.len()).unwrap_or(0); } else { - match utils::hash_file(&mut reader, &mut hash) { + match hash.hash_file(&mut reader) { Ok(size) => { content_size = size; } @@ -48,7 +61,7 @@ pub fn worker_run_file(path: PathBuf, modified: u64, size: u64, id: usize, job: } } - let file = File::File(FileInformation { + let file = BuildFile::File(BuildFileInformation { path: job.target_path.clone(), modified, content_hash: hash, diff --git a/src/stages/build/cmd/worker/other.rs b/src/stages/build/cmd/worker/other.rs new file mode 100644 index 0000000..113ad5c --- /dev/null +++ b/src/stages/build/cmd/worker/other.rs @@ -0,0 +1,45 @@ +use std::path::PathBuf; +use std::sync::mpsc::Sender; +use log::trace; +use crate::stages::build::intermediary_build_data::{BuildFile, BuildOtherInformation}; +use crate::stages::build::cmd::job::{BuildJob, JobResult}; +use crate::stages::build::cmd::worker::{worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; +use crate::stages::build::output::HashTreeFileEntryType; + +/// Analyze a file that is not a symlink/folder/file. +/// +/// # Arguments +/// * `path` - The path to the file. +/// * `modified` - The last modified time of the file. +/// * `size` - The size of the file (given by fs::metadata). +/// * `id` - The id of the worker. +/// * `job` - The job to process. +/// * `result_publish` - The channel to publish the result to. +/// * `job_publish` - The channel to publish new jobs to. +/// * `arg` - The argument for the worker thread. 
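Before hashing anything, every worker consults the entries loaded from a previous run (`worker_fetch_savedata`); if the recorded kind, modification time and size still match, the stored hash is reused and no file I/O happens. A std-only sketch of that lookup, with a hypothetical `CachedEntry` standing in for `HashTreeFileEntry`:

```rust
use std::collections::HashMap;
use std::path::{Path, PathBuf};

// Hypothetical stand-in for HashTreeFileEntry (only the fields used by the check).
struct CachedEntry { modified: u64, size: u64, hash: String }

/// Returns the cached hash if the on-disk metadata still matches the recorded entry.
fn lookup<'a>(cache: &'a HashMap<PathBuf, CachedEntry>, path: &Path, modified: u64, size: u64) -> Option<&'a str> {
    cache
        .get(path)
        .filter(|e| e.modified == modified && e.size == size)
        .map(|e| e.hash.as_str())
}

fn main() {
    let mut cache = HashMap::new();
    cache.insert(PathBuf::from("/tmp/a"), CachedEntry { modified: 10, size: 3, hash: "abc".into() });
    assert_eq!(lookup(&cache, Path::new("/tmp/a"), 10, 3), Some("abc"));
    assert_eq!(lookup(&cache, Path::new("/tmp/a"), 11, 3), None);
}
```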
+pub fn worker_run_other(path: PathBuf, modified: u64, size: u64, id: usize, job: BuildJob, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { + trace!("[{}] analyzing other {} > {:?}", id, &job.target_path, path); + + match worker_fetch_savedata(arg, &job.target_path) { + Some(found) => { + if found.file_type == HashTreeFileEntryType::Other && found.modified == modified && found.size == size { + trace!("Other {:?} is already in save file", path); + worker_publish_result_or_trigger_parent(id, true, BuildFile::Other(BuildOtherInformation { + path: job.target_path.clone(), + content_size: size, + modified, + }), job, result_publish, job_publish, arg); + return; + } + } + None => {} + } + + let file = BuildFile::Other(BuildOtherInformation { + path: job.target_path.clone(), + content_size: size, + modified, + }); + + worker_publish_result_or_trigger_parent(id, false, file, job, result_publish, job_publish, arg); +} \ No newline at end of file diff --git a/src/cmd/build/worker/symlink.rs b/src/stages/build/cmd/worker/symlink.rs similarity index 65% rename from src/cmd/build/worker/symlink.rs rename to src/stages/build/cmd/worker/symlink.rs index 7643179..1c312e6 100644 --- a/src/cmd/build/worker/symlink.rs +++ b/src/stages/build/cmd/worker/symlink.rs @@ -1,19 +1,31 @@ -use crate::data::{File, SaveFileEntryType, SymlinkInformation}; +use crate::stages::build::cmd::worker::BuildJob; use std::fs; use std::path::PathBuf; use std::sync::mpsc::Sender; use log::{error, trace}; -use crate::build::JobResult; -use crate::build::worker::{worker_create_error, worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; -use crate::data::{GeneralHash, Job}; -use crate::utils; +use crate::stages::build::intermediary_build_data::{BuildFile, BuildSymlinkInformation}; +use crate::hash::GeneralHash; +use crate::stages::build::cmd::job::JobResult; +use crate::stages::build::cmd::worker::{worker_create_error, worker_fetch_savedata, worker_publish_result_or_trigger_parent, WorkerArgument}; +use crate::stages::build::output::HashTreeFileEntryType; -pub fn worker_run_symlink(path: PathBuf, modified: u64, size: u64, id: usize, job: Job, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { +/// Analyze a symlink. +/// +/// # Arguments +/// * `path` - The path to the symlink. +/// * `modified` - The last modified time of the symlink. +/// * `size` - The size of the symlink (given by fs::metdata). +/// * `id` - The id of the worker. +/// * `job` - The job to process. +/// * `result_publish` - The channel to publish the result to. +/// * `job_publish` - The channel to publish new jobs to. +/// * `arg` - The argument for the worker thread. 
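When symlinks are not followed, the symlink's own hash is derived from the path it points to, read with `fs::read_link`. A small sketch of that case using the `sha2` crate (one of the project's optional hash backends); the helper name is made up for the example:

```rust
use std::fs;
use std::io;
use std::path::Path;
use sha2::{Digest, Sha256};

/// Hash the path a symlink points to (not the target's content),
/// mirroring the "not following symlinks" case described above.
fn hash_symlink_target(link: &Path) -> io::Result<Vec<u8>> {
    let target = fs::read_link(link)?;
    let mut hasher = Sha256::new();
    hasher.update(target.as_os_str().as_encoded_bytes());
    Ok(hasher.finalize().to_vec())
}

fn main() {
    // Example path only; prints nothing if the link does not exist.
    if let Ok(digest) = hash_symlink_target(Path::new("/tmp/example-link")) {
        println!("digest is {} bytes", digest.len());
    }
}
```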
+pub fn worker_run_symlink(path: PathBuf, modified: u64, size: u64, id: usize, job: BuildJob, result_publish: &Sender, job_publish: &Sender, arg: &mut WorkerArgument) { trace!("[{}] analyzing symlink {} > {:?}", id, &job.target_path, path); match worker_fetch_savedata(arg, &job.target_path) { Some(found) => { - if found.file_type == SaveFileEntryType::Symlink && found.modified == modified && found.size == size { + if found.file_type == HashTreeFileEntryType::Symlink && found.modified == modified && found.size == size { trace!("Symlink {:?} is already in save file", path); let target_link = fs::read_link(&path); let target_link = match target_link { @@ -24,7 +36,7 @@ pub fn worker_run_symlink(path: PathBuf, modified: u64, size: u64, id: usize, jo return; } }; - worker_publish_result_or_trigger_parent(id, true, File::Symlink(SymlinkInformation { + worker_publish_result_or_trigger_parent(id, true, BuildFile::Symlink(BuildSymlinkInformation { path: job.target_path.clone(), modified, content_hash: found.hash.clone(), @@ -49,7 +61,7 @@ pub fn worker_run_symlink(path: PathBuf, modified: u64, size: u64, id: usize, jo let mut hash = GeneralHash::from_type(arg.hash_type); - match utils::hash_path(&target_link, &mut hash) { + match hash.hash_path(&target_link) { Ok(_) => {}, Err(err) => { error!("Error while hashing symlink target {:?}: {}", target_link, err); @@ -58,7 +70,7 @@ pub fn worker_run_symlink(path: PathBuf, modified: u64, size: u64, id: usize, jo } } - let file = File::Symlink(SymlinkInformation { + let file = BuildFile::Symlink(BuildSymlinkInformation { path: job.target_path.clone(), modified, content_hash: hash, diff --git a/src/stages/build/intermediary_build_data.rs b/src/stages/build/intermediary_build_data.rs new file mode 100644 index 0000000..fbd047e --- /dev/null +++ b/src/stages/build/intermediary_build_data.rs @@ -0,0 +1,182 @@ +use std::path::{PathBuf}; +use serde::{Deserialize, Serialize}; +use crate::hash::GeneralHash; +use crate::path::FilePath; + +/// Information about an analyzed file. +/// +/// # Fields +/// * `path` - The path of the file. +/// * `modified` - The last modification time of the file. +/// * `content_hash` - The hash of the file content. +/// * `content_size` - The size of the file content. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BuildFileInformation { + pub path: FilePath, + pub modified: u64, + pub content_hash: GeneralHash, + pub content_size: u64, +} + +/// Information about an analyzed directory. +/// +/// # Fields +/// * `path` - The path of the directory. +/// * `modified` - The last modification time of the directory. +/// * `content_hash` - The hash of the directory content. +/// * `number_of_children` - The number of children in the directory. +/// * `children` - The children of the directory. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BuildDirectoryInformation { + pub path: FilePath, + pub modified: u64, + pub content_hash: GeneralHash, + pub number_of_children: u64, + pub children: Vec, +} + +/// Information about an analyzed symlink. +/// +/// # Fields +/// * `path` - The path of the symlink. +/// * `modified` - The last modification time of the symlink. +/// * `content_hash` - The hash of the symlink content. +/// * `target` - The target of the symlink. +/// * `content_size` - The size of the symlink content. 
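All of these intermediary records derive `Serialize`/`Deserialize` so they can be handed straight to `serde_json`. A reduced example of such a record and a one-line JSON round trip, assuming the serde and serde_json dependencies; the field types are simplified (the real structs use `FilePath` and `GeneralHash`):

```rust
use serde::{Deserialize, Serialize};
use std::path::PathBuf;

// Simplified stand-in for the symlink record.
#[derive(Debug, Serialize, Deserialize)]
struct SymlinkRecord {
    path: PathBuf,
    modified: u64,
    content_hash: String,
    target: PathBuf,
    content_size: u64,
}

fn main() -> serde_json::Result<()> {
    let rec = SymlinkRecord {
        path: PathBuf::from("backup/link"),
        modified: 1_700_000_000,
        content_hash: "deadbeef".into(),
        target: PathBuf::from("../real/file"),
        content_size: 12,
    };
    let line = serde_json::to_string(&rec)?; // one JSON object per line, as in the hash tree file
    let back: SymlinkRecord = serde_json::from_str(&line)?;
    assert_eq!(back.content_size, 12);
    Ok(())
}
```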
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BuildSymlinkInformation { + pub path: FilePath, + pub modified: u64, + pub content_hash: GeneralHash, // equal to the target file's hash or if not following symlinks, the symlink's path hashed + pub target: PathBuf, + pub content_size: u64, +} + +/// Information about an analyzed file that is not a regular file, directory, or symlink. +/// This could be sockets, block devices, character devices, etc. or file for which permissions are missing. +/// +/// # Fields +/// * `path` - The path of the file. +/// * `modified` - The last modification time of the file. +/// * `content_size` - The size of the file content. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BuildOtherInformation { + pub path: FilePath, + pub modified: u64, + pub content_size: u64, +} + +/// Information about a file that is not kept in memory but saved to disk. +/// +/// # Fields +/// * `path` - The path of the file. +/// * `content_hash` - The hash of the file content. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BuildStubInformation { + pub path: FilePath, + pub content_hash: GeneralHash, +} + +/// A file that has been analyzed. +/// +/// # Variants +/// * `File` - A regular file. +/// * `Directory` - A directory. +/// * `Symlink` - A symlink. +/// * `Other` - A file that is not a regular file, directory, or symlink, or a file for which permissions are missing. +/// * `Stub` - A file that is not kept in memory but already saved to disk in the hashtree file. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum BuildFile { + File(BuildFileInformation), + Directory(BuildDirectoryInformation), + Symlink(BuildSymlinkInformation), + Other(BuildOtherInformation), // for unsupported file types like block devices, character devices, etc., or files without permission + Stub(BuildStubInformation), // for files that are already analyzed +} + +// ---- IMPLEMENTATION ---- + +impl BuildFile { + /// Get the hash of a file + /// + /// # Returns + /// The hash of the file. If the file is of type `Other` the hash is [GeneralHash::NULL]. + pub fn get_content_hash(&self) -> &GeneralHash { + match self { + BuildFile::File(info) => &info.content_hash, + BuildFile::Directory(info) => &info.content_hash, + BuildFile::Symlink(info) => &info.content_hash, + BuildFile::Other(_) => &GeneralHash::NULL, + BuildFile::Stub(info) => &info.content_hash, + } + } + + /// Gets the path of this file + /// + /// # Returns + /// The path of the file. + pub fn get_path(&self) -> &FilePath { + match self { + BuildFile::File(info) => &info.path, + BuildFile::Directory(info) => &info.path, + BuildFile::Symlink(info) => &info.path, + BuildFile::Other(info) => &info.path, + BuildFile::Stub(info) => &info.path, + } + } + + /// Returns if this is a directory + /// + /// # Returns + /// True if this is a directory, false otherwise. + pub fn is_directory(&self) -> bool { + match self { + BuildFile::Directory(_) => true, + _ => false, + } + } + + /// Returns if this is a symlink + /// + /// # Returns + /// True if this is a symlink, false otherwise. + pub fn is_symlink(&self) -> bool { + match self { + BuildFile::Symlink(_) => true, + _ => false, + } + } + + /// Returns if this is a file + /// + /// # Returns + /// True if this is a file, false otherwise. + pub fn is_file(&self) -> bool { + match self { + BuildFile::File(_) => true, + _ => false, + } + } + + /// Returns if this is an "other" file + /// + /// # Returns + /// True if this is an "other" file, false otherwise. 
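The `is_*` helpers are simple shape checks on the enum; the same predicates can also be written with the `matches!` macro, shown here on a cut-down stand-in enum:

```rust
// Reduced stand-in for the BuildFile enum, just to show the predicate pattern.
enum Entry {
    File(String),
    Directory(Vec<Entry>),
    Other,
}

impl Entry {
    fn is_file(&self) -> bool {
        matches!(self, Entry::File(_))
    }
    fn is_directory(&self) -> bool {
        matches!(self, Entry::Directory(_))
    }
}

fn main() {
    assert!(Entry::File("a".into()).is_file());
    assert!(!Entry::Other.is_directory());
}
```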
+ pub fn is_other(&self) -> bool { + match self { + BuildFile::Other(_) => true, + _ => false, + } + } + + /// Returns if this is a stub file + /// + /// # Returns + /// True if this is a stub file, false otherwise. + pub fn is_stub(&self) -> bool { + match self { + BuildFile::Stub(_) => true, + _ => false, + } + } +} diff --git a/src/stages/build/output/converter.rs b/src/stages/build/output/converter.rs new file mode 100644 index 0000000..f6895e9 --- /dev/null +++ b/src/stages/build/output/converter.rs @@ -0,0 +1,269 @@ +use crate::stages::build::intermediary_build_data::{BuildDirectoryInformation, BuildFile, BuildFileInformation, BuildOtherInformation, BuildStubInformation, BuildSymlinkInformation}; +use crate::hash::GeneralHash; +use crate::stages::build::output::{HashTreeFileEntryType, HashTreeFileEntry, HashTreeFileEntryRef}; + +impl From for HashTreeFileEntry { + /// Convert a [BuildFileInformation] into a [HashTreeFileEntry]. + /// + /// # Arguments + /// * `value` - The [BuildFileInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntry]. + fn from(value: BuildFileInformation) -> Self { + Self { + file_type: HashTreeFileEntryType::File, + modified: value.modified, + size: value.content_size, + hash: value.content_hash, + path: value.path, + children: Vec::with_capacity(0), + } + } +} + +impl From for HashTreeFileEntry { + /// Convert a [BuildSymlinkInformation] into a [HashTreeFileEntry]. + /// + /// # Arguments + /// * `value` - The [BuildSymlinkInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntry]. + fn from(value: BuildSymlinkInformation) -> Self { + Self { + file_type: HashTreeFileEntryType::Symlink, + modified: value.modified, + size: value.content_size, + hash: value.content_hash, + path: value.path, + children: Vec::with_capacity(0), + } + } +} + +impl From for HashTreeFileEntry { + /// Convert a [BuildDirectoryInformation] into a [HashTreeFileEntry]. + /// + /// # Arguments + /// * `value` - The [BuildDirectoryInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntry]. + fn from(value: BuildDirectoryInformation) -> Self { + let mut result = Self { + file_type: HashTreeFileEntryType::Directory, + modified: value.modified, + size: value.number_of_children, + hash: value.content_hash, + path: value.path, + children: Vec::with_capacity(value.children.len()), + }; + for child in value.children { + result.children.push(child.get_content_hash().clone()); + } + result + } +} + +impl From for HashTreeFileEntry { + /// Convert a [BuildOtherInformation] into a [HashTreeFileEntry]. + /// + /// # Arguments + /// * `value` - The [BuildOtherInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntry]. + fn from(value: BuildOtherInformation) -> Self { + Self { + file_type: HashTreeFileEntryType::Other, + modified: value.modified, + size: value.content_size, + hash: GeneralHash::NULL, + path: value.path, + children: Vec::with_capacity(0), + } + } +} + +impl From for HashTreeFileEntry { + /// Convert a [BuildStubInformation] into a [HashTreeFileEntry]. + /// + /// # Arguments + /// * `value` - The [BuildStubInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntry]. 
+ fn from(value: BuildStubInformation) -> Self { + Self { + file_type: HashTreeFileEntryType::Other, + modified: 0, + size: 0, + hash: value.content_hash, + path: value.path, + children: Vec::with_capacity(0), + } + } +} + +impl<'a> From<&'a BuildFileInformation> for HashTreeFileEntryRef<'a> { + /// Convert a [BuildFileInformation] into a [HashTreeFileEntryRef]. + /// + /// # Arguments + /// * `value` - The reference to the [BuildFileInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntryRef]. + fn from(value: &'a BuildFileInformation) -> Self { + Self { + file_type: &HashTreeFileEntryType::File, + modified: &value.modified, + hash: &value.content_hash, + path: &value.path, + size: &value.content_size, + children: Vec::with_capacity(0), + } + } +} + +impl<'a> From<&'a BuildSymlinkInformation> for HashTreeFileEntryRef<'a> { + /// Convert a [BuildSymlinkInformation] into a [HashTreeFileEntryRef]. + /// + /// # Arguments + /// * `value` - The reference to the [BuildSymlinkInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntryRef]. + fn from(value: &'a BuildSymlinkInformation) -> Self { + Self { + file_type: &HashTreeFileEntryType::Symlink, + modified: &value.modified, + hash: &value.content_hash, + path: &value.path, + size: &value.content_size, + children: Vec::with_capacity(0), + } + } +} + +impl<'a> From<&'a BuildDirectoryInformation> for HashTreeFileEntryRef<'a> { + /// Convert a [BuildDirectoryInformation] into a [HashTreeFileEntryRef]. + /// + /// # Arguments + /// * `value` - The reference to the [BuildDirectoryInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntryRef]. + fn from(value: &'a BuildDirectoryInformation) -> Self { + let mut result = Self { + file_type: &HashTreeFileEntryType::Directory, + modified: &value.modified, + hash: &value.content_hash, + path: &value.path, + size: &value.number_of_children, + children: Vec::with_capacity(value.children.len()), + }; + for child in &value.children { + result.children.push(child.get_content_hash()); + } + result + } +} + +impl<'a> From<&'a BuildOtherInformation> for HashTreeFileEntryRef<'a> { + /// Convert a [BuildOtherInformation] into a [HashTreeFileEntryRef]. + /// + /// # Arguments + /// * `value` - The reference to the [BuildOtherInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntryRef]. + fn from(value: &'a BuildOtherInformation) -> Self { + Self { + file_type: &HashTreeFileEntryType::Other, + modified: &0, + hash: &GeneralHash::NULL, + path: &value.path, + size: &value.content_size, + children: Vec::with_capacity(0), + } + } +} + +impl<'a> From<&'a BuildStubInformation> for HashTreeFileEntryRef<'a> { + /// Convert a [BuildStubInformation] into a [HashTreeFileEntryRef]. + /// + /// # Arguments + /// * `value` - The reference to the [BuildStubInformation] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntryRef]. + fn from(value: &'a BuildStubInformation) -> Self { + Self { + file_type: &HashTreeFileEntryType::Other, + modified: &0, + hash: &value.content_hash, + path: &value.path, + size: &0, + children: Vec::with_capacity(0), + } + } +} + +impl From for HashTreeFileEntry { + /// Convert a [BuildFile] into a [HashTreeFileEntry]. + /// + /// # Arguments + /// * `value` - The [BuildFile] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntry]. 
+ fn from(value: BuildFile) -> Self { + match value { + BuildFile::File(info) => info.into(), + BuildFile::Directory(info) => info.into(), + BuildFile::Symlink(info) => info.into(), + BuildFile::Other(info) => info.into(), + BuildFile::Stub(info) => info.into(), + } + } +} + +impl<'a> From<&'a BuildFile> for HashTreeFileEntryRef<'a> { + /// Convert a [BuildFile] into a [HashTreeFileEntryRef]. + /// + /// # Arguments + /// * `value` - The reference to the [BuildFile] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntryRef]. + fn from(value: &'a BuildFile) -> Self { + match value { + BuildFile::File(info) => info.into(), + BuildFile::Directory(info) => info.into(), + BuildFile::Symlink(info) => info.into(), + BuildFile::Other(info) => info.into(), + BuildFile::Stub(info) => info.into(), + } + } +} + +impl<'a> From<&'a HashTreeFileEntry> for HashTreeFileEntryRef<'a> { + /// Convert a [HashTreeFileEntry] into a [HashTreeFileEntryRef]. + /// + /// # Arguments + /// * `value` - The reference to the [HashTreeFileEntry] to convert. + /// + /// # Returns + /// The converted [HashTreeFileEntryRef]. + fn from(value: &'a HashTreeFileEntry) -> Self { + Self { + file_type: &value.file_type, + modified: &value.modified, + hash: &value.hash, + path: &value.path, + size: &value.size, + children: Vec::with_capacity(0), + } + } +} diff --git a/src/stages/build/output/hashtreefile.rs b/src/stages/build/output/hashtreefile.rs new file mode 100644 index 0000000..7935a0a --- /dev/null +++ b/src/stages/build/output/hashtreefile.rs @@ -0,0 +1,337 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::io::{BufRead, Write}; +use std::ops::DerefMut; +use std::sync::Arc; + +use anyhow::Result; +use log::{info, trace, warn}; +use serde::{Deserialize, Serialize}; + +pub use HashTreeFileEntryTypeV1 as HashTreeFileEntryType; +pub use HashTreeFileEntryV1 as HashTreeFileEntry; +pub type HashTreeFileEntryRef<'a> = HashTreeFileEntryV1Ref<'a>; + +use crate::hash::{GeneralHash, GeneralHashType}; +use crate::path::FilePath; +use crate::utils; + +/// HashTreeFile file version. In further versions, the file format may change. +/// Currently only one file version exist. +/// +/// # Fields +/// * `V1` - Version 1 of the file format. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum HashTreeFileVersion { + V1, +} + +/// HashTreeFile file header. First line of a hash tree file. +/// +/// # Fields +/// * `version` - The version of the file. +/// * `hash_type` - The hash type used to hash the files. +/// * `creation_date` - The creation date of the file in unix time +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct HashTreeFileHeader { + pub version: HashTreeFileVersion, + pub hash_type: GeneralHashType, + pub creation_date: u64, +} + +/// HashTreeFile entry type. Describes the type of file. +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Hash, Eq)] +pub enum HashTreeFileEntryTypeV1 { + File, + Directory, + Symlink, + Other, +} + +/// HashTreeFile entry. Describes an analyzed file. +/// +/// # Fields +/// * `file_type` - The type of the file. +/// * `modified` - The last modified date of the file in unix time. +/// * `size` - The size of the file in bytes for files, number of children for folders. +/// * `hash` - The hash of the file content. +/// * `path` - The path of the file. +/// * `children` - The children of the file. Only for directories. +/// +/// # See also +/// * [HashTreeFileEntryV1Ref] which is a reference version of this struct. 
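Keeping a borrowed twin of the entry struct lets results be serialized without cloning hashes and paths; only the owned form needs `Deserialize`. A reduced sketch of that owned/borrowed pair with a hypothetical field set, assuming serde and serde_json:

```rust
use serde::{Deserialize, Serialize};

// Owned form: read back from the file.
#[derive(Serialize, Deserialize)]
struct Entry {
    path: String,
    size: u64,
    hash: String,
}

// Borrowed twin: written without cloning the data it points to.
#[derive(Serialize)]
struct EntryRef<'a> {
    path: &'a str,
    size: &'a u64,
    hash: &'a str,
}

fn main() -> serde_json::Result<()> {
    let path = String::from("a/b.txt");
    let hash = String::from("deadbeef");
    let size = 42u64;
    let line = serde_json::to_string(&EntryRef { path: &path, size: &size, hash: &hash })?;
    let owned: Entry = serde_json::from_str(&line)?;
    assert_eq!(owned.size, 42);
    Ok(())
}
```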
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub struct HashTreeFileEntryV1 { + pub file_type: HashTreeFileEntryTypeV1, + pub modified: u64, + pub size: u64, + pub hash: GeneralHash, + pub path: FilePath, + pub children: Vec, +} + +/// HashTreeFile entry reference. Describes an analyzed file. +/// This is a reference version of the [HashTreeFileEntryV1] struct. +/// +/// # Fields +/// * `file_type` - The type of the file. +/// * `modified` - The last modified date of the file in unix time. +/// * `size` - The size of the file in bytes for files, number of children for folders. +/// * `hash` - The hash of the file content. +/// * `path` - The path of the file. +/// * `children` - The children of the file. Only for directories. +/// +/// # See also +/// * [HashTreeFileEntryV1] which is the owned version of this struct. +#[derive(Debug, Serialize)] +pub struct HashTreeFileEntryV1Ref<'a> { + pub file_type: &'a HashTreeFileEntryTypeV1, + pub modified: &'a u64, + pub size: &'a u64, + pub hash: &'a GeneralHash, + pub path: &'a FilePath, + pub children: Vec<&'a GeneralHash>, +} + +/// Interface to access and manage a hash tree file. +/// +/// # Fields +/// * `header` - The header of the file. +/// * `file_by_hash` - A map of files by their hash. +/// * `file_by_path` - A map of files by their path. +/// * `all_entries` - A list of all entries. +pub struct HashTreeFile<'a, W, R> where W: Write, R: BufRead { + pub header: HashTreeFileHeader, + pub file_by_hash: HashMap>>, + pub file_by_path: HashMap>, + pub all_entries: Vec>, + + enable_file_by_hash: bool, + enable_file_by_path: bool, + enable_all_entry_list: bool, + + writer: RefCell<&'a mut W>, + written_bytes: RefCell, + reader: RefCell<&'a mut R>, +} + +impl<'a, W: Write, R: BufRead> HashTreeFile<'a, W, R> { + /// Create a new hash tree file. + /// + /// If not writing a new header hash_type can be set to GeneralHashType::NULL. + /// + /// # Arguments + /// * `writer` - The writer to write the file. + /// * `reader` - The reader to read the file. + /// * `hash_type` - The hash type used to hash the files. + /// * `enable_file_by_hash` - Whether to enable the file by hash - hash map. + /// * `enable_file_by_path` - Whether to enable the file by path - hash map. + /// * `enable_all_entry_list` - Whether to enable the all entries list. + /// + /// # Returns + /// The created hash tree file interface. 
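The hash tree file is newline-delimited JSON: one header object on the first line, then one entry object per line, which is why the loader can skip blank lines and filter entries while streaming. A self-contained sketch of that layout with simplified types, assuming serde, serde_json and anyhow (all already in the project's dependency list):

```rust
use serde::{Deserialize, Serialize};
use std::io::{BufRead, Write};

#[derive(Serialize, Deserialize)]
struct Header { version: u32, hash_type: String }

#[derive(Serialize, Deserialize, Debug)]
struct Entry { path: String, size: u64 }

fn main() -> anyhow::Result<()> {
    // Write: header first, then one JSON object per line.
    let mut buf = Vec::new();
    writeln!(buf, "{}", serde_json::to_string(&Header { version: 1, hash_type: "sha256".into() })?)?;
    writeln!(buf, "{}", serde_json::to_string(&Entry { path: "a".into(), size: 1 })?)?;
    writeln!(buf, "{}", serde_json::to_string(&Entry { path: "b".into(), size: 2 })?)?;

    // Read: header line first, then stream the entries one line at a time.
    let mut lines = buf.as_slice().lines();
    let header: Header = serde_json::from_str(&lines.next().unwrap()?)?;
    assert_eq!(header.version, 1);
    for line in lines {
        let entry: Entry = serde_json::from_str(&line?)?;
        println!("{:?}", entry);
    }
    Ok(())
}
```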
+ pub fn new(writer: &'a mut W, reader: &'a mut R, hash_type: GeneralHashType, enable_file_by_hash: bool, enable_file_by_path: bool, enable_all_entry_list: bool) -> Self { + let time = utils::get_time(); + HashTreeFile { + header: HashTreeFileHeader { + version: HashTreeFileVersion::V1, + hash_type, + creation_date: time, + }, + file_by_hash: HashMap::new(), + file_by_path: HashMap::new(), + all_entries: Vec::new(), + enable_file_by_hash, + enable_file_by_path, + enable_all_entry_list, + writer: RefCell::new(writer), + reader: RefCell::new(reader), + written_bytes: RefCell::new(0), + } + } + + /// Save the header to the file + /// + /// # Error + /// If writing to the file errors + pub fn save_header(&self) -> Result<()> { + let header_str = serde_json::to_string(&self.header)?; + *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(header_str.as_bytes())?; + *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(b"\n")?; + + Ok(()) + } + + /// Load a file header from the file + /// + /// # Error + /// If reading from the file errors + pub fn load_header(&mut self) -> Result<()> { + let mut header_str = String::new(); + self.reader.borrow_mut().deref_mut().read_line(&mut header_str)?; + + let header: HashTreeFileHeader = serde_json::from_str(header_str.as_str())?; + self.header = header; + + Ok(()) + } + + /// Load a file entry from the file + /// + /// # Error + /// If reading from the file errors + pub fn load_entry_no_filter(&mut self) -> Result>> { + self.load_entry(|_| true) + } + + /// Load a file entry from the file + /// + /// # Arguments + /// * `filter` - A filter function to filter the entries. If the function returns false the entry is ignored. + /// + /// # Returns + /// The loaded entry or None if the end of the file is reached. + /// + /// # Error + /// If reading from the file errors + pub fn load_entry bool>(&mut self, filter: F) -> Result>> { + loop { + let mut entry_str = String::new(); + let count = self.reader.borrow_mut().deref_mut().read_line(&mut entry_str)?; + + if count == 0 { + return Ok(None); + } + + if count == 1 { + continue; + } + + let entry: HashTreeFileEntry = serde_json::from_str(entry_str.as_str())?; + + if entry.hash.hash_type() != self.header.hash_type && !(entry.file_type == HashTreeFileEntryType::Other && entry.hash.hash_type() == GeneralHashType::NULL) { + warn!("Hash type mismatch ignoring entry: {:?}", entry.path); + continue; + } + + if !filter(&entry) { + trace!("Entry filtered: {:?}", entry.path); + continue; + } + + let shared_entry = Arc::new(entry); + + if self.enable_file_by_hash { + self.file_by_hash.entry(shared_entry.hash.clone()).or_insert_with(Vec::new).push(Arc::clone(&shared_entry)); + } + + if self.enable_file_by_path { + match self.file_by_path.insert(shared_entry.path.clone(), Arc::clone(&shared_entry)) { + None => {} + Some(old) => { + // this happens if analysis was canceled and continued + // and an already analysed file changed + info!("Duplicate entry for path: {:?}", &old.path); + if self.enable_all_entry_list { + self.all_entries.retain(|x| x != &old); + } + } + } + } + + if self.enable_all_entry_list { + self.all_entries.push(Arc::clone(&shared_entry)); + } + + return Ok(Some(shared_entry)) + } + } + + /// Load all entries from the file. Till the end of the file is reached. + /// + /// # Arguments + /// * `filter` - A filter function to filter the entries. If the function returns false the entry is ignored. 
+ /// + /// # Error + /// If reading from the file errors + pub fn load_all_entries bool>(&mut self, filter: F) -> Result<()> { + while let Some(_) = self.load_entry(&filter)? {} + + Ok(()) + } + + /// Load all entries from the file. Till the end of the file is reached. + /// + /// # Error + /// If reading from the file errors + pub fn load_all_entries_no_filter(&mut self) -> Result<()> { + self.load_all_entries(|_| true) + } + + /// Write an entry to the file + /// + /// # Arguments + /// * `result` - The entry to write. + /// + /// # Error + /// If writing to the file errors + pub fn write_entry(&self, result: &HashTreeFileEntry) -> Result<()> { + let string = serde_json::to_string(result)?; + *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(string.as_bytes())?; + *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write("\n".as_bytes())?; + self.writer.borrow_mut().deref_mut().flush()?; + Ok(()) + } + + /// Write an entry reference to the file + /// + /// # Arguments + /// * `result` - The entry reference to write. + /// + /// # Error + /// If writing to the file errors + pub fn write_entry_ref(&self, result: &HashTreeFileEntryRef) -> Result<()> { + let string = serde_json::to_string(result)?; + *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write(string.as_bytes())?; + *self.written_bytes.borrow_mut() += self.writer.borrow_mut().deref_mut().write("\n".as_bytes())?; + self.writer.borrow_mut().deref_mut().flush()?; + Ok(()) + } + + /// Empty the file by hash - hash map. + /// Frees/Shrinks the memory used. + pub fn empty_file_by_hash(&mut self) { + self.file_by_hash.clear(); + self.file_by_hash.shrink_to_fit(); + } + + /// Empty the file by path - hash map. + /// Frees/Shrinks the memory used. + pub fn empty_file_by_path(&mut self) { + self.file_by_path.clear(); + self.file_by_path.shrink_to_fit(); + } + + /// Empty the all entries list. + /// Frees/Shrinks the memory used. + pub fn empty_entry_list(&mut self) { + self.all_entries.clear(); + self.all_entries.shrink_to_fit(); + } + + /// Get the written bytes count. + /// + /// # Returns + /// The written bytes count. + pub fn get_written_bytes(&self) -> usize { + *self.written_bytes.borrow() + } + + /// Flush the writer. + /// + /// # Error + /// If flushing the writer errors + pub fn flush(&self) -> std::io::Result<()> { + self.writer.borrow_mut().deref_mut().flush() + } +} diff --git a/src/stages/clean.rs b/src/stages/clean.rs new file mode 100644 index 0000000..52958ec --- /dev/null +++ b/src/stages/clean.rs @@ -0,0 +1 @@ +pub mod cmd; diff --git a/src/cmd/clean.rs b/src/stages/clean/cmd.rs similarity index 73% rename from src/cmd/clean.rs rename to src/stages/clean/cmd.rs index d509d5e..c48641e 100644 --- a/src/cmd/clean.rs +++ b/src/stages/clean/cmd.rs @@ -2,8 +2,16 @@ use std::fs; use std::path::PathBuf; use anyhow::{anyhow, Result}; use log::{info, trace, warn}; -use crate::data::{SaveFile, SaveFileEntryType}; +use crate::hash::GeneralHashType; +use crate::stages::build::output::{HashTreeFile, HashTreeFileEntryType}; +/// Settings for the clean stage. +/// +/// # Fields +/// * `input` - The input hashtree file to clean. +/// * `output` - The output hashtree file to write the cleaned hashtree to. +/// * `root` - The root path of the original working directory. This is used to resolve relative paths. +/// * `follow_symlinks` - Whether to follow symlinks when checking if files exist. 
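The clean stage drops entries whose files have disappeared or changed kind on disk; `follow_symlinks` decides whether a symlink is inspected itself (`symlink_metadata`) or through its target (`metadata`). A std-only sketch of that check with a hypothetical `EntryType` in place of `HashTreeFileEntryType`:

```rust
use std::fs;
use std::path::Path;

// Hypothetical stand-in for HashTreeFileEntryType.
#[derive(PartialEq)]
enum EntryType { File, Directory, Symlink, Other }

/// Returns true when the path still exists and is still the recorded kind of file.
fn still_valid(path: &Path, recorded: &EntryType, follow_symlinks: bool) -> bool {
    let metadata = if follow_symlinks { fs::metadata(path) } else { fs::symlink_metadata(path) };
    match metadata {
        Err(_) => false, // vanished or unreadable: drop the entry
        Ok(meta) => {
            let actual = if meta.is_symlink() {
                EntryType::Symlink
            } else if meta.is_dir() {
                EntryType::Directory
            } else if meta.is_file() {
                EntryType::File
            } else {
                EntryType::Other
            };
            actual == *recorded
        }
    }
}

fn main() {
    println!("{}", still_valid(Path::new("/etc"), &EntryType::Directory, true));
}
```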
pub struct CleanSettings { pub input: PathBuf, pub output: PathBuf, @@ -11,6 +19,10 @@ pub struct CleanSettings { pub follow_symlinks: bool, } +/// Run the clean command. +/// +/// # Arguments +/// * `clean_settings` - The settings for the clean command. pub fn run( clean_settings: CleanSettings, ) -> Result<()> { @@ -39,7 +51,7 @@ pub fn run( let mut input_buf_reader = std::io::BufReader::new(&input_file); let mut output_buf_writer = std::io::BufWriter::new(&output_file); - let mut save_file = SaveFile::new(&mut output_buf_writer, &mut input_buf_reader, false, true, true); + let mut save_file = HashTreeFile::new(&mut output_buf_writer, &mut input_buf_reader, GeneralHashType::NULL, false, true, true); save_file.load_header()?; // remove duplicates, remove deleted files @@ -64,13 +76,13 @@ pub fn run( if let Some(metadata) = metadata { return if metadata.is_symlink() { - entry.file_type == SaveFileEntryType::Symlink + entry.file_type == HashTreeFileEntryType::Symlink } else if metadata.is_dir() { - entry.file_type == SaveFileEntryType::Directory + entry.file_type == HashTreeFileEntryType::Directory } else if metadata.is_file() { - entry.file_type == SaveFileEntryType::File + entry.file_type == HashTreeFileEntryType::File } else { - entry.file_type == SaveFileEntryType::Other + entry.file_type == HashTreeFileEntryType::Other } } diff --git a/src/threadpool.rs b/src/threadpool.rs deleted file mode 100644 index 0fa730b..0000000 --- a/src/threadpool.rs +++ /dev/null @@ -1,187 +0,0 @@ -use std::sync::{Arc, mpsc, Mutex}; -use std::sync::mpsc::{Receiver, RecvTimeoutError, Sender}; -use std::thread; -use std::time::Duration; -use log::{debug, error, trace, warn}; -use crate::data::{JobTrait, ResultTrait}; - -type WorkerEntry = fn(usize, Job, &Sender, &Sender, &mut Argument); - -struct Worker -{ - id: usize, - thread: Option>, -} - -impl Worker { - fn new(id: usize, job_receive: Arc>>, result_publish: Sender, job_publish: Sender, func: WorkerEntry, arg: Argument) -> Worker { - let thread = thread::spawn(move || { - Worker::worker_entry(id, job_receive, result_publish, job_publish, func, arg); - }); - - Worker { id, thread: Some(thread) } - } - - fn worker_entry(id: usize, job_receive: Arc>>, result_publish: Sender, job_publish: Sender, func: WorkerEntry, mut arg: Argument) { - loop { - let job = job_receive.lock(); - - let job = match job { - Err(e) => { - error!("Worker {} shutting down {}", id, e); - break; - } - Ok(job) => { - job.recv() - } - }; - - match job { - Err(_) => { - trace!("Worker {} shutting down", id); - break; - } - Ok(job) => { - trace!("Worker {} received job {}", id, job.job_id()); - func(id, job, &result_publish, &job_publish, &mut arg); - } - } - } - } -} - -pub struct ThreadPool -where - Job: Send, - Result: Send, -{ - workers: Vec, - thread: Option>, - job_publish: Arc>>>, - result_receive: Receiver, -} - -impl ThreadPool { - pub fn new(mut args: Vec, func: WorkerEntry) -> ThreadPool { - assert!(args.len() > 0); - - let mut workers = Vec::with_capacity(args.len()); - - let (job_publish, job_receive) = mpsc::channel(); - - let job_receive = Arc::new(Mutex::new(job_receive)); - let (result_publish, result_receive) = mpsc::channel(); - let (thread_publish_job, thread_receive_job) = mpsc::channel(); - - let mut id = 0; - while let Some(arg) = args.pop() { - workers.push(Worker::new(id, Arc::clone(&job_receive), result_publish.clone(), thread_publish_job.clone(), func, arg)); - id += 1; - } - - let job_publish = Arc::new(Mutex::new(Some(job_publish))); - let job_publish_clone = 
Arc::clone(&job_publish); - - let thread = thread::spawn(move || { - ThreadPool::::pool_entry(job_publish_clone, thread_receive_job); - }); - - ThreadPool { - workers, - job_publish, - result_receive, - thread: Some(thread), - } - } - - pub fn publish(&self, job: Job) { - let job_publish = self.job_publish.lock(); - match job_publish { - Err(e) => { - error!("ThreadPool is shutting down. Cannot publish job. {}", e); - } - Ok(job_publish) => { - match job_publish.as_ref() { - None => { - error!("ThreadPool is shutting down. Cannot publish job."); - } - Some(job_publish) => { - match job_publish.send(job) { - Err(e) => { - error!("Failed to publish job on thread pool. {}", e); - } - Ok(_) => {} - } - } - } - } - } - - } - - fn pool_entry(job_publish: Arc>>>, job_receive: Receiver) { - loop { - let job = job_receive.recv(); - - match job { - Err(_) => { - trace!("Pool worker shutting down"); - break; - } - Ok(job) => { - match job_publish.lock() { - Err(e) => { - error!("Pool worker shutting down: {}", e); - break; - } - Ok(job_publish) => { - if let Some(job_publish) = job_publish.as_ref() { - job_publish.send(job).expect("Pool worker failed to send job. This should never fail."); - } - } - } - } - } - } - } - - pub fn receive(&self) -> std::result::Result { - self.result_receive.recv() - } - - pub fn receive_timeout(&self, timeout: Duration) -> std::result::Result { - self.result_receive.recv_timeout(timeout) - } -} - -impl Drop for ThreadPool { - fn drop(&mut self) { - drop(self.job_publish.lock().expect("This should not break").take()); - - for worker in &mut self.workers { - debug!("Shutting down worker {}", worker.id); - - if let Some(thread) = worker.thread.take() { - match thread.join() { - Ok(_) => { - trace!("Worker {} shut down", worker.id); - } - Err(_) => { - warn!("Worker {} panicked", worker.id); - } - } - } - } - - if let Some(thread) = self.thread.take() { - match thread.join() { - Ok(_) => { - trace!("ThreadPool shut down"); - } - Err(_) => { - warn!("ThreadPool worker panicked"); - } - } - } - } -} \ No newline at end of file diff --git a/src/utils.rs b/src/utils.rs index d696083..15d0656 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,14 +1,43 @@ -use std::path::{Path, PathBuf}; +use std::io::Write; +use std::path::{PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; use anyhow::{anyhow, Result}; -use crate::data::{File, GeneralHash}; +/// Trait to convert a path to a lexical absolute path. +/// Does not require the path to exist. +/// +/// # See also +/// * +/// * [std::fs::canonicalize] pub trait LexicalAbsolute { + /// Convert a path to a lexical absolute path. + /// Does not require the path to exist. + /// + /// # Errors + /// Returns an error if the absolute path could not be determined. fn to_lexical_absolute(&self) -> std::io::Result; } impl LexicalAbsolute for PathBuf { + /// Convert a path to a lexical absolute path. + /// Does not require the path to exist. + /// + /// # Example + /// ``` + /// use std::path::PathBuf; + /// use backup_deduplicator::utils::LexicalAbsolute; + /// + /// let path = PathBuf::from("/a/b/../c"); + /// let absolute = path.to_lexical_absolute().unwrap(); + /// assert_eq!(absolute, PathBuf::from("/a/c")); + /// ``` + /// + /// # Errors + /// Returns an error if the given path is relative and the current working directory could not be determined. + /// * The working directory does not exist. + /// * Insufficient permissions to determine the working directory. 
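`to_lexical_absolute` normalizes `.` and `..` purely on the path components, without touching the filesystem the way `canonicalize` does; only relative inputs need the current working directory. A sketch of that component walk, matching the behaviour in the doc example above:

```rust
use std::env;
use std::io;
use std::path::{Component, Path, PathBuf};

/// Resolve `.` and `..` lexically; the path itself does not have to exist.
fn lexical_absolute(path: &Path) -> io::Result<PathBuf> {
    let mut absolute = if path.is_absolute() { PathBuf::new() } else { env::current_dir()? };
    for component in path.components() {
        match component {
            Component::CurDir => {}                      // skip "."
            Component::ParentDir => { absolute.pop(); }  // drop the last segment for ".."
            other => absolute.push(other.as_os_str()),   // keep prefix, root and normal segments
        }
    }
    Ok(absolute)
}

fn main() -> io::Result<()> {
    assert_eq!(lexical_absolute(Path::new("/a/b/../c"))?, PathBuf::from("/a/c"));
    Ok(())
}
```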
fn to_lexical_absolute(&self) -> std::io::Result { + // https://internals.rust-lang.org/t/path-to-lexical-absolute/14940 let mut absolute = if self.is_absolute() { PathBuf::new() } else { @@ -25,52 +54,18 @@ impl LexicalAbsolute for PathBuf { } } -pub fn hash_file(mut reader: T, hash: &mut GeneralHash) -> Result -where T: std::io::Read { - - let mut hasher = hash.hasher(); - let mut buffer = [0; 4096]; - let mut content_size = 0; - - loop { - let bytes_read = reader.read(&mut buffer)?; - content_size += bytes_read as u64; - if bytes_read == 0 { - break; - } - hasher.update(&buffer[..bytes_read]); - } - - *hash = hasher.finalize(); - - Ok(content_size) -} - -pub fn hash_directory<'a>(children: impl Iterator, hash: &mut GeneralHash) -> Result { - let mut hasher = hash.hasher(); - - let mut content_size = 0; - - for child in children { - content_size += 1; - hasher.update(child.get_content_hash().as_bytes()); - } - - *hash = hasher.finalize(); - - Ok(content_size) -} - -pub fn hash_path(path: &Path, hash: &mut GeneralHash) -> Result<()> { - let mut hasher = hash.hasher(); - - hasher.update(path.as_os_str().as_encoded_bytes()); - - *hash = hasher.finalize(); - - Ok(()) -} - +/// Decode a hex string to a byte vector. +/// +/// # Example +/// ``` +/// use backup_deduplicator::utils::decode_hex; +/// +/// let bytes = decode_hex("deadbeef").unwrap(); +/// assert_eq!(bytes, vec![0xde, 0xad, 0xbe, 0xef]); +/// ``` +/// +/// # Errors +/// Returns an error if the given string is not a valid hex string. pub fn decode_hex(s: &str) -> Result> { if s.len() % 2 != 0 { return Err(anyhow!("Invalid hex length")); @@ -82,8 +77,154 @@ pub fn decode_hex(s: &str) -> Result> { .collect() } +/// Get the current time in seconds since the Unix epoch (in seconds). +/// +/// # Returns +/// The current time in seconds since the Unix epoch. Returns 0 if the current time is before the Unix epoch. pub fn get_time() -> u64 { SystemTime::now() .duration_since(UNIX_EPOCH) .map(|d| d.as_secs()).unwrap_or(0) } + +/// A writer that discards all data. +/// +/// # Example +/// ``` +/// use std::io::Write; +/// +/// let mut writer = backup_deduplicator::utils::NullWriter::new(); +/// writer.write(b"Hello, world!").unwrap(); +/// ``` +pub struct NullWriter {} + +impl NullWriter { + /// Create a new NullWriter. + /// + /// # Returns + /// A new NullWriter. + pub fn new() -> Self { + NullWriter {} + } +} + +impl Write for NullWriter { + /// Discard all data. + /// + /// # Arguments + /// * `buf` - The data to write. + /// + /// # Returns + /// The number of bytes written. Always the same as the length of `buf`. + /// + /// # Errors + /// Never + fn write(&mut self, buf: &[u8]) -> std::io::Result {Ok(buf.len())} + + /// Flush the writer. + /// + /// # Errors + /// Never + fn flush(&mut self) -> std::io::Result<()> {Ok(())} +} + +/// Utility functions for the main function of `backup-deduplicator`. +pub mod main { + use std::env; + use std::path::PathBuf; + use crate::utils::LexicalAbsolute; + + /// Changes the working directory to the given path. + /// + /// # Arguments + /// * `working_directory` - The new working directory. + /// + /// # Returns + /// The new working directory. + /// + /// # Exit + /// Exits the process if the working directory could not be changed. 
+ pub fn change_working_directory(working_directory: Option) -> PathBuf { + match working_directory { + None => {}, + Some(working_directory) => { + env::set_current_dir(&working_directory).unwrap_or_else(|_| { + eprintln!("IO error, could not change working directory: {}", working_directory.display()); + std::process::exit(exitcode::CONFIG); + }); + } + } + + env::current_dir().unwrap_or_else(|_| { + eprintln!("IO error, could not resolve working directory"); + std::process::exit(exitcode::CONFIG); + }).canonicalize().unwrap_or_else(|_| { + eprintln!("IO error, could not resolve working directory"); + std::process::exit(exitcode::CONFIG); + }) + } + + /// Option how to parse a path. + /// + /// # See also + /// * [parse_path] + #[derive(Debug, Clone, Copy)] + pub enum ParsePathKind { + /// Do not post-process the path. + Direct, + /// Convert the path to a absolute path. The path must exist. + AbsoluteExisting, + /// Convert the path to a absolute path. The path might not exist. + AbsoluteNonExisting, + } + + /// Parse a path from a string. + /// + /// # Arguments + /// * `path` - The path to parse. + /// * `kind` - How to parse the path. + /// + /// # Returns + /// The parsed path. + pub fn parse_path(path: &str, kind: ParsePathKind) -> PathBuf { + let path = std::path::Path::new(path); + + let path = path.to_path_buf(); + + let path = match kind { + ParsePathKind::Direct => path, + ParsePathKind::AbsoluteExisting => to_lexical_absolute(path, true), + ParsePathKind::AbsoluteNonExisting => to_lexical_absolute(path, false), + }; + + path + } + + /// Convert a path to a absolute path. + /// + /// # Arguments + /// * `path` - The path to convert. + /// * `exists` - Whether the path must exist. + /// + /// # Returns + /// The absolute path. + /// + /// # Exit + /// Exits the process if the path could not be resolved. + pub fn to_lexical_absolute(path: PathBuf, exists: bool) -> PathBuf { + let path = match exists { + true => path.canonicalize(), + false => path.to_lexical_absolute(), + }; + + let path = match path{ + Ok(out) => out, + Err(e) => { + eprintln!("IO error, could not resolve output file: {:?}", e); + std::process::exit(exitcode::CONFIG); + } + }; + + path + } +} \ No newline at end of file