diff --git a/Cargo.lock b/Cargo.lock index 01269091dd4..56dbf7ba890 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -468,9 +468,9 @@ checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" [[package]] name = "digest" -version = "0.10.3" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" dependencies = [ "block-buffer", "crypto-common", @@ -754,6 +754,7 @@ dependencies = [ "bytes", "chrono", "config", + "digest", "git2", "hex", "insta", diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 5e3365b142e..026ac561fcc 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -24,6 +24,7 @@ bytes = "1.2.1" byteorder = "1.4.3" chrono = { version = "0.4.22", default-features = false, features = ["std", "clock"] } config = { version = "0.13.2", default-features = false, features = ["toml"] } +digest = "0.10.5" git2 = "0.15.0" hex = "0.4.3" itertools = "0.10.5" diff --git a/lib/src/backend.rs b/lib/src/backend.rs index 08cd46fbed6..b3a323adadd 100644 --- a/lib/src/backend.rs +++ b/lib/src/backend.rs @@ -22,8 +22,10 @@ use thiserror::Error; use crate::repo_path::{RepoPath, RepoPathComponent}; -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct CommitId(Vec); +content_hash! { + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct CommitId(Vec); +} impl Debug for CommitId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -225,14 +227,18 @@ pub enum Phase { Draft, } -#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] -pub struct MillisSinceEpoch(pub i64); +content_hash! 
{ + #[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] + pub struct MillisSinceEpoch(pub i64); +} -#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] -pub struct Timestamp { - pub timestamp: MillisSinceEpoch, - // time zone offset in minutes - pub tz_offset: i32, +content_hash! { + #[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] + pub struct Timestamp { + pub timestamp: MillisSinceEpoch, + // time zone offset in minutes + pub tz_offset: i32, + } } impl Timestamp { diff --git a/lib/src/content_hash.rs b/lib/src/content_hash.rs new file mode 100644 index 00000000000..4a79426dd96 --- /dev/null +++ b/lib/src/content_hash.rs @@ -0,0 +1,221 @@ +use itertools::Itertools as _; + +/// Portable, stable hashing suitable for identifying values +/// +/// Variable-length sequences should hash a 64-bit little-endian representation +/// of their length, then their elements in order. Unordered containers should +/// order their elements according to their `Ord` implementation. Enums should +/// hash a 32-bit little-endian encoding of the ordinal number of the enum +/// variant, then the variant's fields in lexical order. 
+pub trait ContentHash { + fn hash(&self, state: &mut impl digest::Update); +} + +impl ContentHash for () { + fn hash(&self, _: &mut impl digest::Update) {} +} + +impl ContentHash for u8 { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&[*self]); + } +} + +impl ContentHash for i32 { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&self.to_le_bytes()); + } +} + +impl ContentHash for i64 { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&self.to_le_bytes()); + } +} + +// TODO: Specialize for [u8] once specialization exists +impl ContentHash for [T] { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as u64).to_le_bytes()); + for x in self { + x.hash(state); + } + } +} + +impl ContentHash for Vec { + fn hash(&self, state: &mut impl digest::Update) { + self.as_slice().hash(state) + } +} + +impl ContentHash for String { + fn hash(&self, state: &mut impl digest::Update) { + self.as_bytes().hash(state); + } +} + +impl ContentHash for Option { + fn hash(&self, state: &mut impl digest::Update) { + match *self { + None => state.update(&[0]), + Some(ref x) => { + state.update(&[1]); + x.hash(state) + } + } + } +} + +impl ContentHash for std::collections::HashMap +where + K: ContentHash + Ord, + V: ContentHash, +{ + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as u64).to_le_bytes()); + let mut kv = self.iter().collect_vec(); + kv.sort_unstable_by_key(|&(k, _)| k); + for (k, v) in kv { + k.hash(state); + v.hash(state); + } + } +} + +impl ContentHash for std::collections::HashSet +where + K: ContentHash + Ord, +{ + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as u64).to_le_bytes()); + for k in self.iter().sorted() { + k.hash(state); + } + } +} + +impl ContentHash for std::collections::BTreeMap +where + K: ContentHash, + V: ContentHash, +{ + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as 
u64).to_le_bytes()); + for (k, v) in self.iter() { + k.hash(state); + v.hash(state); + } + } +} + +macro_rules! content_hash { + ($(#[$meta:meta])* $vis:vis struct $name:ident { + $($(#[$field_meta:meta])* $field_vis:vis $field:ident : $ty:ty),* $(,)? + }) => { + $(#[$meta])* + $vis struct $name { + $($(#[$field_meta])* $field_vis $field : $ty),* + } + + impl crate::content_hash::ContentHash for $name { + fn hash(&self, state: &mut impl digest::Update) { + $(<$ty as crate::content_hash::ContentHash>::hash(&self.$field, state);)* + } + } + }; + ($(#[$meta:meta])* $vis:vis struct $name:ident($field_vis:vis $ty:ty);) => { + $(#[$meta])* + $vis struct $name($field_vis $ty); + + impl crate::content_hash::ContentHash for $name { + fn hash(&self, state: &mut impl digest::Update) { + <$ty as crate::content_hash::ContentHash>::hash(&self.0, state); + } + } + }; +} + +#[cfg(test)] +mod tests { + use std::collections::{BTreeMap, HashMap}; + + use blake2::{Blake2b512, Digest}; + + use super::*; + + #[test] + fn test_string_sanity() { + let a = "a".to_string(); + let b = "b".to_string(); + assert_eq!(hash(&a), hash(&a.clone())); + assert_ne!(hash(&a), hash(&b)); + assert_ne!(hash(&"a".to_string()), hash(&"a\0".to_string())); + } + + #[test] + fn test_hash_map_key_value_distinction() { + let a = [("ab".to_string(), "cd".to_string())] + .into_iter() + .collect::>(); + let b = [("a".to_string(), "bcd".to_string())] + .into_iter() + .collect::>(); + + assert_ne!(hash(&a), hash(&b)); + } + + #[test] + fn test_btree_map_key_value_distinction() { + let a = [("ab".to_string(), "cd".to_string())] + .into_iter() + .collect::>(); + let b = [("a".to_string(), "bcd".to_string())] + .into_iter() + .collect::>(); + + assert_ne!(hash(&a), hash(&b)); + } + + #[test] + fn test_struct_sanity() { + content_hash! 
{ + struct Foo { x: i32 } + } + assert_ne!(hash(&Foo { x: 42 }), hash(&Foo { x: 12 })); + } + + #[test] + fn test_option_sanity() { + assert_ne!(hash(&Some(42)), hash(&42)); + assert_ne!(hash(&None::), hash(&42i32)); + } + + #[test] + fn test_slice_sanity() { + assert_ne!(hash(&[42i32][..]), hash(&[12i32][..])); + assert_ne!(hash(&([] as [i32; 0])[..]), hash(&[42i32][..])); + assert_ne!(hash(&([] as [i32; 0])[..]), hash(&())); + assert_ne!(hash(&42i32), hash(&[42i32][..])); + } + + #[test] + fn test_consistent_hashing() { + content_hash! { + struct Foo { x: Vec>, y: i64 } + } + insta::assert_snapshot!( + hex::encode(&hash(&Foo { + x: vec![None, Some(42)], + y: 17 + })), + @"14e42ea3d680bc815d0cea8ac20d3e872120014fb7bba8d82c3ffa7a8e6d63c41ef9631c60b73b150e3dd72efe50e8b0248321fe2b7eea09d879f3757b879372" + ); + } + + fn hash(x: &(impl ContentHash + ?Sized)) -> digest::Output { + let mut hasher = Blake2b512::default(); + x.hash(&mut hasher); + hasher.finalize() + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index a7053967630..612285aa5e6 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -14,6 +14,9 @@ #![deny(unused_must_use)] +#[macro_use] +mod content_hash; + pub mod backend; pub mod commit; pub mod commit_builder; diff --git a/lib/src/op_store.rs b/lib/src/op_store.rs index 3d74c7f555b..1e7387fe6c0 100644 --- a/lib/src/op_store.rs +++ b/lib/src/op_store.rs @@ -18,9 +18,12 @@ use std::fmt::{Debug, Error, Formatter}; use thiserror::Error; use crate::backend::{CommitId, Timestamp}; +use crate::content_hash::ContentHash; -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct WorkspaceId(String); +content_hash! { + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct WorkspaceId(String); +} impl Debug for WorkspaceId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -44,8 +47,10 @@ impl WorkspaceId { } } -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct ViewId(Vec); +content_hash! 
{ + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct ViewId(Vec); +} impl Debug for ViewId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -75,8 +80,10 @@ impl ViewId { } } -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct OperationId(Vec); +content_hash! { + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct OperationId(Vec); +} impl Debug for OperationId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -115,6 +122,26 @@ pub enum RefTarget { }, } +impl ContentHash for RefTarget { + fn hash(&self, state: &mut impl digest::Update) { + use RefTarget::*; + match *self { + Normal(ref id) => { + state.update(&0u32.to_le_bytes()); + id.hash(state); + } + Conflict { + ref removes, + ref adds, + } => { + state.update(&1u32.to_le_bytes()); + removes.hash(state); + adds.hash(state); + } + } + } +} + impl RefTarget { pub fn is_conflict(&self) -> bool { matches!(self, RefTarget::Conflict { .. }) @@ -146,67 +173,75 @@ impl RefTarget { } } -#[derive(Default, PartialEq, Eq, Clone, Debug)] -pub struct BranchTarget { - /// The commit the branch points to locally. `None` if the branch has been - /// deleted locally. - pub local_target: Option, - // TODO: Do we need to support tombstones for remote branches? For example, if the branch - // has been deleted locally and you pull from a remote, maybe it should make a difference - // whether the branch is known to have existed on the remote. We may not want to resurrect - // the branch if the branch's state on the remote was just not known. - pub remote_targets: BTreeMap, -} - -/// Represents the way the repo looks at a given time, just like how a Tree -/// object represents how the file system looks at a given time. -#[derive(PartialEq, Eq, Clone, Debug, Default)] -pub struct View { - /// All head commits - pub head_ids: HashSet, - /// Heads of the set of public commits. 
- pub public_head_ids: HashSet, - pub branches: BTreeMap, - pub tags: BTreeMap, - pub git_refs: BTreeMap, - /// The commit the Git HEAD points to. - // TODO: Support multiple Git worktrees? - // TODO: Do we want to store the current branch name too? - pub git_head: Option, - // The commit that *should be* checked out in the workspace. Note that the working copy - // (.jj/working_copy/) has the source of truth about which commit *is* checked out (to be - // precise: the commit to which we most recently completed an update to). - pub wc_commit_ids: HashMap, -} - -/// Represents an operation (transaction) on the repo view, just like how a -/// Commit object represents an operation on the tree. -/// -/// Operations and views are not meant to be exchanged between repos or users; -/// they represent local state and history. -/// -/// The operation history will almost always be linear. It will only have -/// forks when parallel operations occurred. The parent is determined when -/// the transaction starts. When the transaction commits, a lock will be -/// taken and it will be checked that the current head of the operation -/// graph is unchanged. If the current head has changed, there has been -/// concurrent operation. -#[derive(PartialEq, Eq, Clone, Debug)] -pub struct Operation { - pub view_id: ViewId, - pub parents: Vec, - pub metadata: OperationMetadata, +content_hash! { + #[derive(Default, PartialEq, Eq, Clone, Debug)] + pub struct BranchTarget { + /// The commit the branch points to locally. `None` if the branch has been + /// deleted locally. + pub local_target: Option, + // TODO: Do we need to support tombstones for remote branches? For example, if the branch + // has been deleted locally and you pull from a remote, maybe it should make a difference + // whether the branch is known to have existed on the remote. We may not want to resurrect + // the branch if the branch's state on the remote was just not known. 
+ pub remote_targets: BTreeMap, + } } -#[derive(PartialEq, Eq, Clone, Debug)] -pub struct OperationMetadata { - pub start_time: Timestamp, - pub end_time: Timestamp, - // Whatever is useful to the user, such as exact command line call - pub description: String, - pub hostname: String, - pub username: String, - pub tags: HashMap, +content_hash! { + /// Represents the way the repo looks at a given time, just like how a Tree + /// object represents how the file system looks at a given time. + #[derive(PartialEq, Eq, Clone, Debug, Default)] + pub struct View { + /// All head commits + pub head_ids: HashSet, + /// Heads of the set of public commits. + pub public_head_ids: HashSet, + pub branches: BTreeMap, + pub tags: BTreeMap, + pub git_refs: BTreeMap, + /// The commit the Git HEAD points to. + // TODO: Support multiple Git worktrees? + // TODO: Do we want to store the current branch name too? + pub git_head: Option, + // The commit that *should be* checked out in the workspace. Note that the working copy + // (.jj/working_copy/) has the source of truth about which commit *is* checked out (to be + // precise: the commit to which we most recently completed an update). + pub wc_commit_ids: HashMap, + } +} + +content_hash! { + /// Represents an operation (transaction) on the repo view, just like how a + /// Commit object represents an operation on the tree. + /// + /// Operations and views are not meant to be exchanged between repos or users; + /// they represent local state and history. + /// + /// The operation history will almost always be linear. It will only have + /// forks when parallel operations occurred. The parent is determined when + /// the transaction starts. When the transaction commits, a lock will be + /// taken and it will be checked that the current head of the operation + /// graph is unchanged. If the current head has changed, there has been a + /// concurrent operation.
+ #[derive(PartialEq, Eq, Clone, Debug)] + pub struct Operation { + pub view_id: ViewId, + pub parents: Vec, + pub metadata: OperationMetadata, + } +} + +content_hash! { + #[derive(PartialEq, Eq, Clone, Debug)] + pub struct OperationMetadata { + pub start_time: Timestamp, + pub end_time: Timestamp, + // Whatever is useful to the user, such as exact command line call + pub description: String, + pub hostname: String, + pub username: String, + pub tags: HashMap, + } } impl OperationMetadata { diff --git a/lib/src/simple_op_store.rs b/lib/src/simple_op_store.rs index 372b19cff58..54a950184e3 100644 --- a/lib/src/simple_op_store.rs +++ b/lib/src/simple_op_store.rs @@ -16,15 +16,16 @@ use std::collections::BTreeMap; use std::fmt::Debug; use std::fs; use std::fs::File; -use std::io::{ErrorKind, Write}; +use std::io::ErrorKind; use std::path::PathBuf; -use blake2::{Blake2b512, Digest}; +use blake2::Blake2b512; use itertools::Itertools; use protobuf::{Message, MessageField}; use tempfile::{NamedTempFile, PersistError}; use crate::backend::{CommitId, MillisSinceEpoch, Timestamp}; +use crate::content_hash::ContentHash; use crate::file_util::persist_content_addressed_temp_file; use crate::op_store::{ BranchTarget, OpStore, OpStoreError, OpStoreResult, Operation, OperationId, OperationMetadata, @@ -95,12 +96,9 @@ impl OpStore for SimpleOpStore { let temp_file = NamedTempFile::new_in(&self.path)?; let proto = view_to_proto(view); - let mut proto_bytes: Vec = Vec::new(); - proto.write_to_writer(&mut proto_bytes)?; + proto.write_to_writer(&mut temp_file.as_file())?; - temp_file.as_file().write_all(&proto_bytes)?; - - let id = ViewId::new(Blake2b512::digest(&proto_bytes).to_vec()); + let id = ViewId::new(hash(view).to_vec()); persist_content_addressed_temp_file(temp_file, self.view_path(&id))?; Ok(id) @@ -118,12 +116,9 @@ impl OpStore for SimpleOpStore { let temp_file = NamedTempFile::new_in(&self.path)?; let proto = operation_to_proto(operation); - let mut proto_bytes: Vec = 
Vec::new(); - proto.write_to_writer(&mut proto_bytes)?; - - temp_file.as_file().write_all(&proto_bytes)?; + proto.write_to_writer(&mut temp_file.as_file())?; - let id = OperationId::new(Blake2b512::digest(&proto_bytes).to_vec()); + let id = OperationId::new(hash(operation).to_vec()); persist_content_addressed_temp_file(temp_file, self.operation_path(&id))?; Ok(id) @@ -363,6 +358,13 @@ fn ref_target_from_proto(proto: &crate::protos::op_store::RefTarget) -> RefTarge } } +fn hash(x: &impl ContentHash) -> digest::Output { + use digest::Digest; + let mut hasher = Blake2b512::default(); + x.hash(&mut hasher); + hasher.finalize() +} + #[cfg(test)] mod tests { use maplit::{btreemap, hashmap, hashset};