From f1168be39d11b07d5f939e9de97226a27410bf39 Mon Sep 17 00:00:00 2001 From: Benno Rice Date: Fri, 24 Sep 2021 14:18:18 +1000 Subject: [PATCH 1/4] compose: rewrite rpmdb timestamps if SOURCE_DATE_EPOCH is set RPM package headers may contain several values that are either timestamps or derived from timestamps. These introduce variation into the RPM database. This patch looks for the SOURCE_DATE_EPOCH environment variable and, if that is present, rewrites these values to match the value it contains. --- docs/treefile.md | 6 +++ rust/src/composepost.rs | 18 +++++--- rust/src/lib.rs | 3 +- rust/src/normalization.rs | 78 +++++++++++++++++++++++++++++++++- rust/src/treefile.rs | 11 ++++- src/libpriv/rpmostree-core.cxx | 2 +- 6 files changed, 108 insertions(+), 10 deletions(-) diff --git a/docs/treefile.md b/docs/treefile.md index 35648bcd72..f14d1ce91e 100644 --- a/docs/treefile.md +++ b/docs/treefile.md @@ -111,6 +111,12 @@ It supports the following parameters: library in the target filesystem tree understands. However, this is a relatively new default, so the value `host` is provided as a fallback + * `rpmdb-normalize`: boolean, optional. Defaults to `false`. If enabled, + this will perform various manipulations of the RPM database to, as much + as possible, guarantee a deterministic result for the on-disk RPM + database. Requires the `SOURCE_DATE_EPOCH` environment variable to be set + to the UNIX epoch time to be used as the build timestamp. + * `cliwrap`: boolean, optional. Defaults to `false`. If enabled, rpm-ostree will replace binaries such as `/usr/bin/rpm` with wrappers that intercept unsafe operations, or adjust functionality. 
diff --git a/rust/src/composepost.rs b/rust/src/composepost.rs index 7bb616a26f..8178226e1b 100644 --- a/rust/src/composepost.rs +++ b/rust/src/composepost.rs @@ -9,6 +9,7 @@ use crate::bwrap::Bubblewrap; use crate::cxxrsutil::*; use crate::ffi::BubblewrapMutability; use crate::ffiutil::ffi_view_openat_dir; +use crate::normalization; use crate::passwd::PasswdDB; use crate::treefile::Treefile; use crate::{bwrap, importer}; @@ -951,7 +952,7 @@ fn hardlink_rpmdb_base_location( } #[context("Rewriting rpmdb for target native format")] -fn rewrite_rpmdb_for_target_inner(rootfs_dfd: &openat::Dir) -> Result<()> { +fn rewrite_rpmdb_for_target_inner(rootfs_dfd: &openat::Dir, normalize: bool) -> Result<()> { let tempetc = crate::core::prepare_tempetc_guard(rootfs_dfd.as_raw_fd())?; let dbfd = Rc::new( @@ -980,6 +981,12 @@ fn rewrite_rpmdb_for_target_inner(rootfs_dfd: &openat::Dir) -> Result<()> { let mut dbfd = Rc::try_unwrap(dbfd).unwrap(); dbfd.seek(std::io::SeekFrom::Start(0))?; + // In the interests of build stability, rewrite the INSTALLTIME and INSTALLTID tags + // to be deterministic and derived from `SOURCE_DATE_EPOCH` if requested. + if normalize { + normalization::rewrite_rpmdb_timestamps(&mut dbfd)?; + } + // Fork the target rpmdb to write the content from memory to disk let mut bwrap = Bubblewrap::new_with_mutability(rootfs_dfd, BubblewrapMutability::RoFiles)?; bwrap.append_child_argv(&["rpmdb", dbpath_arg.as_str(), "--importdb"]); @@ -994,10 +1001,11 @@ fn rewrite_rpmdb_for_target_inner(rootfs_dfd: &openat::Dir) -> Result<()> { Ok(()) } -pub(crate) fn rewrite_rpmdb_for_target(rootfs_dfd: i32) -> CxxResult<()> { - Ok(rewrite_rpmdb_for_target_inner(&ffi_view_openat_dir( - rootfs_dfd, - ))?) +pub(crate) fn rewrite_rpmdb_for_target(rootfs_dfd: i32, normalize: bool) -> CxxResult<()> { + Ok(rewrite_rpmdb_for_target_inner( + &ffi_view_openat_dir(rootfs_dfd), + normalize, + )?) } /// Recursively hard-link `source` hierarchy to `target` directory. 
diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 837db99ee1..f747d24c93 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -199,7 +199,7 @@ pub mod ffi { cancellable: Pin<&mut GCancellable>, ) -> Result<()>; fn compose_postprocess_rpm_macro(rootfs_dfd: i32) -> Result<()>; - fn rewrite_rpmdb_for_target(rootfs_dfd: i32) -> Result<()>; + fn rewrite_rpmdb_for_target(rootfs_dfd: i32, normalize: bool) -> Result<()>; fn directory_size(dfd: i32, mut cancellable: Pin<&mut GCancellable>) -> Result; } @@ -370,6 +370,7 @@ pub mod ffi { fn get_selinux(&self) -> bool; fn get_releasever(&self) -> &str; fn rpmdb_backend_is_target(&self) -> bool; + fn should_normalize_rpmdb(&self) -> bool; fn get_files_remove_regex(&self, package: &str) -> Vec; fn get_checksum(&self, repo: Pin<&mut OstreeRepo>) -> Result; fn get_ostree_ref(&self) -> String; diff --git a/rust/src/normalization.rs b/rust/src/normalization.rs index 97068a8f66..fd1cc2434e 100644 --- a/rust/src/normalization.rs +++ b/rust/src/normalization.rs @@ -6,10 +6,11 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT use crate::nameservice::shadow::parse_shadow_content; -use anyhow::Result; +use anyhow::{anyhow, Result}; use fn_error_context::context; use lazy_static::lazy_static; -use std::io::{BufReader, Seek, SeekFrom}; +use std::convert::TryInto; +use std::io::{BufReader, Read, Seek, SeekFrom, Write}; lazy_static! 
{ static ref SOURCE_DATE_EPOCH_RAW: Option = std::env::var("SOURCE_DATE_EPOCH").ok(); @@ -45,3 +46,76 @@ pub(crate) fn normalize_etc_shadow(rootfs: &openat::Dir) -> Result<()> { Ok(()) } + +const RPM_HEADER_MAGIC: [u8; 8] = [0x8E, 0xAD, 0xE8, 0x01, 0x00, 0x00, 0x00, 0x00]; +const RPMTAG_INSTALLTIME: u32 = 1008; +const RPMTAG_INSTALLTID: u32 = 1128; + +#[context("Normalizing rpmdb timestamps for build stability")] +pub(crate) fn rewrite_rpmdb_timestamps(rpmdb: &mut F) -> Result<()> { + let source_date_epoch = if let Some(source_date_epoch) = *SOURCE_DATE_EPOCH { + source_date_epoch as u32 + } else { + return Ok(()); + }; + + // Remember where we started + let pos = rpmdb.stream_position()?; + + let mut buffer: [u8; 16] = [0; 16]; + let install_tid = source_date_epoch; + let mut install_time = source_date_epoch; + + loop { + // Read in a header record + match rpmdb.read_exact(&mut buffer) { + Err(ref e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + _ => (), + }; + + // Make sure things are sane + if buffer[..8] != RPM_HEADER_MAGIC { + return Err(anyhow!("Bad RPM header magic in RPM database")); + } + + // Grab the count of index records and the size of the data blob + let record_count = u32::from_be_bytes(buffer[8..12].try_into()?); + let data_size = u32::from_be_bytes(buffer[12..].try_into()?); + + // Loop through the records looking for ones that point at things + // that are, or are derived from, timestamps + let mut offsets = Vec::new(); + for _ in 0..record_count { + rpmdb.read_exact(&mut buffer)?; + + let tag = u32::from_be_bytes(buffer[..4].try_into()?); + if tag == RPMTAG_INSTALLTIME || tag == RPMTAG_INSTALLTID { + offsets.push((tag, u32::from_be_bytes(buffer[8..12].try_into()?))); + } + } + + // Work through the data blob replacing the timestamp-derived values + // with the timestamp we want + offsets.sort_unstable_by_key(|(_, offset)| *offset); + let mut offset = 0; + for (tag, value_offset) in offsets { + 
rpmdb.seek(std::io::SeekFrom::Current((value_offset - offset) as i64))?; + if tag == RPMTAG_INSTALLTID { + rpmdb.write_all(&install_tid.to_be_bytes())?; + } else if tag == RPMTAG_INSTALLTIME { + rpmdb.write_all(&install_time.to_be_bytes())?; + install_time += 1; + } + offset = value_offset + std::mem::size_of::() as u32; + } + + // Move to the next record + rpmdb.seek(std::io::SeekFrom::Current((data_size - offset) as i64))?; + } + + // Seek back to where we were before + rpmdb.seek(std::io::SeekFrom::Start(pos))?; + + Ok(()) +} diff --git a/rust/src/treefile.rs b/rust/src/treefile.rs index c996bdac73..c6dd0a4514 100644 --- a/rust/src/treefile.rs +++ b/rust/src/treefile.rs @@ -382,7 +382,8 @@ fn treefile_merge(dest: &mut TreeComposeConfig, src: &mut TreeComposeConfig) { preserve_passwd, check_passwd, check_groups, - postprocess_script + postprocess_script, + rpmdb_normalize ); merge_hashsets!(ignore_removed_groups, ignore_removed_users); merge_maps!(add_commit_metadata); @@ -732,6 +733,10 @@ impl Treefile { .map_or(true, |b| *b != RpmdbBackend::Host) } + pub(crate) fn should_normalize_rpmdb(&self) -> bool { + self.parsed.rpmdb_normalize.unwrap_or(false) + } + pub(crate) fn get_files_remove_regex(&self, package: &str) -> Vec { let mut files_to_remove: Vec = Vec::new(); if let Some(ref packages) = self.parsed.remove_from_packages { @@ -1282,6 +1287,10 @@ pub(crate) struct TreeComposeConfig { #[serde(skip_serializing_if = "Option::is_none")] pub(crate) rpmdb: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "rpmdb-normalize")] + pub(crate) rpmdb_normalize: Option, + #[serde(flatten)] pub(crate) legacy_fields: LegacyTreeComposeConfigFields, diff --git a/src/libpriv/rpmostree-core.cxx b/src/libpriv/rpmostree-core.cxx index 0a717023a6..1e7ddc7315 100644 --- a/src/libpriv/rpmostree-core.cxx +++ b/src/libpriv/rpmostree-core.cxx @@ -3944,7 +3944,7 @@ write_rpmdb (RpmOstreeContext *self, if (self->treefile_rs && 
self->treefile_rs->rpmdb_backend_is_target()) { g_print ("Regenerating rpmdb for target\n"); - CXX_TRY(rewrite_rpmdb_for_target(tmprootfs_dfd), error); + CXX_TRY(rewrite_rpmdb_for_target(tmprootfs_dfd, self->treefile_rs->should_normalize_rpmdb()), error); } else { From 2c03f71eb774a80403cf26f48baffb182ce117b0 Mon Sep 17 00:00:00 2001 From: Benno Rice Date: Wed, 3 Nov 2021 11:35:35 +1100 Subject: [PATCH 2/4] compose: remove static variables to make testing easier. We originally used (lazy) statics to hold the value of SOURCE_DATE_EPOCH if we were using it but this can interfere with unit testing. --- rust/src/normalization.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rust/src/normalization.rs b/rust/src/normalization.rs index fd1cc2434e..b517a77bbc 100644 --- a/rust/src/normalization.rs +++ b/rust/src/normalization.rs @@ -8,19 +8,19 @@ use crate::nameservice::shadow::parse_shadow_content; use anyhow::{anyhow, Result}; use fn_error_context::context; -use lazy_static::lazy_static; use std::convert::TryInto; use std::io::{BufReader, Read, Seek, SeekFrom, Write}; -lazy_static! 
{ - static ref SOURCE_DATE_EPOCH_RAW: Option = std::env::var("SOURCE_DATE_EPOCH").ok(); - static ref SOURCE_DATE_EPOCH: Option = SOURCE_DATE_EPOCH_RAW - .as_ref() - .map(|s| s.parse::().expect("bad number in SOURCE_DATE_EPOCH")); +pub(crate) fn source_date_epoch() -> Option { + if let Some(raw) = source_date_epoch_raw() { + raw.parse().ok() + } else { + None + } } -pub(crate) fn source_date_epoch_raw() -> Option<&'static str> { - SOURCE_DATE_EPOCH_RAW.as_ref().map(|s| s.as_str()) +pub(crate) fn source_date_epoch_raw() -> Option { + std::env::var("SOURCE_DATE_EPOCH").ok() } #[context("Rewriting /etc/shadow to remove lstchg field")] @@ -53,8 +53,8 @@ const RPMTAG_INSTALLTID: u32 = 1128; #[context("Normalizing rpmdb timestamps for build stability")] pub(crate) fn rewrite_rpmdb_timestamps(rpmdb: &mut F) -> Result<()> { - let source_date_epoch = if let Some(source_date_epoch) = *SOURCE_DATE_EPOCH { - source_date_epoch as u32 + let source_date = if let Some(source_date) = source_date_epoch() { + source_date as u32 } else { return Ok(()); }; @@ -63,8 +63,8 @@ pub(crate) fn rewrite_rpmdb_timestamps(rpmdb: &mut F) -> let pos = rpmdb.stream_position()?; let mut buffer: [u8; 16] = [0; 16]; - let install_tid = source_date_epoch; - let mut install_time = source_date_epoch; + let install_tid = source_date; + let mut install_time = source_date; loop { // Read in a header record From cd6ca0af3cfd152ff16feb2248c02294d3b38d19 Mon Sep 17 00:00:00 2001 From: Benno Rice Date: Fri, 29 Oct 2021 13:53:29 +1100 Subject: [PATCH 3/4] compose: add tests for rpmdb timestamp normalisation --- Cargo.lock | 9 ++-- Cargo.toml | 1 + rust/src/normalization.rs | 68 +++++++++++++++++++++++++++++++ rust/test/dummy-rpm-database.bin | Bin 0 -> 6844 bytes 4 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 rust/test/dummy-rpm-database.bin diff --git a/Cargo.lock b/Cargo.lock index 141f987935..b1df597236 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1269,9 +1269,9 @@ dependencies = [ 
[[package]] name = "openssl" -version = "0.10.35" +version = "0.10.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "549430950c79ae24e6d02e0b7404534ecf311d94cc9f861e9e4020187d13d885" +checksum = "0c7ae222234c30df141154f159066c5093ff73b63204dcda7121eb082fc56a95" dependencies = [ "bitflags", "cfg-if 1.0.0", @@ -1289,9 +1289,9 @@ checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a" [[package]] name = "openssl-sys" -version = "0.9.65" +version = "0.9.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a7907e3bfa08bb85105209cdfcb6c63d109f8f6c1ed6ca318fff5c1853fbc1d" +checksum = "c6517987b3f8226b5da3661dad65ff7f300cc59fb5ea8333ca191fc65fde3edf" dependencies = [ "autocfg", "cc", @@ -1847,6 +1847,7 @@ dependencies = [ "nix 0.23.0", "openat", "openat-ext", + "openssl", "os-release", "ostree-ext", "paste", diff --git a/Cargo.toml b/Cargo.toml index 3aad862ffb..76a7e777fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,7 @@ memfd = "0.4.1" nix = "0.23.0" openat = "0.1.21" openat-ext = "^0.2.2" +openssl = "0.10.38" os-release = "0.1.0" ostree-ext = "0.3.0" paste = "1.0" diff --git a/rust/src/normalization.rs b/rust/src/normalization.rs index b517a77bbc..7f588d3238 100644 --- a/rust/src/normalization.rs +++ b/rust/src/normalization.rs @@ -119,3 +119,71 @@ pub(crate) fn rewrite_rpmdb_timestamps(rpmdb: &mut F) -> Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::Result; + use openssl::sha::sha256; + use std::io::Cursor; + + #[test] + fn rpmdb_timestamp_rewrite() -> Result<()> { + // This is a pretty simple smoke test. We have a dummy RPM database + // dump that contains a specific initial timestamp in several tags: + // + // - RPMTAG_INSTALLTID + // - RPMTAG_INSTALLTIME + // - RPMTAG_BUILDTIME + // + // The first two are the ones we want changed, the latter is a canary + // to ensure we don't get overzealous. 
+ // + // We know the checksums of both the unrewritten and rewritten dumps + // so all we need to do is make sure that the initial database matches + // what we expect, and then run the rewrite, and ensure the new + // checksum matches what we expect. + // + // More complicated testing is left to other test cycles. + + let rpmdb = include_bytes!("../test/dummy-rpm-database.bin").to_vec(); + + const REWRITE_TIMESTAMP: u32 = 1445437680; + const INITIAL_CHECKSUM: [u8; 32] = [ + 0x66, 0xac, 0x68, 0x75, 0xe7, 0x40, 0x99, 0x64, 0xd0, 0x04, 0xde, 0xff, 0x09, 0x80, + 0x22, 0x77, 0xb0, 0xeb, 0x63, 0x7a, 0xa9, 0x14, 0x62, 0x4e, 0xda, 0x52, 0x36, 0x06, + 0x8b, 0x23, 0x39, 0xec, + ]; + const REWRITE_CHECKSUM: [u8; 32] = [ + 0xac, 0x79, 0xb9, 0xa9, 0x9b, 0x95, 0x73, 0x81, 0x5f, 0x7c, 0x90, 0xbb, 0x27, 0x49, + 0x55, 0xba, 0x1a, 0x77, 0xcd, 0xfc, 0xde, 0x6e, 0xa0, 0xf9, 0xc4, 0x9c, 0x6e, 0xea, + 0x88, 0x31, 0x15, 0x43, + ]; + + // Calculate and check initial checksum. + let checksum = sha256(&rpmdb); + assert_eq!(checksum[..], INITIAL_CHECKSUM[..]); + + // Override SOURCE_DATE_EPOCH, retaining original value for later. + let source_date = std::env::var_os("SOURCE_DATE_EPOCH"); + std::env::set_var("SOURCE_DATE_EPOCH", REWRITE_TIMESTAMP.to_string()); + + // Actually do the rewrite. + let mut cursor = Cursor::new(rpmdb); + rewrite_rpmdb_timestamps(&mut cursor)?; + let rpmdb = cursor.into_inner(); + + // Restore or remove the original SOURCE_DATE_EPOCH. + if let Some(value) = source_date { + std::env::set_var("SOURCE_DATE_EPOCH", value); + } else { + std::env::remove_var("SOURCE_DATE_EPOCH"); + } + + // Calculate and check checksum of rewritten data. 
+ let checksum = sha256(&rpmdb); + assert_eq!(checksum[..], REWRITE_CHECKSUM[..]); + + Ok(()) + } +} diff --git a/rust/test/dummy-rpm-database.bin b/rust/test/dummy-rpm-database.bin new file mode 100644 index 0000000000000000000000000000000000000000..52d5c8ecf0e81e89e34fa04e19b1f480905abc43 GIT binary patch literal 6844 zcmeI0e~cAZ702iC9%Zdmt=rnxXnZv^S(=??e*ZAVg20wF1PE*6lAt(q=gz(t9`CI) zvt>bzv?0x!5G1jGHZ4Y@{?V#2X@ZG~n@W@>m|~p-_XXf0;?xuf` z_782+N#>pT%(-*toOAEo`Mx>y{H2;!ZEG~GaT13QV_$~jVb~8~Pp}VQrwwiEmtb$h zzJ&Qt%+!|re`dam`7-Q#uz!g83iAr)zrr@KJIsHBrTzovzr&J$J@Y?c$-j~L8Z7lM zng0z-{x35Bhj~xM4eE>b{mj(1j{N{L^+o(7^X;(IewO*YuvCwO%*&a7$$SSawWnI5 z9*y_IQu|ZPR43vy%v2|e;kV3p!_xKFneTx``*N62^TxfEzqSH)DfmA2Q@w~k%=|6p zAz1WZn}elzMwx%X{0J=WS3AqRk@+0+7%biYJo5xBwZ8;Q@oZ=RRpuS+SMh(Y^3!xF z@Bew`_c6z?h;IqyMe!vS*IUdquJG4)GpEc`%pK;8xqwA{bu~WI%!k?k7%cLsKgCS* zg4)k9&oCcj-phQPc^@pEkNOE%jOE}r*w6JF{5hE7JqL??2G7IN{Vp$F^oUN@PF#I9&kYUYTg*^ACCY@ul%oqOhTrzfILtCv&pu|2rC*OfEvypY|Z zIo?Z)Jy9l`o7zHlbE!Sr&8MVjr)`;NopyJ6Mwo zaHVaSmLr283T)FcQY(s0!}J})a9t~qrY{XghMt`UQdp^Nd6r>T-Dz6Z>lM|{g2;8` zrrRTVCPuO;np`)qEoiYY?;!-Kd(KLv)!Dx>OE?T<&y?IjIelHK0+}Z92CwV{O%Ro|Tzt z`WCCHPd4?b_LOYupN%>l{K_orWvkO(wmT|xRBk$J7khDjYFJ;tb9BqL&y0<3-#NbJ z(QWHCtj6Wl`A0U5eQMpr#Ll(j;~UUo%jWT>p2`^^r;2v3n*;I!lTV+@dW954FVpiP z>veJO$qZ97OY})0^n6dd5G{%3G?ntzWu&P5Dh$A?OeU?UV57I9ERkKA=t7vc&^+pP zdl<~B^={8;>br~Su3SBdKC-qzd{Mr)D_V%DH=XO#-97DYq8Cxzp=daG(TVbwN&#uX zA<}uQiK%u^o7tyXJoWV|X1#nI^vNzQT58+6WqfR+nYSd0o{OwKRSY-Rj;|XVYZh{* zXv!H>zP!J>0Asuj$FnbOc=7(pAI>hldiT*4Pu}(hP7O>KsUB#yNFwPQR+4zG85n_S z$4(Hqw(q5;VK|}hI*A(^w(GlT6#Fs}c;XBZq@m~8fghDOqWM#5g&Q}nAI34(s)5=2 zcU^h)*@sUo+xYlbo_=}hg$Lrmi-hYMuHy#2FC*KvOvjdK;zXXA_)cQlq3wr`W;wQP zB&m}G!cFib$I``k#bzu55eaEWVQfV~8ib~Ucq}_Y;Hm4`wx7lxo_7R*bqFg5Z8ZZw zby}AWEz~!(#wmR>gk!{f%ld}i<9L4h`eu;%GAwBZdT*1TVj%uFv(h&d6Zw_Cq2-QP z>6X2!dt)HC~;sV~wuPcSQebAVatn_1@N zur$^$G2a1;_lZ`%FL0>z%}MrCO{x7W%y+?346id&o#=YX3-LEt-;{c137B*M{Iw=5 
z-erA#L-it7`sP_?8b|bBQ}4e~=AXe*|4QGSX5I)({hwtXgGIcxbIcR4*VJYV|yMzCL2{(6f!W!}ZyVdn8}6kxoetmuOhwA#l3$Tc%v7+L> zz6qEyhNZrF2p0YK_01;sD}6(Gp?-aRbF|{VzESxoeKW^?rEgR|R9A|J@`1id+gV=R zlsATEE4ctxyJ-0T;zcqIWtgO~ zowz>!*`)X%6B{mqij2@o6WcWc8K#zL`k@eko%&Ea_yZ)2BtmkeWoQm#iW~S@tZmj} zeLIKqO55<-ddu48gVZs)Ro`xCO(p&dEY?o#US^7c-fJIWR$6D6S!o@*KV7f1&KhQ= zb$G3>E3LDY{nucrf2D1d)}eP4^{2GXS1ayoompnQXG?8!keT`-t#gQ3X`RE&O6xqu zyd0L+4y9d`);Z07rFG6S(r=*h+k=)8r_4m1l1M3a!g6v+y{oQ zwjP#PX`LT155XeG+6h?dZiog>Uj>wKG8 zX`PeIGqA|Neu|mq5wRLirFCdLkyBsmaJ~9khwIhXIy7&QPhaZ<75BByCT68|wpRW3 zwaypVue8qBD(-8Y6BYNhj>?CgOT^z$`6#WU@@cW3@`2W2nNzf6(Q0Q2-i0Rqabl}4 z_AKT%`KB}cHgwN$rF%v`CjB6FVmFLkEV+;}iEjmo#7gN{_|}6()J-DW2z=8GEi;U< zY6gMjdye!(nBdPR1y9>e-Loii$R1O-`qGEK=`3ooxGMA1&l(!kch+~Ez^8H`>~g1L zJm1=Y{fV*;nzH0rT!(ih(EnZny;c45(@&oM*UM+_eDU>fTCW}b=hvU3F|YJbYD?Ek z@VUts_}t<+5gx0=m64ISUg$(Z_+~BW%d}hW=yd;5kam-XCb|^y6a2(Tiydd>re4L4eY2lAe zpc(id6NmUe14-oifsFk?7?v5t_%`HAOW1ha9m^J>^gJhmtcpDW&1RWa>f?}J5?GH& KTVT7XfBp;dBqUn^ literal 0 HcmV?d00001 From 3bd2bdbb2aa98bb5bd12446ec8c9ee6619fdfb3c Mon Sep 17 00:00:00 2001 From: Benno Rice Date: Fri, 1 Oct 2021 13:28:07 +1000 Subject: [PATCH 4/4] compose: normalise underlying BDB files in RPM database Berkeley DB has several issues that cause unreproducible builds: 1) Upon creation each file is assigned a unique ID generated using a mixture of process ID, current time, and some randomness. 2) Pages used to hold data to be written out to disk are not zeroed prior to use. This leads to arbitrary data from the current process being written out to disk. 3) Unused fields in structures are not zeroed leading to arbitrary stack data being written out to disk. Replacing the unique file ID causes no issues broadly but to ensure "sufficient" uniqueness these are replaced with a value generated by feeding the current time or the current value of SOURCE_DATE_EPOCH along with a partial file path into sha256 and using the first 20 bytes as the ID. 
For the other problems, areas known to be unused are found and zeroed out. In order to ensure no change to data, the `db_dump` utility is run prior to any changes and the output is hashed using sha256. After changes the `db_verify` utility is run and, assuming this is successful, `db_dump` is re-run and the hash of the contents is compared. Any variation is considered a failure. This change does not look at any potential reproducibility issues in the ndb or sqlite backends. --- Cargo.lock | 29 ++++ Cargo.toml | 1 + docs/treefile.md | 3 +- rust/src/composepost.rs | 6 + rust/src/normalization.rs | 355 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 393 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index b1df597236..119428ffde 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,6 +76,28 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "binread" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16598dfc8e6578e9b597d9910ba2e73618385dc9f4b1d43dd92c349d6be6418f" +dependencies = [ + "binread_derive", + "rustversion", +] + +[[package]] +name = "binread_derive" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d9672209df1714ee804b1f4d4f68c8eb2a90b1f7a07acf472f88ce198ef1fed" +dependencies = [ + "either", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1826,6 +1848,7 @@ name = "rpmostree-rust" version = "0.1.0" dependencies = [ "anyhow", + "binread", "c_utf8", "camino", "chrono", @@ -1883,6 +1906,12 @@ dependencies = [ "ordered-multimap", ] +[[package]] +name = "rustversion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" + [[package]] name = "ryu" version = 
"1.0.5" diff --git a/Cargo.toml b/Cargo.toml index 76a7e777fc..d6a5e69018 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ rpm = "4" [dependencies] anyhow = "1.0.44" +binread = "2.2.0" c_utf8 = "0.1.0" camino = "1.0.5" chrono = { version = "0.4.19", features = ["serde"] } diff --git a/docs/treefile.md b/docs/treefile.md index f14d1ce91e..3668a91c9f 100644 --- a/docs/treefile.md +++ b/docs/treefile.md @@ -115,7 +115,8 @@ It supports the following parameters: this will perform various manipulations of the RPM database to, as much as possible, guarantee a deterministic result for the on-disk RPM database. Requires the `SOURCE_DATE_EPOCH` environment variable to be set - to the UNIX epoch time to be used as the build timestamp. + to the UNIX epoch time to be used as the build timestamp. Currently only + fully supports the `bdb` backend. Somewhat experimental. * `cliwrap`: boolean, optional. Defaults to `false`. If enabled, rpm-ostree will replace binaries such as `/usr/bin/rpm` with diff --git a/rust/src/composepost.rs b/rust/src/composepost.rs index 8178226e1b..2fafebcc08 100644 --- a/rust/src/composepost.rs +++ b/rust/src/composepost.rs @@ -996,6 +996,12 @@ fn rewrite_rpmdb_for_target_inner(rootfs_dfd: &openat::Dir, normalize: bool) -> .run(cancellable.gobj_rewrap()) .context("Failed to run rpmdb --importdb")?; + // Sometimes we can end up with build-to-build variance in the underlying rpmdb + // files. Attempt to sort that out, if requested. + if normalize { + normalization::normalize_rpmdb(rootfs_dfd, RPMOSTREE_RPMDB_LOCATION)?; + } + tempetc.undo()?; Ok(()) diff --git a/rust/src/normalization.rs b/rust/src/normalization.rs index 7f588d3238..cf8813a42d 100644 --- a/rust/src/normalization.rs +++ b/rust/src/normalization.rs @@ -5,11 +5,14 @@ // Copyright (C) 2021 Oracle and/or its affiliates. 
// SPDX-License-Identifier: Apache-2.0 OR MIT +use crate::bwrap::Bubblewrap; use crate::nameservice::shadow::parse_shadow_content; use anyhow::{anyhow, Result}; use fn_error_context::context; +use ostree_ext::gio; use std::convert::TryInto; use std::io::{BufReader, Read, Seek, SeekFrom, Write}; +use std::path::Path; pub(crate) fn source_date_epoch() -> Option { if let Some(raw) = source_date_epoch_raw() { @@ -120,6 +123,358 @@ pub(crate) fn rewrite_rpmdb_timestamps(rpmdb: &mut F) -> Ok(()) } +#[context("Rewriting rpmdb database files for build stability")] +pub(crate) fn normalize_rpmdb(rootfs: &openat::Dir, rpmdb_path: impl AsRef) -> Result<()> { + let source_date = if let Some(source_date) = source_date_epoch() { + source_date as u32 + } else { + return Ok(()); + }; + + let mut bwrap = + Bubblewrap::new_with_mutability(rootfs, crate::ffi::BubblewrapMutability::Immutable)?; + bwrap.append_child_argv(&["rpm", "--eval", "%{_db_backend}"]); + let cancellable = gio::Cancellable::new(); + let db_backend = bwrap.run_captured(Some(&cancellable))?; + + let db_backend = String::from_utf8(db_backend.to_vec())?; + + match db_backend.trim() { + "bdb" => bdb_normalize::normalize(rootfs, rpmdb_path, source_date), + "ndb" => Ok(()), + "sqlite" => Ok(()), + _ => Err(anyhow!("Unknown rpmdb backend: {}", db_backend)), + } +} + +mod bdb_normalize { + // Gather round, my friends, and I shall tell you a tale of trade-offs and consequences. + // + // Way, way back in the halcyon days of 1994 a piece of database software came into being. + // Computers were simpler then, and slower, and to save memory our database-creating + // protagonists elected to use a page cache in which to keep pages of data from the + // database. Then they elected to re-use pages from that cache that were no longer in use + // when they needed to add new pages to a database. Then, to save time presumably, they + // decided to not bother zeroing these pages because the left-over data was unimportant. 
+ // + // This, as you can imagine, wreaks merry hell on any attempt at creating deterministic + // output. But that's not all. + // + // When this software was extended to allow multiple access they needed a way to + // identify a given database file for locking purposes. So they gave each file a unique + // file ID. This also wreaks merry hell on any attempt at deterministic output, especially + // since it doesn't respect any of the things that might help that such as SOURCE_DATE_EPOCH. + // + // This leads to the eternal question: What do? + // + // This code knows just enough about the structure of BerkeleyDB BTree and Hash database + // files to know which bits are unused and writes zeros over those bits with extreme + // prejudice. It also constructs a file ID based purely on a provided timestamp and the + // name of the file in question. Both of these normalise the file sufficiently that we + // no longer see byte-wise variance given the same input data. + + use crate::bwrap::Bubblewrap; + use anyhow::{anyhow, Context, Result}; + use binread::{BinRead, BinReaderExt}; + use lazy_static::lazy_static; + use openat::SimpleType; + use openssl::sha::{sha256, Sha256}; + use ostree_ext::gio; + use std::io::{Read, Seek, SeekFrom, Write}; + use std::path::{Path, PathBuf}; + + // BerkeleyDB page types, limited to those found in the BTree and Hash database types. + #[derive(BinRead, Debug, Clone, Copy, PartialEq, Eq)] + #[repr(u8)] + #[br(repr=u8)] + enum PageType { + IBTree = 3, // An internal BTree page + LBTree = 5, // A leaf BTree page + Overflow = 7, // An overflow page (for values too long to fit in a "normal" page) + HashMeta = 8, // A Hash metadata page + BTreeMeta = 9, // A BTree metadata page + Hash = 13, // A Hash data page + } + + // Database metadata header. 
+ #[derive(BinRead, Debug)] + #[br(little)] + struct MetaHeader { + lsn: u64, // Log sequence number + pgno: u32, // Number of this page + magic: u32, // Magic (determines which type of database this is) + version: u32, // Database library version + pagesize: u32, // Page size for this database + encrypt_alg: u8, // Encryption algorithm (if used) + page_type: PageType, // Type of this page + metaflags: u8, // Metadata flags + unused1: u8, // Exactly what it says + free: u32, // Free list page number + last_pgno: u32, // Number of last page in database + nparts: u32, // Number of partitions + key_count: u32, // Cached key count + record_count: u32, // Cached record count + flags: u32, // Flags (type-dependent) + uid: [u8; 20], // File ID + } + + // Database metadata magic number value for BTree database type. + const BTREE_MAGIC: u32 = 0x00053162; + + // Database metadata magic number value for Hash database type. + const HASH_MAGIC: u32 = 0x00061561; + + // Size of the page header structure + const PAGE_HEADER_SIZE: u16 = 26; + + // Offset of the file ID field in the metadata header + const PAGE_HEADER_FILE_ID_OFFSET: u64 = 0x34; + + // The per-header page used in both BTree and Hash databases. + #[derive(BinRead, Debug)] + #[br(little)] + struct PageHeader { + lsn: u64, // Log sequence number + pgno: u32, // Number of this page + prev_pgno: u32, // Number of the previous page + next_pgno: u32, // Number of the next page + entries: u16, // Number of entries in this page + hf_offset: u16, // Offset to the first free byte in this page + level: u8, // BTree depth (leaf is 1, grows upwards) + page_type: PageType, // Type of this page + } + + // The types of BTree items found in a page + #[derive(BinRead, Debug)] + #[br(repr=u8)] + #[repr(u8)] + enum BTreeItemType { + KeyData = 1, // Actual key/data values + Duplicate = 2, // Duplicate entry + Overflow = 3, // Overflow + } + + // A BTree item, defined as a length and a type. Data is stored later + // in the page. 
+ #[derive(BinRead, Debug)] + #[br(little)] + struct BTreeItem { + len: u16, // Length of this item + item_type: BTreeItemType, // Type of this item + } + + // The types of Hash items found in a page + #[derive(BinRead, Debug)] + #[br(repr=u8)] + #[repr(u8)] + enum HashItemType { + KeyData = 1, // Actual key/data values + Duplicate = 2, // Duplicate entry + Offpage = 3, // Off-page (aka Overflow) + OffDup = 4, // Off-page duplicate + } + + lazy_static! { + static ref PROC_SELF_CWD: PathBuf = PathBuf::from("/proc/self/cwd"); + static ref PROC_SELF_FD: PathBuf = PathBuf::from("/proc/self/fd"); + } + + pub(super) fn normalize( + rootfs: &openat::Dir, + db_path: impl AsRef<Path>, + timestamp: u32, + ) -> Result<()> { + let db_path = db_path.as_ref(); + + for entry in rootfs.list_dir(db_path)? { + let entry = entry?; + + // We only care about regular files. + if !matches!(entry.simple_type(), Some(SimpleType::File)) { + continue; + } + + // We don't want any dotfiles, nor do we want to mess with the temporary + // files BerkeleyDB sometimes leaves around. + if entry + .file_name() + .to_str() + .filter(|name| !(name.starts_with('.') || name.starts_with("__db"))) + .is_none() + { + continue; + } + + let path = db_path.join(entry.file_name()); + + // As a pre-check, verify the database and take a checksum of the contents. + let old_digest = database_contents_digest(&path, rootfs) + .context("pre-normalization contents check")?; + + { + // Construct a new, deterministic file ID. + let mut file_id = Sha256::new(); + file_id.update(&timestamp.to_be_bytes()); + file_id.update(format!("bdb/{}", entry.file_name().to_str().unwrap()).as_bytes()); + let file_id = &file_id.finish()[..20]; + + // Open the file for update. + let mut db = rootfs.update_file(&path, 0o644)?; + + // Get the metadata header and make sure we're working on one of the + // types of file we care about. 
+ let meta_header: MetaHeader = db.read_le()?; + match (meta_header.magic, meta_header.page_type) { + (BTREE_MAGIC, PageType::BTreeMeta) => (), + (HASH_MAGIC, PageType::HashMeta) => (), + _ => continue, + }; + + // Seek to where the file ID lives and replace it. + db.seek(SeekFrom::Start(PAGE_HEADER_FILE_ID_OFFSET))?; + db.write_all(file_id)?; + + for pageno in 1..meta_header.last_pgno + 1 { + // Seek to the next page. + db.seek(SeekFrom::Start((pageno * meta_header.pagesize) as u64))?; + + // Read in the page header. + let header: PageHeader = db.read_le()?; + + // If this is an overflow page then all we need to do is seek to the start + // of free space and zero out the rest of the page. + if header.page_type == PageType::Overflow { + db.seek(SeekFrom::Current(header.hf_offset as i64))?; + let fill_length = meta_header + .pagesize + .saturating_sub((PAGE_HEADER_SIZE + header.hf_offset) as u32); + write_zeros(&mut db, fill_length)?; + continue; + } + + // For the other page types we have a series of 16-bit item offsets immediately + // after the page header. We need to collect those up. + let mut offsets: Vec<u16> = Vec::new(); + for _ in 0..header.entries { + offsets.push(db.read_le()?); + } + offsets.sort_unstable(); + + // Zero out the unused space after the item offsets. This will either be the + // entire rest of the page if there aren't any or the space from the end of + // the offset list to the start of the first item. + let empty = if offsets.is_empty() { + meta_header.pagesize - PAGE_HEADER_SIZE as u32 + } else { + *offsets.first().unwrap() as u32 + - (PAGE_HEADER_SIZE + header.entries * 2) as u32 + }; + write_zeros(&mut db, empty)?; + + let mut offset_iter = offsets.into_iter().peekable(); + while let Some(offset) = offset_iter.next() { + // Seek to the next item offset. 
+ db.seek(SeekFrom::Start( + (pageno * meta_header.pagesize + offset as u32) as u64, + ))?; + + if matches!(header.page_type, PageType::IBTree | PageType::LBTree) { + // BTree items consist of at least a 16-bit length and an 8-bit type. + let item: BTreeItem = db.read_le()?; + if header.page_type == PageType::IBTree { + // If this is an internal page (`IBTree`) then the byte immediately + // following the type field is unused. Zero it. + db.write_all(b"\x00")?; + } else if header.page_type == PageType::LBTree { + if let BTreeItemType::Overflow = item.item_type { + // BTree overflow entries don't use their length fields. Zero it. + db.seek(SeekFrom::Current(-3))?; + db.write_all(b"\x00\x00")?; + } else if let BTreeItemType::KeyData = item.item_type { + // Work out where the next item starts or if we're at the end of + // the page. + let next_offset = if let Some(next) = offset_iter.peek() { + *next + } else { + meta_header.pagesize as u16 + }; + + // Zero out the space between the end of this item and the start + // of the next (or the end of the page). + let remainder = next_offset - (offset + 3 + item.len); + if remainder != 0 { + db.seek(SeekFrom::Current(item.len as i64))?; + write_zeros(&mut db, remainder)?; + } + } + } + } else if header.page_type == PageType::Hash { + // Offpage (aka overflow) Hash entries have three unused bytes immediately + // after the 8-bit item type field. Zero them. + let item_type: HashItemType = db.read_le()?; + + if let HashItemType::Offpage = item_type { + db.write_all(b"\x00\x00\x00")?; + } + } + } + } + + db.flush()?; + } + + // Ensure that we haven't changed (or trashed) the database contents. 
+ let new_digest = database_contents_digest(&path, rootfs) + .context("post-normalization contents check")?; + if new_digest != old_digest { + return Err(anyhow!("bdb normalization failed, detected content change")); + } + } + + Ok(()) + } + + fn write_zeros(file: &mut std::fs::File, length: impl Into) -> Result<(), anyhow::Error> { + std::io::copy(&mut std::io::repeat(b'\x00').take(length.into()), file)?; + Ok(()) + } + + // Verify a given BerkeleyDB database/file and then dump the internal contents into a hash function. + // By checksumming the logical contents rather than the physical bytes on disk we can ensure that we + // haven't actually changed anything. + fn database_contents_digest( + path: &PathBuf, + rootfs: &openat::Dir, + ) -> Result<[u8; 32], anyhow::Error> { + // Build up the path we want and make sure it's a &str so Bubblewrap can use it. + let path = PROC_SELF_CWD.join(path); + let path = path + .as_os_str() + .to_str() + .ok_or_else(|| anyhow!("bad path for bdb file"))?; + + // Run db_verify over the file, this tells us whether the actual BerkeleyDB code thinks it's + // valid. db_verify will exit with a non-0 status if there are problems. + let mut verify = + Bubblewrap::new_with_mutability(rootfs, crate::ffi::BubblewrapMutability::Immutable)?; + verify.append_child_argv(&["db_verify", "-q", path]); + let cancellable = gio::Cancellable::new(); + verify.run_captured(Some(&cancellable))?; + + // Run db_dump, which will dump the contents of the database file in a transportable format, + // and calculate the SHA256 digest of said contents. Since the contents are independent of whatever + // random uninitialized data may lurk in the file itself it acts as a decent check of whether we've + // inadvertently changed anything we shouldn't have. 
+ let mut dump = + Bubblewrap::new_with_mutability(rootfs, crate::ffi::BubblewrapMutability::Immutable)?; + dump.append_child_argv(&["db_dump", path]); + let cancellable = gio::Cancellable::new(); + let digest = sha256(&dump.run_captured(Some(&cancellable))?); + + Ok(digest) + } +} + #[cfg(test)] mod tests { use super::*;