Skip to content

Commit

Permalink
package: canonicalize tar headers for crate packages
Browse files Browse the repository at this point in the history
Currently, when reading a file from disk, we include several pieces of
data from the on-disk file, including the user and group names and IDs,
the device major and minor, the mode, and the timestamp.  This means
that our archives differ between systems, sometimes in unhelpful ways.

In addition, most users probably did not intend to share information
about their user and group settings, operating system and disk type, and
umask.  While these aren't huge privacy leaks, cargo doesn't use them
when extracting archives, so there's no value to including them.

Since using consistent data means that our archives are reproducible and
don't leak user data, both of which are desirable features, let's
canonicalize the header to strip out identifying information.

Stop setting the timestamp for generated files and tell the tar crate to
copy only deterministic metadata from on-disk files.  That omits all of
the data we don't care about and also canonicalizes the mode properly.

Our tests don't check the specifics of certain fields because their
values differ between generated files and files archived from disk.
Those fields are still canonicalized correctly for each kind of file,
however.
  • Loading branch information
bk2204 committed Nov 18, 2020
1 parent e46ca84 commit 449ead0
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 48 deletions.
39 changes: 2 additions & 37 deletions src/cargo/ops/cargo_package.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@ use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use std::rc::Rc;
use std::sync::Arc;
use std::time::SystemTime;

use flate2::read::GzDecoder;
use flate2::{Compression, GzBuilder};
use log::debug;
use tar::{Archive, Builder, EntryType, Header};
use tar::{Archive, Builder, EntryType, Header, HeaderMode};

use crate::core::compiler::{BuildConfig, CompileMode, DefaultExecutor, Executor};
use crate::core::{Feature, Shell, Verbosity, Workspace};
Expand Down Expand Up @@ -472,35 +471,6 @@ fn check_repo_state(
}
}

/// Returns the modification time, in seconds since the Unix epoch, to stamp
/// on archive entries.
///
/// If the `SOURCE_DATE_EPOCH` environment variable is set and parses as a
/// `u64`, that value is returned so callers can pin the timestamp. In every
/// other case (variable unset, not valid Unicode, or not a number) the
/// current system time is used instead.
///
/// # Panics
///
/// Panics if the system clock reports a time earlier than the Unix epoch.
fn timestamp() -> u64 {
    let pinned = std::env::var("SOURCE_DATE_EPOCH")
        .ok()
        .and_then(|raw| raw.parse::<u64>().ok());

    match pinned {
        Some(secs) => secs,
        // No usable override: fall back to the wall clock.
        None => SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_secs(),
    }
}

/// Overwrites the owner, device, and mode fields of `header` with fixed
/// values so the archive does not record details of the packaging user or
/// their system.
///
/// The owner becomes `root`/`root` with uid and gid `0`, and the device
/// major and minor numbers become `0`. The mode collapses to `0o755` when
/// the owner-execute bit (`0o100`) is set and to `0o644` otherwise.
///
/// # Panics
///
/// Panics if any of the fallible header setters, or reading the current
/// mode, returns an error.
fn canonicalize_header(header: &mut Header) {
    // Owner identity: always the same name and numeric ids.
    header.set_username("root").unwrap();
    header.set_groupname("root").unwrap();
    header.set_uid(0);
    header.set_gid(0);

    // Device numbers carry no meaning for packaged files.
    header.set_device_major(0).unwrap();
    header.set_device_minor(0).unwrap();

    // Keep only "executable or not"; drop every other permission bit.
    let owner_can_execute = header.mode().unwrap() & 0o100 != 0;
    header.set_mode(if owner_can_execute { 0o755 } else { 0o644 });
}

fn tar(
ws: &Workspace<'_>,
ar_files: Vec<ArchiveFile>,
Expand All @@ -520,7 +490,6 @@ fn tar(

let base_name = format!("{}-{}", pkg.name(), pkg.version());
let base_path = Path::new(&base_name);
let time = timestamp();
for ar_file in ar_files {
let ArchiveFile {
rel_path,
Expand All @@ -540,9 +509,7 @@ fn tar(
let metadata = file.metadata().chain_err(|| {
format!("could not learn metadata for: `{}`", disk_path.display())
})?;
header.set_metadata(&metadata);
header.set_mtime(time);
canonicalize_header(&mut header);
header.set_metadata_in_mode(&metadata, HeaderMode::Deterministic);
header.set_cksum();
ar.append_data(&mut header, &ar_path, &mut file)
.chain_err(|| {
Expand All @@ -557,9 +524,7 @@ fn tar(
};
header.set_entry_type(EntryType::file());
header.set_mode(0o644);
header.set_mtime(time);
header.set_size(contents.len() as u64);
canonicalize_header(&mut header);
header.set_cksum();
ar.append_data(&mut header, &ar_path, contents.as_bytes())
.chain_err(|| format!("could not archive source file `{}`", rel_str))?;
Expand Down
15 changes: 4 additions & 11 deletions tests/testsuite/package.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1938,10 +1938,7 @@ fn reproducible_output() {
.file("src/main.rs", r#"fn main() { println!("hello"); }"#)
.build();

// Timestamp is arbitrary and is the same used by git format-patch.
p.cargo("package")
.env("SOURCE_DATE_EPOCH", "1000684800")
.run();
p.cargo("package").run();
assert!(p.root().join("target/package/foo-0.0.1.crate").is_file());

let f = File::open(&p.root().join("target/package/foo-0.0.1.crate")).unwrap();
Expand All @@ -1951,12 +1948,8 @@ fn reproducible_output() {
let ent = ent.unwrap();
let header = ent.header();
assert_eq!(header.mode().unwrap(), 0o644);
assert_eq!(header.uid().unwrap(), 0);
assert_eq!(header.gid().unwrap(), 0);
assert_eq!(header.mtime().unwrap(), 1000684800);
assert_eq!(header.username().unwrap().unwrap(), "root");
assert_eq!(header.groupname().unwrap().unwrap(), "root");
assert_eq!(header.device_major().unwrap().unwrap(), 0);
assert_eq!(header.device_minor().unwrap().unwrap(), 0);
assert_eq!(header.mtime().unwrap(), 0);
assert_eq!(header.username().unwrap().unwrap(), "");
assert_eq!(header.groupname().unwrap().unwrap(), "");
}
}

0 comments on commit 449ead0

Please sign in to comment.