diff --git a/Makefile b/Makefile index d8eb30b863..de64358948 100644 --- a/Makefile +++ b/Makefile @@ -29,12 +29,23 @@ ifeq "$(DEV_BUILD)" "1" DELTA_TARGET=out/delta-dev.tar.gz endif +ifeq "$(SNP_BUILD)" "1" +DELTA_TARGET=out/delta-snp.tar.gz +endif + # The link aliases for gcstools GCS_TOOLS=\ generichook \ install-drivers -.PHONY: all always rootfs test +# Common path prefix. +PATH_PREFIX:= +# These have PATH_PREFIX prepended to obtain the full path in recipies e.g. $(PATH_PREFIX)/$(VMGS_TOOL) +VMGS_TOOL:= +IGVM_TOOL:= +KERNEL_PATH:= + +.PHONY: all always rootfs test snp simple .DEFAULT_GOAL := all @@ -49,9 +60,58 @@ test: rootfs: out/rootfs.vhd -out/rootfs.vhd: out/rootfs.tar.gz bin/cmd/tar2ext4 +snp: out/kernelinitrd.vmgs out/rootfs.hash.vhd out/rootfs.vhd out/v2056.vmgs + +simple: out/simple.vmgs snp + +%.vmgs: %.bin + rm -f $@ + # du -BM returns the size of the bin file in M, eg 7M. The sed command replaces the M with *1024*1024 and then bc does the math to convert to bytes + $(PATH_PREFIX)/$(VMGS_TOOL) create --filepath $@ --filesize `du -BM $< | sed "s/M.*/*1024*1024/" | bc` + $(PATH_PREFIX)/$(VMGS_TOOL) write --filepath $@ --datapath $< -i=8 + +# Simplest debug UVM used to test changes to the linux kernel. No dmverity protection. Boots an initramdisk rather than directly booting a vhd disk. +out/simple.bin: out/initrd.img $(PATH_PREFIX)/$(KERNEL_PATH) boot/startup_simple.sh + rm -f $@ + python3 $(PATH_PREFIX)/$(IGVM_TOOL) -o $@ -kernel $(PATH_PREFIX)/$(KERNEL_PATH) -append "8250_core.nr_uarts=0 panic=-1 debug loglevel=7 rdinit=/startup_simple.sh" -rdinit out/initrd.img -vtl 0 + +ROOTFS_DEVICE:=/dev/sda +VERITY_DEVICE:=/dev/sdb +# Debug build for use with uvmtester. UVM with dm-verity protected vhd disk mounted directly via the kernel command line. Ignores corruption in dm-verity protected disk. (Use dmesg to see if dm-verity is ignoring data corruption.) 
+out/v2056.bin: out/rootfs.vhd out/rootfs.hash.vhd $(PATH_PREFIX)/$(KERNEL_PATH) out/rootfs.hash.datasectors out/rootfs.hash.datablocksize out/rootfs.hash.hashblocksize out/rootfs.hash.datablocks out/rootfs.hash.rootdigest out/rootfs.hash.salt boot/startup_v2056.sh + rm -f $@ + python3 $(PATH_PREFIX)/$(IGVM_TOOL) -o $@ -kernel $(PATH_PREFIX)/$(KERNEL_PATH) -append "8250_core.nr_uarts=0 panic=-1 debug loglevel=7 root=/dev/dm-0 dm-mod.create=\"dmverity,,,ro,0 $(shell cat out/rootfs.hash.datasectors) verity 1 $(ROOTFS_DEVICE) $(VERITY_DEVICE) $(shell cat out/rootfs.hash.datablocksize) $(shell cat out/rootfs.hash.hashblocksize) $(shell cat out/rootfs.hash.datablocks) 0 sha256 $(shell cat out/rootfs.hash.rootdigest) $(shell cat out/rootfs.hash.salt) 1 ignore_corruption\" init=/startup_v2056.sh" -vtl 0 + +# Full UVM with dm-verity protected vhd disk mounted directly via the kernel command line. +out/kernelinitrd.bin: out/rootfs.vhd out/rootfs.hash.vhd out/rootfs.hash.datasectors out/rootfs.hash.datablocksize out/rootfs.hash.hashblocksize out/rootfs.hash.datablocks out/rootfs.hash.rootdigest out/rootfs.hash.salt $(PATH_PREFIX)/$(KERNEL_PATH) boot/startup.sh + rm -f $@ + python3 $(PATH_PREFIX)/$(IGVM_TOOL) -o $@ -kernel $(PATH_PREFIX)/$(KERNEL_PATH) -append "8250_core.nr_uarts=0 panic=-1 debug loglevel=7 root=/dev/dm-0 dm-mod.create=\"dmverity,,,ro,0 $(shell cat out/rootfs.hash.datasectors) verity 1 $(ROOTFS_DEVICE) $(VERITY_DEVICE) $(shell cat out/rootfs.hash.datablocksize) $(shell cat out/rootfs.hash.hashblocksize) $(shell cat out/rootfs.hash.datablocks) 0 sha256 $(shell cat out/rootfs.hash.rootdigest) $(shell cat out/rootfs.hash.salt)\" init=/startup.sh" -vtl 0 + +# Rule to make a vhd from a file. This is used to create the rootfs.hash.vhd from rootfs.hash. +%.vhd: % bin/cmd/tar2ext4 + ./bin/cmd/tar2ext4 -only-vhd -i $< -o $@ + +# Rule to make a vhd from an ext4 file. This is used to create the rootfs.vhd from rootfs.ext4. 
+%.vhd: %.ext4 bin/cmd/tar2ext4 + ./bin/cmd/tar2ext4 -only-vhd -i $< -o $@ + +%.hash %.hash.info %.hash.datablocks %.hash.rootdigest %.hash.datablocksize %.hash.datasectors %.hash.hashblocksize: %.ext4 %.hash.salt + veritysetup format --no-superblock --salt $$(cat $*.hash.salt) $< $*.hash > $*.hash.info + # Extract the values dm-verity needs on the kernel command line from the veritysetup report + # NOTE(review): the Salt line below rewrites the %.hash.salt prerequisite, leaving it newer than %.hash; confirm this is intended + cat $*.hash.info | awk '/^Root hash:/{ print $$3 }' > $*.hash.rootdigest + cat $*.hash.info | awk '/^Salt:/{ print $$2 }' > $*.hash.salt + cat $*.hash.info | awk '/^Data block size:/{ print $$4 }' > $*.hash.datablocksize + cat $*.hash.info | awk '/^Hash block size:/{ print $$4 }' > $*.hash.hashblocksize + cat $*.hash.info | awk '/^Data blocks:/{ print $$3 }' > $*.hash.datablocks + echo $$(( $$(cat $*.hash.datablocks) * $$(cat $*.hash.datablocksize) / 512 )) > $*.hash.datasectors + +out/rootfs.hash.salt: + hexdump -vn32 -e'8/4 "%08X" 1 "\n"' /dev/random > $@ + +out/rootfs.ext4: out/rootfs.tar.gz bin/cmd/tar2ext4 gzip -f -d ./out/rootfs.tar.gz - bin/cmd/tar2ext4 -vhd -i ./out/rootfs.tar -o $@ + ./bin/cmd/tar2ext4 -i ./out/rootfs.tar -o $@ out/rootfs.tar.gz: out/initrd.img rm -rf rootfs-conv @@ -74,6 +134,20 @@ out/delta-dev.tar.gz: out/delta.tar.gz bin/internal/tools/snp-report tar -zcf $@ -C rootfs-dev . rm -rf rootfs-dev +out/delta-snp.tar.gz: out/delta.tar.gz bin/internal/tools/snp-report boot/startup_v2056.sh boot/startup_simple.sh boot/startup.sh + rm -rf rootfs-snp + mkdir rootfs-snp + tar -xzf out/delta.tar.gz -C rootfs-snp + cp boot/startup_v2056.sh rootfs-snp/startup_v2056.sh + cp boot/startup_simple.sh rootfs-snp/startup_simple.sh + cp boot/startup.sh rootfs-snp/startup.sh + cp bin/internal/tools/snp-report rootfs-snp/bin/ + chmod a+x rootfs-snp/startup_v2056.sh + chmod a+x rootfs-snp/startup_simple.sh + chmod a+x rootfs-snp/startup.sh + tar -zcf $@ -C rootfs-snp . 
+ rm -rf rootfs-snp + out/delta.tar.gz: bin/init bin/vsockexec bin/cmd/gcs bin/cmd/gcstools bin/cmd/hooks/wait-paths Makefile @mkdir -p out rm -rf rootfs @@ -94,7 +168,10 @@ out/delta.tar.gz: bin/init bin/vsockexec bin/cmd/gcs bin/cmd/gcstools bin/cmd/ho tar -zcf $@ -C rootfs . rm -rf rootfs -bin/cmd/gcs bin/cmd/gcstools bin/cmd/hooks/wait-paths bin/cmd/tar2ext4 bin/internal/tools/snp-report: +out/containerd-shim-runhcs-v1.exe: + GOOS=windows $(GO_BUILD) -o $@ $(SRCROOT)/cmd/containerd-shim-runhcs-v1 + +bin/cmd/gcs bin/cmd/gcstools bin/cmd/hooks/wait-paths bin/cmd/tar2ext4 bin/internal/tools/snp-report bin/cmd/dmverity-vhd: @mkdir -p $(dir $@) GOOS=linux $(GO_BUILD) -o $@ $(SRCROOT)/$(@:bin/%=%) @@ -108,4 +185,4 @@ bin/init: init/init.o vsockexec/vsock.o %.o: %.c @mkdir -p $(dir $@) - $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< \ No newline at end of file diff --git a/boot/bootstrap.md b/boot/bootstrap.md new file mode 100644 index 0000000000..975a787466 --- /dev/null +++ b/boot/bootstrap.md @@ -0,0 +1,24 @@ +# UVM Boot Info + +For understanding the UVM's boot sequence it's useful to think of the UVM as consisting of: +- Linux kernel +- Kernel command line + - The command line is a set of parameters the kernel understands which correspond to actions it will perform during boot. +- Root filesystem (rootfs) disk + - This contains all the files that exist when first starting the VM. +- Startup script + - Stored in the rootfs disk. This script does the last bits of setup required to get the VM ready for use. +- Hash disk (SNP Mode only) + - Contains DM-Verity hash data (read more about DM-Verity and SNP mode below). + + +## The SNP Mode UVM boot sequence. +- The vmgs (kernel + command line) file is loaded into memory. +- The instructions from the kernel command line are performed; the kernel: + - Checks the hash disk's hash data (a Merkle tree) is consistent. 
+ - Checks the hash disk's root hash matches the root hash in the kernel command line. The boot fails if not because the integrity of the UVM cannot be confirmed. + - Makes the rootfs disk available as a dm-verity device. + - Mounts the dm-verity rootfs device. + - Sets the newly mounted disk as the root filesystem + - Finds and runs the startup script (which is specified in the kernel command line) from the rootfs to initialise the system. + - Anytime that data is read from the dm-verity rootfs, that data's integrity is checked on the fly by comparing the data's hash with the hash data on the hash disk. diff --git a/boot/startup.sh b/boot/startup.sh new file mode 100755 index 0000000000..576c42c95e --- /dev/null +++ b/boot/startup.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +export PATH="/usr/bin:/usr/local/bin:/bin:/root/bin:/sbin:/usr/sbin:/usr/local/sbin" +export HOME="/root" + +/init -e 1 /bin/vsockexec -o 109 -e 109 /bin/gcs -v4 -log-format json -loglevel debug \ No newline at end of file diff --git a/boot/startup_simple.sh b/boot/startup_simple.sh new file mode 100755 index 0000000000..3824abd7b7 --- /dev/null +++ b/boot/startup_simple.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +export PATH="/usr/bin:/usr/local/bin:/bin:/root/bin:/sbin:/usr/sbin:/usr/local/sbin" +export HOME="/root" + +/bin/vsockexec -o 2056 -e 2056 echo Running startup_simple.sh +/bin/vsockexec -o 2056 -e 2056 date + +/bin/vsockexec -o 2056 -e 2056 echo /init -e 1 /bin/vsockexec -o 2056 -e 109 /bin/gcs -v4 -log-format text -loglevel debug -logfile /tmp/gcs.log +/init -e 1 /bin/vsockexec -o 2056 -e 109 /bin/gcs -v4 -log-format text -loglevel debug -logfile /tmp/gcs.log + +/bin/vsockexec -o 2056 -e 2056 echo dmesg +/bin/vsockexec -o 2056 -e 2056 dmesg + +/bin/vsockexec -o 2056 -e 2056 echo sleeping 2 +/bin/vsockexec -o 2056 -e 2056 sleep 2 + +/bin/vsockexec -o 2056 -e 2056 ls -Rl /dev/se* + diff --git a/boot/startup_v2056.sh b/boot/startup_v2056.sh new file mode 100755 index 0000000000..db42595a9e --- /dev/null 
+++ b/boot/startup_v2056.sh @@ -0,0 +1,34 @@ +#!/bin/sh + +export PATH="/usr/bin:/usr/local/bin:/bin:/root/bin:/sbin:/usr/sbin:/usr/local/sbin" +export HOME="/root" + +/bin/vsockexec -o 2056 -e 2056 echo Running startup_v2056.sh +/bin/vsockexec -o 2056 -e 2056 date + +/bin/vsockexec -o 2056 -e 2056 echo /init -e 1 /bin/vsockexec -o 2056 -e 109 /bin/gcs -v4 -log-format text -loglevel debug -logfile /tmp/gcs.log +/init -e 1 /bin/vsockexec -o 2056 -e 109 /bin/gcs -v4 -log-format text -loglevel debug -logfile /tmp/gcs.log + +/bin/vsockexec -o 2056 -e 2056 echo ls -l /dev/dm* +/bin/vsockexec -o 2056 -e 2056 ls -l /dev/dm* +/bin/vsockexec -o 2056 -e 2056 echo ls -l /dev/mapper +/bin/vsockexec -o 2056 -e 2056 ls -l /dev/mapper +/bin/vsockexec -o 2056 -e 2056 echo ls -l /dev/mapper +/bin/vsockexec -o 2056 -e 2056 ls -l /dev/mapper + +#/bin/vsockexec -o 2056 -e 2056 /bin/snp-report + +# need init to have run before top shows much +/bin/vsockexec -o 2056 -e 2056 top -n 1 + +/bin/vsockexec -o 2056 -e 2056 echo tmp +/bin/vsockexec -o 2056 -e 2056 ls -la /tmp + +/bin/vsockexec -o 2056 -e 2056 /bin/dmesg + +sleep 1 +/bin/vsockexec -o 2056 -e 2056 echo Thats all folks... 
+ + + + diff --git a/cmd/tar2ext4/tar2ext4.go b/cmd/tar2ext4/tar2ext4.go index 9533a9b1ce..b0dc3a95d7 100644 --- a/cmd/tar2ext4/tar2ext4.go +++ b/cmd/tar2ext4/tar2ext4.go @@ -15,6 +15,7 @@ var ( overlay = flag.Bool("overlay", false, "produce overlayfs-compatible layer image") convertSlash = flag.Bool("convert-slash", false, "convert backslashes ('\\') in path names to slashes ('/')") vhd = flag.Bool("vhd", false, "add a VHD footer to the end of the image") + onlyVhd = flag.Bool("only-vhd", false, "adds a VHD footer to the end of the file but does not convert to ext4; this implies '-vhd' and ignores all other options") inlineData = flag.Bool("inline", false, "write small file data into the inode; not compatible with DAX") ) @@ -48,6 +49,9 @@ func main() { if *vhd { opts = append(opts, tar2ext4.AppendVhdFooter) } + if *onlyVhd { + opts = append(opts, tar2ext4.OnlyAppendVhdFooter) + } if *inlineData { opts = append(opts, tar2ext4.InlineData) } diff --git a/ext4/tar2ext4/tar2ext4.go b/ext4/tar2ext4/tar2ext4.go index a6a3fe282c..5af6bc21bf 100644 --- a/ext4/tar2ext4/tar2ext4.go +++ b/ext4/tar2ext4/tar2ext4.go @@ -18,11 +18,12 @@ import ( ) type params struct { - convertWhiteout bool - convertBackslash bool - appendVhdFooter bool - appendDMVerity bool - ext4opts []compactext4.Option + convertWhiteout bool + convertBackslash bool + appendVhdFooter bool + onlyAppendVhdFooter bool + appendDMVerity bool + ext4opts []compactext4.Option } // Option is the type for optional parameters to Convert. @@ -46,6 +47,12 @@ func AppendVhdFooter(p *params) { p.appendVhdFooter = true } +// OnlyAppendVhdFooter instructs the converter not to convert but still to add a fixed VHD footer to the +// file. 
+func OnlyAppendVhdFooter(p *params) { + p.onlyAppendVhdFooter = true +} + // AppendDMVerity instructs the converter to add a dmverity Merkle tree for // the ext4 filesystem after the filesystem and before the optional VHD footer func AppendDMVerity(p *params) { @@ -201,6 +208,14 @@ func Convert(r io.Reader, w io.ReadWriteSeeker, options ...Option) error { opt(&p) } + if p.onlyAppendVhdFooter { + _, err := io.Copy(w, r) + if err != nil { + return err + } + return ConvertToVhd(w) + } + if err := ConvertTarToExt4(r, w, options...); err != nil { return err } diff --git a/init/init.c b/init/init.c index 23a7b97dc1..da9d5bf85e 100644 --- a/init/init.c +++ b/init/init.c @@ -18,6 +18,38 @@ #include #include "../vsockexec/vsock.h" + +#ifdef DEBUG +#ifdef USE_TCP +static const int tcpmode = 1; +#else +static const int tcpmode; +#endif +// vsockexec opens vsock connections for the specified stdio descriptors and +// then execs the specified process. + + +static int opentcp(unsigned short port) +{ + int s = socket(AF_INET, SOCK_STREAM, 0); + if (s < 0) + { + return -1; + } + + struct sockaddr_in addr = {0}; + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) + { + return -1; + } + + return s; +} +#endif + // musl-gcc doesn't use headers in /usr/include, so it can't find // linux/random.h which is where RNDADDENTROPY is defined. 
We only need this // single definition from linux/random.h, so we just duplicate it here as a @@ -113,6 +145,9 @@ void warn2(const char *msg1, const char *msg2) { } _Noreturn void dien() { + #ifdef DEBUG + printf("dien errno = %d", errno); + #endif exit(errno); } @@ -144,6 +179,9 @@ void init_rlimit() { void init_dev() { if (mount("dev", "/dev", "devtmpfs", MS_NOSUID | MS_NOEXEC, NULL) < 0) { + #ifdef DEBUG + printf("mount - errno %d\n", errno); + #endif warn2("mount", "/dev"); // /dev will be already mounted if devtmpfs.mount = 1 on the kernel // command line or CONFIG_DEVTMPFS_MOUNT is set. Do not consider this @@ -159,6 +197,9 @@ void init_fs(const struct InitOp *ops, size_t count) { switch (ops[i].op) { case OpMount: { const struct Mount *m = &ops[i].mount; + #ifdef DEBUG + printf("OpMount src %s target %s type %s flags %lu data %p\n", m->source, m->target, m->type, m->flags, m->data); + #endif if (mount(m->source, m->target, m->type, m->flags, m->data) < 0) { die2("mount", m->target); } @@ -166,6 +207,9 @@ void init_fs(const struct InitOp *ops, size_t count) { } case OpMkdir: { const struct Mkdir *m = &ops[i].mkdir; + #ifdef DEBUG + printf("OpMkdir path %s mode %d\n", m->path, m->mode); + #endif if (mkdir(m->path, m->mode) < 0) { warn2("mkdir", m->path); if (errno != EEXIST) { @@ -176,6 +220,9 @@ void init_fs(const struct InitOp *ops, size_t count) { } case OpMknod: { const struct Mknod *n = &ops[i].mknod; + #ifdef DEBUG + printf("OpMknod path %s mode %d major %d minor %d\n", n->path, n->mode, n->major, n->minor); + #endif if (mknod(n->path, n->mode, makedev(n->major, n->minor)) < 0) { warn2("mknod", n->path); if (errno != EEXIST) { @@ -186,6 +233,9 @@ void init_fs(const struct InitOp *ops, size_t count) { } case OpSymlink: { const struct Symlink *sl = &ops[i].symlink; + #ifdef DEBUG + printf("OpSymlink targeg %s link %s\n", sl->target, sl->linkpath); + #endif if (symlink(sl->target, sl->linkpath) < 0) { warn2("symlink", sl->linkpath); if (errno != EEXIST) { 
@@ -353,7 +403,60 @@ int reap_until(pid_t until_pid) { } } +#ifdef DEBUG +int debug_main(int argc, char **argv) { + unsigned int ports[3] = {2056, 2056, 2056}; + int sockets[3] = {-1, -1, -1}; + + for (int i = 0; i < 3; i++) + { + if (ports[i] != 0) + { + int j; + for (j = 0; j < i; j++) + { + if (ports[i] == ports[j]) + { + int s = dup(sockets[j]); + if (s < 0) + { + perror("dup"); + return 1; + } + sockets[i] = s; + break; + } + } + + if (j == i) + { + int s = tcpmode ? opentcp(ports[i]) : openvsock(VMADDR_CID_HOST, ports[i]); + if (s < 0) + { + fprintf(stderr, "connect: port %u: %s", ports[i], strerror(errno)); + return 1; + } + sockets[i] = s; + } + } + } + + for (int i = 0; i < 3; i++) + { + if (sockets[i] >= 0) + { + dup2(sockets[i], i); + close(sockets[i]); + } + } +} +#endif + int main(int argc, char **argv) { + #ifdef DEBUG + debug_main(argc, argv); + printf("Running init\n"); + #endif char *debug_shell = NULL; int entropy_port = 0; if (argc <= 1) { @@ -370,6 +473,9 @@ int main(int argc, char **argv) { case 'e': entropy_port = atoi(optarg); + #ifdef DEBUG + printf("entropy port %d\n", entropy_port); + #endif if (entropy_port == 0) { fputs("invalid entropy port\n", stderr); exit(1); @@ -388,13 +494,39 @@ int main(int argc, char **argv) { // Block all signals in init. SIGCHLD will still cause wait() to return. 
sigset_t set; + #ifdef DEBUG + printf("sigfillset(&set)\n"); + #endif sigfillset(&set); + + #ifdef DEBUG + printf("sigfillset\n"); + #endif sigprocmask(SIG_BLOCK, &set, 0); + #ifdef DEBUG + printf("init_rlimit\n"); + #endif init_rlimit(); + + #ifdef DEBUG + printf("init_dev\n"); + #endif init_dev(); + + #ifdef DEBUG + printf("init_fs\n"); + #endif init_fs(ops, sizeof(ops) / sizeof(ops[0])); + + #ifdef DEBUG + printf("init_cgroups\n"); + #endif init_cgroups(); + + #ifdef DEBUG + printf("init_network\n"); + #endif init_network("lo", AF_INET); init_network("lo", AF_INET6); if (entropy_port != 0) { diff --git a/internal/annotations/annotations.go b/internal/annotations/annotations.go index 869f0e565f..19c713f3e9 100644 --- a/internal/annotations/annotations.go +++ b/internal/annotations/annotations.go @@ -18,4 +18,7 @@ const ( // // [HCS RegistryValue]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#registryvalue AdditionalRegistryValues = "io.microsoft.virtualmachine.wcow.additional-reg-keys" + + // ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use. 
+ ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports" ) diff --git a/internal/oci/annotations.go b/internal/oci/annotations.go index 48896e2780..ab52938e69 100644 --- a/internal/oci/annotations.go +++ b/internal/oci/annotations.go @@ -262,6 +262,25 @@ func ParseAnnotationsUint64(ctx context.Context, a map[string]string, key string return def } +// ParseAnnotationCommaSeparatedUint32 searches `annotations` for `annotation` and parses its value as a comma +// separated list of uint32s. Returns `def` if the annotation is missing, empty, or has an element that does not parse. +func ParseAnnotationCommaSeparatedUint32(ctx context.Context, annotations map[string]string, annotation string, def []uint32) []uint32 { + cs, ok := annotations[annotation] + if !ok || cs == "" { + return def + } + sints := strings.Split(cs, ",") + ints := make([]uint32, len(sints)) + for i := range sints { + x, err := strconv.ParseUint(strings.TrimSpace(sints[i]), 10, 32) + if err != nil { + return def + } + ints[i] = uint32(x) + } + return ints +} + // ParseAnnotationsString searches `a` for `key`. If `key` is not found returns `def`. func ParseAnnotationsString(a map[string]string, key string, def string) string { if v, ok := a[key]; ok { diff --git a/internal/oci/uvm.go b/internal/oci/uvm.go index 134211f4f7..caf698a645 100644 --- a/internal/oci/uvm.go +++ b/internal/oci/uvm.go @@ -8,6 +8,7 @@ import ( "strconv" runhcsopts "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" + iannotations "github.com/Microsoft/hcsshim/internal/annotations" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/uvm" "github.com/Microsoft/hcsshim/pkg/annotations" @@ -205,9 +206,18 @@ func handleSecurityPolicy(ctx context.Context, a map[string]string, lopts *uvm.O // set the default GuestState filename. lopts.GuestStateFile = uvm.GuestStateFile lopts.KernelBootOptions = "" - lopts.PreferredRootFSType = uvm.PreferredRootFSTypeNA lopts.AllowOvercommit = false lopts.SecurityPolicyEnabled = true + + // There are two possible ways to boot SNP mode. 
Either kernelinitrd.vmgs which consists of kernel plus initrd.cpio.gz + // Or a kernel vmgs file (without an initrd) plus a separate vhd file which is dmverity protected via a hash vhd file. + // We only currently support using the dmverity scheme. Note that the dmverity file name may be explicitly specified via + // an annotation this is deliberately not the same annotation as the non-SNP rootfs vhd file. + lopts.PreferredRootFSType = uvm.PreferredRootFSTypeNA + lopts.RootFSFile = "" + lopts.DmVerityRootFsVhd = uvm.DefaultDmVerityRootfsVhd + lopts.DmVerityHashVhd = uvm.DefaultDmVerityHashVhd + lopts.DmVerityMode = true } if len(lopts.SecurityPolicy) > 0 { @@ -260,6 +270,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( lopts.VPMemSizeBytes = ParseAnnotationsUint64(ctx, s.Annotations, annotations.VPMemSize, lopts.VPMemSizeBytes) lopts.VPMemNoMultiMapping = ParseAnnotationsBool(ctx, s.Annotations, annotations.VPMemNoMultiMapping, lopts.VPMemNoMultiMapping) lopts.VPCIEnabled = ParseAnnotationsBool(ctx, s.Annotations, annotations.VPCIEnabled, lopts.VPCIEnabled) + lopts.ExtraVSockPorts = ParseAnnotationCommaSeparatedUint32(ctx, s.Annotations, iannotations.ExtraVSockPorts, lopts.ExtraVSockPorts) handleAnnotationBootFilesPath(ctx, s.Annotations, lopts) lopts.EnableScratchEncryption = ParseAnnotationsBool(ctx, s.Annotations, annotations.EncryptedScratchDisk, lopts.EnableScratchEncryption) lopts.SecurityPolicy = ParseAnnotationsString(s.Annotations, annotations.SecurityPolicy, lopts.SecurityPolicy) @@ -278,8 +289,11 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( // Eg VMPem device count, overridden kernel option cannot be respected. 
handleSecurityPolicy(ctx, s.Annotations, lopts) - // override the default GuestState filename if specified + // override the default GuestState and DmVerityRootFs/HashVhd filenames if specified lopts.GuestStateFile = ParseAnnotationsString(s.Annotations, annotations.GuestStateFile, lopts.GuestStateFile) + lopts.DmVerityRootFsVhd = ParseAnnotationsString(s.Annotations, annotations.DmVerityRootFsVhd, lopts.DmVerityRootFsVhd) + lopts.DmVerityHashVhd = ParseAnnotationsString(s.Annotations, annotations.DmVerityHashVhd, lopts.DmVerityHashVhd) + lopts.DmVerityMode = ParseAnnotationsBool(ctx, s.Annotations, annotations.DmVerityMode, lopts.DmVerityMode) // Set HclEnabled if specified. Else default to a null pointer, which is omitted from the resulting JSON. lopts.HclEnabled = ParseAnnotationsNullableBool(ctx, s.Annotations, annotations.HclEnabled) return lopts, nil diff --git a/internal/uvm/create_lcow.go b/internal/uvm/create_lcow.go index 4a06684cc3..6448d90934 100644 --- a/internal/uvm/create_lcow.go +++ b/internal/uvm/create_lcow.go @@ -68,13 +68,19 @@ const ( InitrdFile = "initrd.img" // VhdFile is the default file name for a rootfs.vhd used to boot LCOW. VhdFile = "rootfs.vhd" + // DefaultDmVerityRootfsVhd and DefaultDmVerityHashVhd are the default file names for the dm-verity + // protected rootfs VHD and the VHD containing its hash tree. The rootfs VHD is mounted by the + // GuestStateFile during boot and used as the root file system when booting in the SNP case. + DefaultDmVerityRootfsVhd = "rootfs.vhd" + DefaultDmVerityHashVhd = "rootfs.hash.vhd" // KernelFile is the default file name for a kernel used to boot LCOW. KernelFile = "kernel" // UncompressedKernelFile is the default file name for an uncompressed // kernel used to boot LCOW with KernelDirect. UncompressedKernelFile = "vmlinux" // GuestStateFile is the default file name for a vmgs (VM Guest State) file - // which combines kernel and initrd and is used to boot from in the SNP case. + // which combines kernel and initrd and is used to mount DefaultDmVerityRootfsVhd + // when booting in the SNP case. 
GuestStateFile = "kernelinitrd.vmgs" // UVMReferenceInfoFile is the default file name for a COSE_Sign1 // reference UVM info, which can be made available to workload containers @@ -90,6 +96,9 @@ type ConfidentialOptions struct { SecurityPolicyEnforcer string // Set which security policy enforcer to use (open door, standard or rego). This allows for better fallback mechanic. UVMReferenceInfoFile string // Filename under `BootFilesPath` for (potentially signed) UVM image reference information. BundleDirectory string // pod bundle directory + DmVerityRootFsVhd string // The VHD file (bound to the vmgs file via embedded dmverity hash data file) to load. + DmVerityHashVhd string // The VHD file containing the hash tree + DmVerityMode bool // override to be able to turn off dmverity for debugging } // OptionsLCOW are the set of options passed to CreateLCOW() to create a utility vm. @@ -121,6 +130,7 @@ type OptionsLCOW struct { EnableScratchEncryption bool // Whether the scratch should be encrypted DisableTimeSyncService bool // Disables the time synchronization service HclEnabled *bool // Whether to enable the host compatibility layer + ExtraVSockPorts []uint32 // Extra vsock ports to allow } // defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW @@ -334,16 +344,27 @@ Example JSON document produced once the hcsschema.ComputeSytem returned by makeL // Make a hcsschema.ComputeSytem with the parts that target booting from a VMGS file func makeLCOWVMGSDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcsschema.ComputeSystem, err error) { - // Kernel and initrd are combined into a single vmgs file. + // Raise an error if instructed to use a particular sort of rootfs. + if opts.PreferredRootFSType != PreferredRootFSTypeNA { + return nil, errors.New("specifying a PreferredRootFSType is incompatible with SNP mode") + } + + // The kernel and minimal initrd are combined into a single vmgs file. 
vmgsTemplatePath := filepath.Join(opts.BootFilesPath, opts.GuestStateFile) if _, err := os.Stat(vmgsTemplatePath); os.IsNotExist(err) { return nil, fmt.Errorf("the GuestState vmgs file '%s' was not found", vmgsTemplatePath) } - // The rootfs must be provided as an initrd within the VMGS file. - // Raise an error if instructed to use a particular sort of rootfs. - if opts.PreferredRootFSType != PreferredRootFSTypeNA { - return nil, fmt.Errorf("cannot override rootfs when using VMGS file") + // The root file system comes from the dmverity vhd file which is mounted by the initrd in the vmgs file. + dmVerityRootFsFullPath := filepath.Join(opts.BootFilesPath, opts.DmVerityRootFsVhd) + if _, err := os.Stat(dmVerityRootFsFullPath); os.IsNotExist(err) { + return nil, fmt.Errorf("the DM Verity VHD file '%s' was not found", dmVerityRootFsFullPath) + } + + // The root file system comes from the dmverity vhd file which is mounted by the initrd in the vmgs file. + dmVerityHashFullPath := filepath.Join(opts.BootFilesPath, opts.DmVerityHashVhd) + if _, err := os.Stat(dmVerityHashFullPath); os.IsNotExist(err) { + return nil, fmt.Errorf("the DM Verity Hash file '%s' was not found", dmVerityHashFullPath) } var processor *hcsschema.Processor2 @@ -420,7 +441,8 @@ func makeLCOWVMGSDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ // entropyVsockPort - 1 is the entropy port, // linuxLogVsockPort - 109 used by vsockexec to log stdout/stderr logging, // 0x40000000 + 1 (LinuxGcsVsockPort + 1) is the bridge (see guestconnectiuon.go) - hvSockets := [...]uint32{entropyVsockPort, linuxLogVsockPort, gcs.LinuxGcsVsockPort, gcs.LinuxGcsVsockPort + 1} + hvSockets := []uint32{entropyVsockPort, linuxLogVsockPort, gcs.LinuxGcsVsockPort, gcs.LinuxGcsVsockPort + 1} + hvSockets = append(hvSockets, opts.ExtraVSockPorts...) 
for _, whichSocket := range hvSockets { key := fmt.Sprintf("%08x-facb-11e6-bd58-64006a7986d3", whichSocket) // format of a linux hvsock GUID is port#-facb-11e6-bd58-64006a7986d3 doc.VirtualMachine.Devices.HvSocket.HvSocketConfig.ServiceTable[key] = hcsschema.HvSocketServiceConfig{ @@ -439,7 +461,29 @@ func makeLCOWVMGSDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ } if uvm.scsiControllerCount > 0 { + logrus.Debug("makeLCOWVMGSDoc configuring scsi devices") doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{} + if opts.DmVerityMode { + logrus.Debug("makeLCOWVMGSDoc DmVerityMode true") + doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{ + "RootFileSystemVirtualDisk": { + Attachments: map[string]hcsschema.Attachment{ + "0": { + Type_: "VirtualDisk", + Path: dmVerityRootFsFullPath, + ReadOnly: true, + }, + "1": { + Type_: "VirtualDisk", + Path: dmVerityHashFullPath, + ReadOnly: true, + }, + }, + }, + } + uvm.reservedSCSISlots = append(uvm.reservedSCSISlots, scsi.Slot{Controller: 0, LUN: 0}) + uvm.reservedSCSISlots = append(uvm.reservedSCSISlots, scsi.Slot{Controller: 0, LUN: 1}) + } for i := 0; i < int(uvm.scsiControllerCount); i++ { doc.VirtualMachine.Devices.Scsi[guestrequest.ScsiControllerGuids[i]] = hcsschema.Scsi{ Attachments: make(map[string]hcsschema.Attachment), diff --git a/pkg/amdsevsnp/report.go b/pkg/amdsevsnp/report.go index faa82cf10a..e296a7eeb0 100644 --- a/pkg/amdsevsnp/report.go +++ b/pkg/amdsevsnp/report.go @@ -253,10 +253,7 @@ func fetchRawSNPReport6(reportData []byte) ([]byte, error) { return nil, err } - reportOutSize := unsafe.Sizeof(msgReportOut) - reportOutPtr := unsafe.Pointer(&msgReportOut) - reportOutAsSlice := (*[reportResponseContainerLength6]byte)(reportOutPtr) - copy(reportOutAsSlice[:reportOutSize], reportOutContainer[:reportOutSize]) + msgReportOut = *(*reportResponse)(unsafe.Pointer(&reportOutContainer[0])) return msgReportOut.Report[:], nil } diff --git a/pkg/annotations/annotations.go 
b/pkg/annotations/annotations.go index 3aec333fe4..ac0823cdf3 100644 --- a/pkg/annotations/annotations.go +++ b/pkg/annotations/annotations.go @@ -266,6 +266,14 @@ const ( // GuestStateFile specifies the path of the vmgs file to use if required. Only applies in SNP mode. GuestStateFile = "io.microsoft.virtualmachine.lcow.gueststatefile" + // DmVerityRootFsVhd and DmVerityHashVhd specify the file names (under the boot files path) of the dm-verity protected rootfs VHD and of the VHD containing its hash tree, to use if required. Only applies in SNP mode. + + DmVerityRootFsVhd = "io.microsoft.virtualmachine.lcow.dmverity-rootfs-vhd" + DmVerityHashVhd = "io.microsoft.virtualmachine.lcow.dmverity-hash-vhd" + + // DmVerityMode specifies whether the dm-verity rootfs and hash VHDs are attached in SNP mode; it is an override that allows dm-verity to be turned off for debugging. + DmVerityMode = "io.microsoft.virtualmachine.lcow.dmverity-mode" + // UVMSecurityPolicyEnv specifies if confidential containers' related information // should be written to containers' rootfs. The filenames and location are defined // by securitypolicy.PolicyFilename, securitypolicy.HostAMDCertFilename and