From 08092c15a32814e731e744284dafa3954b654662 Mon Sep 17 00:00:00 2001 From: Arnaldo Garcia Rincon Date: Mon, 24 May 2021 23:34:28 +0000 Subject: [PATCH] host-containers: allow mount propagations from privileged containers This commit adds support to propagate mount points created in bootstrap and superpowered containers across mount peer groups. The root filesystem of bootstrap and superpowered containers is set up with the `rshared` configuration to allow mount propagations across peer groups. All mount points attached to the containers are configured as `rprivate` (except for the `mnt` mount). This prevents bootstrap and superpowered containers from remounting directories in the host's root filesystem. The `/.bottlerocket/rootfs/mnt` mount point was added to bootstrap and superpowered containers. This mount point is a bind mount that points to `/mnt` in the host, which itself is a bind mount of `/local/mnt`. This is required to let users create mount points underneath `/mnt`. This mount point is set up with the `rshared` configuration to allow propagations across peer groups. This is the only mount point from which propagations are allowed across peer groups. With this change, bootstrap containers now have access to all the devices in the host. Also, they now have the `CAP_SYS_ADMIN` capability to let users manage ephemeral disks. The logic to build the container specs was refactored to make it clearer which options are set in each container's spec. 
Signed-off-by: Arnaldo Garcia Rincon --- README.md | 10 +- packages/release/mnt.mount | 16 ++ packages/release/prepare-local.service | 2 +- packages/release/release.spec | 6 +- sources/host-ctr/cmd/host-ctr/main.go | 263 ++++++++++++++++--------- 5 files changed, 204 insertions(+), 93 deletions(-) create mode 100644 packages/release/mnt.mount diff --git a/README.md b/README.md index ae6541e5f7c..dbfe27e21bb 100644 --- a/README.md +++ b/README.md @@ -528,7 +528,10 @@ Bootstrap containers are host containers that can be used to "bootstrap" the hos Bootstrap containers are very similar to normal host containers; they come with persistent storage and with optional user data. Unlike normal host containers, bootstrap containers can't be treated as `superpowered` containers. -However, these containers have access to the underlying root filesystem on `/.bottlerocket/rootfs`. +However, bootstrap containers do have additional permissions that normal host containers do not have. +Bootstrap containers have access to the underlying root filesystem on `/.bottlerocket/rootfs` as well as to all the devices in the host, and they are set up with the `CAP_SYS_ADMIN` capability. +This allows bootstrap containers to create files, directories, and mounts that are visible to the host. + Bootstrap containers are set up to run after the systemd `configured.target` unit is active. The containers' systemd unit depends on this target (and not on any of the bootstrap containers' peers) which means that bootstrap containers will not execute in a deterministic order The boot process will "wait" for as long as the bootstrap containers run. @@ -558,6 +561,11 @@ mode = "once" essential = true ``` +##### Mount propagations in bootstrap and superpowered containers +Both bootstrap and superpowered host containers are configured with the `/.bottlerocket/rootfs/mnt` bind mount that points to `/mnt` in the host, which itself is a bind mount of `/local/mnt`. 
+This bind mount is set up with shared propagations, so any new mount point created underneath `/.bottlerocket/rootfs/mnt` in any bootstrap or superpowered host container will propagate across mount namespaces. +You can use this feature to configure ephemeral disks attached to your hosts that you may want to use on your workloads. + #### Platform-specific settings Platform-specific settings are automatically set at boot time by [early-boot-config](sources/api/early-boot-config) based on metadata available on the running platform. diff --git a/packages/release/mnt.mount b/packages/release/mnt.mount new file mode 100644 index 00000000000..bfcc8d61e31 --- /dev/null +++ b/packages/release/mnt.mount @@ -0,0 +1,16 @@ +[Unit] +Description=Mnt Directory (/mnt) +DefaultDependencies=no +Conflicts=umount.target +Before=local-fs.target umount.target +Wants=prepare-local.service +After=prepare-local.service + +[Mount] +What=/local/mnt +Where=/mnt +Type=none +Options=rbind,rshared + +[Install] +WantedBy=local-fs.target diff --git a/packages/release/prepare-local.service b/packages/release/prepare-local.service index 15eae7c2526..36536ee834d 100644 --- a/packages/release/prepare-local.service +++ b/packages/release/prepare-local.service @@ -34,7 +34,7 @@ ExecStart=/usr/bin/mount \ # After the mount is active, we grow the filesystem to fill the resized partition, # and ensure that it has the directories we need for subsequent mounts. ExecStart=/usr/lib/systemd/systemd-growfs ${LOCAL_DIR} -ExecStart=/usr/bin/mkdir -p ${LOCAL_DIR}/var ${LOCAL_DIR}/opt +ExecStart=/usr/bin/mkdir -p ${LOCAL_DIR}/var ${LOCAL_DIR}/opt ${LOCAL_DIR}/mnt # Create the directories we need to set up a read-write overlayfs for the kernel # development sources. 
diff --git a/packages/release/release.spec b/packages/release/release.spec index 5b24ea89216..104d4b0e358 100644 --- a/packages/release/release.spec +++ b/packages/release/release.spec @@ -27,6 +27,7 @@ Source1006: var.mount Source1007: opt.mount Source1008: var-lib-bottlerocket.mount Source1009: etc-cni.mount +Source1010: mnt.mount # CD-ROM mount & associated udev rules Source1015: media-cdrom.mount @@ -107,8 +108,8 @@ EOF install -d %{buildroot}%{_cross_unitdir} install -p -m 0644 \ %{S:1001} %{S:1002} %{S:1003} %{S:1004} %{S:1005} \ - %{S:1006} %{S:1007} %{S:1008} %{S:1009} %{S:1015} \ - %{S:1040} %{S:1041} %{S:1060} %{S:1061} %{S:1062} \ + %{S:1006} %{S:1007} %{S:1008} %{S:1009} %{S:1010} \ + %{S:1015} %{S:1040} %{S:1041} %{S:1060} %{S:1061} %{S:1062} \ %{buildroot}%{_cross_unitdir} LOWERPATH=$(systemd-escape --path %{_cross_sharedstatedir}/kernel-devel/lower) @@ -155,6 +156,7 @@ ln -s %{_cross_unitdir}/preconfigured.target %{buildroot}%{_cross_unitdir}/defau %{_cross_unitdir}/prepare-local.service %{_cross_unitdir}/var.mount %{_cross_unitdir}/opt.mount +%{_cross_unitdir}/mnt.mount %{_cross_unitdir}/etc-cni.mount %{_cross_unitdir}/media-cdrom.mount %{_cross_unitdir}/*-lower.mount diff --git a/sources/host-ctr/cmd/host-ctr/main.go b/sources/host-ctr/cmd/host-ctr/main.go index 6ebf569d0f2..c74dfe968f0 100644 --- a/sources/host-ctr/cmd/host-ctr/main.go +++ b/sources/host-ctr/cmd/host-ctr/main.go @@ -272,38 +272,30 @@ func runCtr(containerdSocket string, namespace string, containerID string, sourc // Set the destination name for the container persistent storage location persistentDir := cType.PersistentDir() - // Set up the container spec. See `withSuperpowered` for conditional options - // set when configured as superpowered. 
- ctrOpts := containerd.WithNewSpec( + specOpts := []oci.SpecOpts{ oci.WithImageConfig(img), oci.WithHostNamespace(runtimespec.NetworkNamespace), oci.WithHostHostsFile, oci.WithHostResolvconf, - // Mount in the API socket for the Bottlerocket API server, and the API - // client used to interact with it - oci.WithMounts([]runtimespec.Mount{ - { - Options: []string{"bind", "rw"}, - Destination: "/run/api.sock", - Source: "/run/api.sock", - }, - // Mount in the apiclient to make API calls to the Bottlerocket API server - { - Options: []string{"bind", "ro"}, - Destination: "/usr/local/bin/apiclient", - Source: "/usr/bin/apiclient", - }}), // Pass proxy environment variables to this container withProxyEnv(), - // Mount in the persistent storage location for this container - withPersistentStorage(containerName, persistentDir), - // Mount the rootfs with an SELinux label that makes it writable + // Add a default set of mounts regardless of the container type + withDefaultMounts(containerName, persistentDir), + // Mount the container's rootfs with an SELinux label that makes it writable withMountLabel("system_u:object_r:secret_t:s0"), - // Include conditional options for superpowered containers. - withSuperpowered(superpowered), - // Mount the rootfs if superpowered or bootstrap - withRootFilesystemMounts(superpowered || cType == bootstrap), - ) + } + + // Select the set of specOpts based on the container type + switch { + case superpowered: + specOpts = append(specOpts, withSuperpowered()) + case cType == bootstrap: + specOpts = append(specOpts, withBootstrap()) + default: + specOpts = append(specOpts, withDefault()) + } + + ctrOpts := containerd.WithNewSpec(specOpts...) // Create the container. container, err = client.NewContainer( @@ -609,29 +601,11 @@ func newContainerdClient(ctx context.Context, containerdSocket string, namespace return client, nil } -// withMountLabel configures the mount with the provided SELinux label. 
-func withMountLabel(label string) oci.SpecOpts { - return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error { - if s.Linux != nil { - s.Linux.MountLabel = label - } - return nil - } -} - -// withSuperpowered add container options granting administrative privileges -// when it's `superpowered`. -func withSuperpowered(superpowered bool) oci.SpecOpts { - if !superpowered { - // Set the `control_t` process label so the host container can - // interact with the API and modify its local state files. - return oci.Compose( - seccomp.WithDefaultProfile(), - oci.WithSelinuxLabel("system_u:system_r:control_t:s0"), - ) - } - +// withSuperpowered adds container options to grant administrative privileges +func withSuperpowered() oci.SpecOpts { return oci.Compose( + withPrivilegedMounts(), + withRootFsShared(), oci.WithHostNamespace(runtimespec.PIDNamespace), oci.WithParentCgroupDevices, oci.WithPrivileged, @@ -641,59 +615,170 @@ func withSuperpowered(superpowered bool) oci.SpecOpts { ) } -// withRootFileSystemMounts adds container options to mount the root filesystem -func withRootFilesystemMounts(mountRootFilesystem bool) oci.SpecOpts { - // if mountRootFilesystem, return a no-op SpecOpts function - if !mountRootFilesystem { - return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error { - return nil - } - } +// withBootstrap adds container options to grant read-write access to the underlying +// root filesystem, as well as to manage the devices attached to the +// host +func withBootstrap() oci.SpecOpts { + return oci.Compose( + withPrivilegedMounts(), + withRootFsShared(), + oci.WithSelinuxLabel("system_u:system_r:control_t:s0"), + // Bootstrap containers don't require all "privileges", we only add the + // `CAP_SYS_ADMIN` capability. `WithDefaultProfile` will create the proper + // seccomp profile based on the container's capabilities. 
+ oci.WithAddedCapabilities([]string{"CAP_SYS_ADMIN"}), + seccomp.WithDefaultProfile(), + oci.WithAllDevicesAllowed, + ) +} +// withDefault adds container options for non-privileged containers +func withDefault() oci.SpecOpts { return oci.Compose( - oci.WithMounts([]runtimespec.Mount{ - { - Options: []string{"rbind", "ro"}, - Destination: "/.bottlerocket/rootfs", - Source: "/", - }, - { - Options: []string{"rbind", "ro"}, - Destination: "/lib/modules", - Source: "/lib/modules", - }, - { - Options: []string{"rbind", "rw"}, - Destination: "/usr/src/kernels", - Source: "/usr/src/kernels", - }, - { - Options: []string{"rbind"}, - Destination: "/sys/kernel/debug", - Source: "/sys/kernel/debug", - }, - }), + oci.WithSelinuxLabel("system_u:system_r:control_t:s0"), + // Non-privileged containers only have access to a subset of the devices + oci.WithDefaultUnixDevices, + // No additional capabilities required for non-privileged containers + seccomp.WithDefaultProfile(), ) } -// withPersistentStorage add persistent storage location that matches the container name -// (legacy location) and a generically named `current` dir. The `current` dir was added for easier -// referencing in Dockerfiles and scripts. If a host container is also named `current` this function -// will only add a single `current` mount to the spec. -func withPersistentStorage(containerID string, persistentDir string) oci.SpecOpts { - var persistentMounts = []runtimespec.Mount{{ - Options: []string{"rbind", "rw"}, - Destination: fmt.Sprintf("/.bottlerocket/%s/%s", persistentDir, containerID), - Source: fmt.Sprintf("/local/%s/%s", persistentDir, containerID), - }} +// withMountLabel configures the mount with the provided SELinux label. 
+func withMountLabel(label string) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error { + if s.Linux != nil { + s.Linux.MountLabel = label + } + return nil + } +} + +// withDefaultMounts adds the mount configurations required in all container types, +// all default mounts are set up with rprivate propagations +func withDefaultMounts(containerID string, persistentDir string) oci.SpecOpts { + var mounts = []runtimespec.Mount{ + // Local persistent storage for the container + { + Options: []string{"rbind", "rw"}, + Destination: fmt.Sprintf("/.bottlerocket/%s/%s", persistentDir, containerID), + Source: fmt.Sprintf("/local/%s/%s", persistentDir, containerID), + }, + // Mount in the API socket for the Bottlerocket API server, and the API + // client used to interact with it + { + Options: []string{"bind", "rw"}, + Destination: "/run/api.sock", + Source: "/run/api.sock", + }, + // Mount in the apiclient to make API calls to the Bottlerocket API server + { + Options: []string{"bind", "ro"}, + Destination: "/usr/local/bin/apiclient", + Source: "/usr/bin/apiclient", + }, + // Cgroup filesystem for this container + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"ro", "nosuid", "noexec", "nodev"}, + }, + } + + // The `current` dir was added for easier referencing in Dockerfiles and scripts. + // If a host container is also named `current`, only add a single `current` mount + // to the spec. 
if containerID != "current" { - persistentMounts = append(persistentMounts, runtimespec.Mount{ + mounts = append(mounts, runtimespec.Mount{ Options: []string{"rbind", "rw"}, Destination: fmt.Sprintf("/.bottlerocket/%s/current", persistentDir), Source: fmt.Sprintf("/local/%s/%s", persistentDir, containerID), }) } - return oci.Compose(oci.WithMounts(persistentMounts)) + + // Use withMounts to make sure all mounts have rprivate propagations + return withMounts(mounts) +} + +// withPrivilegedMounts adds options to grant access to the host root filesystem +func withPrivilegedMounts() oci.SpecOpts { + // Use withMounts to force rprivate when no propagation configurations + // are set + return withMounts([]runtimespec.Mount{ + { + Options: []string{"rbind", "ro"}, + Destination: "/.bottlerocket/rootfs", + Source: "/", + Type: "bind", + }, + { + Options: []string{"rbind", "ro"}, + Destination: "/lib/modules", + Source: "/lib/modules", + Type: "bind", + }, + { + Options: []string{"rbind", "rw"}, + Destination: "/usr/src/kernels", + Source: "/usr/src/kernels", + Type: "bind", + }, + { + Options: []string{"rbind"}, + Destination: "/sys/kernel/debug", + Source: "/sys/kernel/debug", + Type: "bind", + }, + // Use shared propagations so mounts in this mount point propagate + // across the peer group + { + Options: []string{"rbind", "rshared"}, + Destination: "/.bottlerocket/rootfs/mnt", + Source: "/mnt", + Type: "bind", + }, + }) +} + +// withMounts sets the mounts' propagations as rprivate only when the +// mounts' options don't have propagations settings +func withMounts(mounts []runtimespec.Mount) oci.SpecOpts { + finalMounts := []runtimespec.Mount{} + + for _, mount := range mounts { + // Only set rprivate when no propagations are configured for + // the mount + if !hasPropagation(mount) { + // Update the local mount copy instead of the original + mount.Options = append(mount.Options, "rprivate") + } + finalMounts = append(finalMounts, mount) + } + + return 
oci.WithMounts(finalMounts) +} + +// hasPropagation checks if the mount has propagation options +func hasPropagation(mount runtimespec.Mount) bool { + // Propagations can be shared, rshared, private, rprivate, slave, rslave + for _, option := range mount.Options { + switch option { + case "shared", "rshared", "private", "rprivate", "slave", "rslave": + return true + } + } + + return false +} + +// withRootFsShared sets the rootfs mount propagation as `rshared` +func withRootFsShared() oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error { + if s.Linux != nil { + s.Linux.RootfsPropagation = "rshared" + } + return nil + } } // withProxyEnv reads proxy environment variables and returns a spec option for passing said proxy environment variables