Skip to content

Commit

Permalink
[launcher] Change retry behavior to reboot
Browse files Browse the repository at this point in the history
Update scripts and service unit file

Signed-off-by: Jiankun Lu <jiankun@google.com>
  • Loading branch information
jkl73 committed Nov 18, 2022
1 parent ef14c14 commit 31dba6e
Show file tree
Hide file tree
Showing 13 changed files with 378 additions and 170 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
launcher/launcher
launcher/launcher/launcher
*.test
*.test.exe
cmd/gotpm/gotpm
Expand Down
4 changes: 2 additions & 2 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ steps:
args:
- -c
- |
cd launcher
go build -o image/launcher
cd launcher/launcher
go build -o ../image/launcher
- name: 'gcr.io/cos-cloud/cos-customizer'
args: ['start-image-build',
'-build-context=launcher/image',
Expand Down
2 changes: 1 addition & 1 deletion launcher/auth.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package launcher

import (
"encoding/json"
Expand Down
92 changes: 44 additions & 48 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
package main
// Package launcher contains functionality to start a measured workload.
package launcher

import (
"context"
Expand Down Expand Up @@ -86,7 +87,7 @@ func fetchImpersonatedToken(ctx context.Context, serviceAccount string, audience
func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.Token, launchSpec spec.LaunchSpec, mdsClient *metadata.Client, tpm io.ReadWriteCloser, logger *log.Logger) (*ContainerRunner, error) {
image, err := initImage(ctx, cdClient, launchSpec, token, logger)
if err != nil {
return nil, err
return nil, &NonRetryableError{err}
}

mounts := make([]specs.Mount, 0)
Expand All @@ -112,10 +113,10 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
logger.Printf("Image Labels : %v\n", imageLabels)
launchPolicy, err := spec.GetLaunchPolicy(imageLabels)
if err != nil {
return nil, err
return nil, &NonRetryableError{err}
}
if err := launchPolicy.Verify(launchSpec); err != nil {
return nil, err
return nil, &NonRetryableError{err}
}

if imageConfig, err := image.Config(ctx); err != nil {
Expand All @@ -127,7 +128,7 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To

hostname, err := os.Hostname()
if err != nil {
return nil, fmt.Errorf("cannot get hostname: [%w]", err)
return nil, &RetryableError{fmt.Errorf("cannot get hostname: [%w]", err)}
}

container, err = cdClient.NewContainer(
Expand All @@ -151,19 +152,21 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
if container != nil {
container.Delete(ctx, containerd.WithSnapshotCleanup)
}
return nil, fmt.Errorf("failed to create a container: [%w]", err)
return nil, &RetryableError{fmt.Errorf("failed to create a container: [%w]", err)}
}

containerSpec, err := container.Spec(ctx)
if err != nil {
return nil, err
return nil, &RetryableError{err}
}
// Container process Args length should be strictly longer than the Cmd
// override length set by the operator, as we want the Entrypoint field
// to be mandatory for the image.
// Roughly speaking, Args = Entrypoint + Cmd
if len(containerSpec.Process.Args) <= len(launchSpec.Cmd) {
return nil, fmt.Errorf("length of Args [%d] is shorter or equal to the length of the given Cmd [%d], maybe the Entrypoint is set to empty in the image?", len(containerSpec.Process.Args), len(launchSpec.Cmd))
return nil, &NonRetryableError{
fmt.Errorf("length of Args [%d] is shorter or equal to the length of the given Cmd [%d], maybe the Entrypoint is set to empty in the image?",
len(containerSpec.Process.Args), len(launchSpec.Cmd))}
}

// Fetch ID token with specific audience.
Expand Down Expand Up @@ -207,7 +210,7 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
verifierClient, conn, err = getGRPCClient(asAddr, logger)
}
if err != nil {
return nil, fmt.Errorf("failed to create verifier client: %v", err)
return nil, &NonRetryableError{fmt.Errorf("failed to create verifier client: %v", err)}
}

return &ContainerRunner{
Expand Down Expand Up @@ -411,53 +414,46 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
defer cancel()

if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
return &NonRetryableError{fmt.Errorf("failed to measure container claims: %v", err)}
}
if err := r.fetchAndWriteToken(ctx); err != nil {
return fmt.Errorf("failed to fetch and write OIDC token: %v", err)
return &NonRetryableError{fmt.Errorf("failed to fetch and write OIDC token: %v", err)}
}

for {
var streamOpt cio.Opt
if r.launchSpec.LogRedirect {
streamOpt = cio.WithStreams(nil, r.logger.Writer(), r.logger.Writer())
r.logger.Println("container stdout/stderr will be redirected")
} else {
streamOpt = cio.WithStreams(nil, nil, nil)
r.logger.Println("container stdout/stderr will not be redirected")
}

task, err := r.container.NewTask(ctx, cio.NewCreator(streamOpt))
if err != nil {
return err
}
exitStatus, err := task.Wait(ctx)
if err != nil {
return err
}
r.logger.Println("task started")
var streamOpt cio.Opt
if r.launchSpec.LogRedirect {
streamOpt = cio.WithStreams(nil, r.logger.Writer(), r.logger.Writer())
r.logger.Println("container stdout/stderr will be redirected")
} else {
streamOpt = cio.WithStreams(nil, nil, nil)
r.logger.Println("container stdout/stderr will not be redirected")
}

if err := task.Start(ctx); err != nil {
return err
}
status := <-exitStatus
task, err := r.container.NewTask(ctx, cio.NewCreator(streamOpt))
if err != nil {
return &RetryableError{err}
}
exitStatus, err := task.Wait(ctx)
if err != nil {
return &RetryableError{err}
}
r.logger.Println("workload task started")

code, _, err := status.Result()
if err != nil {
return err
}
task.Delete(ctx)

r.logger.Printf("task ended with return code %d \n", code)
if r.launchSpec.RestartPolicy == spec.Always {
r.logger.Println("restarting task")
} else if r.launchSpec.RestartPolicy == spec.OnFailure && code != 0 {
r.logger.Println("restarting task on failure")
} else {
break
}
if err := task.Start(ctx); err != nil {
return &RetryableError{err}
}
status := <-exitStatus

code, _, err := status.Result()
if err != nil {
return &NonRetryableError{err}
}
if _, err := task.Delete(ctx); err != nil {
return &NonRetryableError{err}
}
if code != 0 {
return &WorkloadError{code}
}
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion launcher/container_runner_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package launcher

import (
"bytes"
Expand Down
28 changes: 28 additions & 0 deletions launcher/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package launcher

// RetryableError means launcher should reboot the VM to retry.
// It wraps the underlying error that caused the failure.
type RetryableError struct {
	Err error
}

// NonRetryableError means launcher shouldn't reboot the VM to retry.
// It wraps the underlying error that caused the failure.
type NonRetryableError struct {
	Err error
}

// WorkloadError represents the result of a workload/task that is non-zero.
type WorkloadError struct {
	ReturnCode uint32
}

// Error returns the message of the wrapped error unchanged.
func (e *RetryableError) Error() string {
	return e.Err.Error()
}

// Unwrap returns the wrapped error so that errors.Is and errors.As can see
// through the retry classification to the underlying cause.
func (e *RetryableError) Unwrap() error {
	return e.Err
}

// Error returns the message of the wrapped error unchanged.
func (e *NonRetryableError) Error() string {
	return e.Err.Error()
}

// Unwrap returns the wrapped error so that errors.Is and errors.As can see
// through the retry classification to the underlying cause.
func (e *NonRetryableError) Unwrap() error {
	return e.Err
}

// Error returns a fixed message; the actual return code is available in the
// ReturnCode field.
func (e *WorkloadError) Error() string {
	return "workload finished with non-zero return code"
}
7 changes: 2 additions & 5 deletions launcher/image/container-runner.service
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@ Wants=network-online.target gcr-online.target containerd.service
After=network-online.target gcr-online.target containerd.service

[Service]
ExecStart=/var/lib/google/cc_container_launcher
# Shutdown the host after the launcher exits
ExecStopPost=/bin/sleep 60
ExecStopPost=/usr/bin/systemctl poweroff
ExecStart=/usr/share/oem/confidential_space/cs_container_launcher
ExecStopPost=/usr/share/oem/confidential_space/exit_script.sh
Restart=no
# RestartSec=90
StandardOutput=journal+console
StandardError=journal+console

Expand Down
8 changes: 1 addition & 7 deletions launcher/image/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
#!/bin/bash

main() {
# copy the binary
cp /usr/share/oem/cc_container_launcher /var/lib/google/cc_container_launcher
chmod +x /var/lib/google/cc_container_launcher

# copy systemd files
cp /usr/share/oem/container-runner.service /etc/systemd/system/container-runner.service
mkdir -p /etc/systemd/system/container-runner.service.d/
cp /usr/share/oem/launcher.conf /etc/systemd/system/container-runner.service.d/launcher.conf
cp /usr/share/oem/confidential_space/container-runner.service /etc/systemd/system/container-runner.service

systemctl daemon-reload
systemctl enable container-runner.service
Expand Down
14 changes: 14 additions & 0 deletions launcher/image/exit_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#! /bin/bash
# Post-stop hook for the launcher service (referenced as ExecStopPost in
# container-runner.service). Decides what happens to the VM next, based on
# the EXIT_STATUS value found in the environment.
# NOTE(review): EXIT_STATUS is presumably the launcher's exit code as exported
# by systemd to ExecStopPost commands -- confirm.

# Status 3: schedule a reboot in 2 minutes.
if [[ $EXIT_STATUS -eq 3 ]]; then
  shutdown --reboot +2
fi

# Status 0, 1 or 2: schedule a poweroff in 2 minutes.
# Any other status schedules nothing.
if [[ $EXIT_STATUS -eq 0 || $EXIT_STATUS -eq 1 || $EXIT_STATUS -eq 2 ]]; then
  shutdown --poweroff +2
fi

27 changes: 11 additions & 16 deletions launcher/image/preload.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
#!/bin/bash

readonly OEM_PATH='/usr/share/oem'
readonly CS_PATH="${OEM_PATH}/confidential_space"

copy_launcher() {
cp launcher /usr/share/oem/cc_container_launcher
cp launcher "${CS_PATH}/cs_container_launcher"
}

setup_launcher_systemd_unit() {
cp container-runner.service /usr/share/oem/container-runner.service

if [ "$IMAGE_ENV" == "hardened" ]; then
cp hardened.conf /usr/share/oem/launcher.conf
elif [ "$IMAGE_ENV" == "debug" ]; then
cp debug.conf /usr/share/oem/launcher.conf
else
echo "Unknown IMAGE_ENV: ${IMAGE_ENV}. Use hardened or debug"
exit 1
fi
cp container-runner.service "${CS_PATH}/container-runner.service"
cp exit_script.sh "${CS_PATH}/exit_script.sh"
}

append_cmdline() {
Expand All @@ -40,9 +35,9 @@ enable_unit() {
}

configure_entrypoint() {
cp "$1" /usr/share/oem/user-data
touch /usr/share/oem/meta-data
append_cmdline "'ds=nocloud;s=/usr/share/oem/'"
cp "$1" ${OEM_PATH}/user-data
touch ${OEM_PATH}/meta-data
append_cmdline "'ds=nocloud;s=${OEM_PATH}/'"
}

configure_necessary_systemd_units() {
Expand All @@ -62,7 +57,6 @@ configure_systemd_units_for_debug() {
# No-op for now, as debug will default to using multi-user.target.
:
}

configure_systemd_units_for_hardened() {
configure_necessary_systemd_units
# Make entrypoint (via cloud-init) the default unit.
Expand All @@ -81,7 +75,8 @@ configure_systemd_units_for_hardened() {
}

main() {
mount -o remount,rw /usr/share/oem
mount -o remount,rw ${OEM_PATH}
mkdir ${CS_PATH}

# Install container launcher entrypoint.
configure_entrypoint "entrypoint.sh"
Expand Down
Loading

0 comments on commit 31dba6e

Please sign in to comment.