Skip to content

Commit

Permalink
Adding mem monitoring measurement
Browse files Browse the repository at this point in the history
  • Loading branch information
yawangwang committed Jan 24, 2024
1 parent 912a436 commit 15808b4
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 29 deletions.
94 changes: 67 additions & 27 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,25 @@ func appendTokenMounts(mounts []specs.Mount) []specs.Mount {
return append(mounts, m)
}

func (r *ContainerRunner) measureCOSEvents(ctx context.Context) error {
if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
}

if r.launchSpec.Experiments.EnableMeasureMemoryMonitor {
if err := r.measureHealthMonitoringStates(ctx); err != nil {
return fmt.Errorf("failed to measure health monitoring states: %v", err)
}
r.logger.Println("Successfully measured memory monitoring state.")
}

separator := cel.CosTlv{
EventType: cel.LaunchSeparatorType,
EventContent: nil, // Success
}
return r.attestAgent.MeasureEvent(separator)
}

// measureContainerClaims will measure various container claims into the COS
// eventlog in the AttestationAgent.
func (r *ContainerRunner) measureContainerClaims(ctx context.Context) error {
Expand Down Expand Up @@ -334,11 +353,33 @@ func (r *ContainerRunner) measureContainerClaims(ctx context.Context) error {
}
}

separator := cel.CosTlv{
EventType: cel.LaunchSeparatorType,
EventContent: nil, // Success
return nil
}

// measureHealthMonitoringStates will measure various health monitoring states into the COS
// eventlog in the AttestationAgent.
func (r *ContainerRunner) measureHealthMonitoringStates(ctx context.Context) error {
s, err := systemctl.New()
if err != nil {
return fmt.Errorf("failed to create systemctl client: %v", err)
}
return r.attestAgent.MeasureEvent(separator)
defer s.Close()

status, err := s.GetStatus(ctx, "node-problem-detector.service")
if err != nil {
return err
}

var memoryMonitoringEnabledBit uint8
if status == "active" {
memoryMonitoringEnabledBit = 1
}

if err := r.attestAgent.MeasureEvent(cel.CosTlv{EventType: cel.MemoryMonitorType, EventContent: []byte{memoryMonitoringEnabledBit}}); err != nil {
return err
}

return nil
}

// Retrieves the default OIDC token from the attestation service, and returns how long
Expand Down Expand Up @@ -491,13 +532,6 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()

if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
}
if err := r.fetchAndWriteToken(ctx); err != nil {
return fmt.Errorf("failed to fetch and write OIDC token: %v", err)
}

r.logger.Printf("EnableTestFeatureForImage is set to %v\n", r.launchSpec.Experiments.EnableTestFeatureForImage)
// create and start the TEE server behind the experiment
if r.launchSpec.Experiments.EnableOnDemandAttestation {
Expand All @@ -510,24 +544,30 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
defer teeServer.Shutdown(ctx)
}

if r.launchSpec.Experiments.EnableMemoryMonitoring {
// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
r.logger.Println("MemoryMonitoring is enabled by the VM operator")
s, err := systemctl.New()
if err != nil {
return fmt.Errorf("failed to create systemctl client: %v", err)
}
defer s.Close()
// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
r.logger.Println("MemoryMonitoring is enabled by the VM operator")
s, err := systemctl.New()
if err != nil {
return fmt.Errorf("failed to create systemctl client: %v", err)
}
defer s.Close()

r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
if err := s.Start("node-problem-detector.service"); err != nil {
return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
}
r.logger.Println("node-problem-detector.service successfully started.")
} else {
r.logger.Println("MemoryMonitoring is disabled by the VM operator")
r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
if err := s.Start("node-problem-detector.service"); err != nil {
return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
}
r.logger.Println("node-problem-detector.service successfully started.")
} else {
r.logger.Println("MemoryMonitoring is disabled by the VM operator")
}

if err := r.measureCOSEvents(ctx); err != nil {
return fmt.Errorf("failed to measure COS events: %v", err)
}

if err := r.fetchAndWriteToken(ctx); err != nil {
return fmt.Errorf("failed to fetch and write OIDC token: %v", err)
}

var streamOpt cio.Opt
Expand Down
6 changes: 4 additions & 2 deletions launcher/image/test/test_memory_monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckMemoryMonitoringEnabled
entrypoint: 'bash'
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'node-problem-detector.service successfully started']
# Search a regex pattern that ensures memory monitoring is enabled and measured into COS event logs.
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'node-problem-detector.service successfully started.*Successfully measured memory monitoring state']
waitFor: ['CreateVMMemoryMemonitorEnabled']
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUpVMMemoryMonitorEnabled
Expand All @@ -47,7 +48,8 @@ steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckMemoryMonitoringDisabled
entrypoint: 'bash'
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-disable-${BUILD_ID}', '${_ZONE}', 'MemoryMonitoring is disabled by the VM operator']
# Search a regex pattern that ensures memory monitoring is disabled and measured into COS event logs.
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-disable-${BUILD_ID}', '${_ZONE}', 'MemoryMonitoring is disabled by the VM operator.*Successfully measured memory monitoring state']
waitFor: ['CreateVMMemoryMemonitorDisabled']
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUpVMMemoryMonitorDisabled
Expand Down
1 change: 1 addition & 0 deletions launcher/internal/experiments/experiments.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ type Experiments struct {
EnableSignedContainerImage bool
EnableOnDemandAttestation bool
EnableMemoryMonitoring bool
EnableMeasureMemoryMonitor bool
}

// New takes a filepath, opens the file, and calls ReadJsonInput with the contents
Expand Down
14 changes: 14 additions & 0 deletions launcher/internal/systemctl/systemctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
type Systemd interface {
Start(string) error
Stop(string) error
GetStatus(context.Context, string) (string, error)
Close()
}

Expand Down Expand Up @@ -42,6 +43,19 @@ func (s *Systemctl) Stop(unit string) error {
return runSystemdCmd(s.dbus.StopUnitContext, "stop", unit)
}

// GetStatus is the equivalent of `systemctl is-active $unit`.
// The status can be "active", "activating", "deactivating", "inactive" or "failed".
func (s *Systemctl) GetStatus(ctx context.Context, unit string) (string, error) {
status, err := s.dbus.ListUnitsByNamesContext(ctx, []string{unit})
if err != nil {
return "", err
}
if len(status) != 1 {
return "", fmt.Errorf("want 1 unit from ListUnitsByNames, got %d", len(status))
}
return status[0].ActiveState, nil
}

// Close disconnects from dbus.
func (s *Systemctl) Close() { s.dbus.Close() }

Expand Down
40 changes: 40 additions & 0 deletions launcher/internal/systemctl/systemctl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,43 @@ func TestRunSystmedCmd(t *testing.T) {
})
}
}

// TestGetStatus reads the `-.mount` which should exist on all systemd
// systems and ensures that one of its properties is valid.
func TestGetStatus(t *testing.T) {
systemctl, err := New()
if err != nil {
t.Skipf("Failed to create systemctl client: %v", err)
}

t.Cleanup(systemctl.Close)

testCases := []struct {
name string
unit string
want string
}{
{
name: "success",
unit: "-.mount", //`-.mount` which should exist on all systemd systems,
want: "active",
},
{
name: "success with an inactive unit",
unit: "node-problem-detector.service",
want: "inactive",
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got, err := systemctl.GetStatus(context.Background(), tc.unit)
if err != nil {
t.Fatalf("failed to read status for unit [%s]: %v", tc.unit, got)
}
if got != tc.want {
t.Errorf("GetStatus returned unexpected status for unit [%s], got %s, but want %s", tc.unit, got, tc.want)
}
})
}
}

0 comments on commit 15808b4

Please sign in to comment.