Skip to content

Commit

Permalink
Adding mem monitoring measurement
Browse files Browse the repository at this point in the history
  • Loading branch information
yawangwang committed Feb 1, 2024
1 parent fd156ad commit bfba9f6
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 26 deletions.
82 changes: 58 additions & 24 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,26 @@ func (r *ContainerRunner) measureContainerClaims(ctx context.Context) error {
return r.attestAgent.MeasureEvent(separator)
}

// measureMemoryMonitor will measure the current memory monitoring state into the COS
// eventlog in the AttestationAgent.
func (r *ContainerRunner) measureMemoryMonitor(ctx context.Context, systemd systemctl.Systemd) error {
	state, err := systemd.GetStatus(ctx, "node-problem-detector.service")
	if err != nil {
		return err
	}

	// Record 1 when the detector service is running (or coming up), 0 otherwise.
	var active uint8
	if state == "active" || state == "activating" {
		active = 1
	}

	return r.attestAgent.MeasureEvent(cel.CosTlv{EventType: cel.MemoryMonitorType, EventContent: []byte{active}})
}

// Retrieves the default OIDC token from the attestation service, and returns how long
// to wait before attempting to refresh it.
// The token file will be written to a tmp file and then renamed.
Expand Down Expand Up @@ -488,19 +508,36 @@ func defaultRetryPolicy() *backoff.ExponentialBackOff {
return expBack
}

// enableMemoryMonitoring will start node-problem-detector and measure the memory monitoring status if experiment flag is on.
func (r *ContainerRunner) enableMemoryMonitoring(ctx context.Context) error {
	r.logger.Println("MemoryMonitoring is enabled by the VM operator")

	sysd, err := systemctl.New()
	if err != nil {
		return fmt.Errorf("failed to create systemctl client: %v", err)
	}
	defer sysd.Close()

	r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
	if err := sysd.Start("node-problem-detector.service"); err != nil {
		return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
	}
	r.logger.Println("node-problem-detector.service successfully started.")

	// Measuring the state into the event log is gated behind its own experiment flag.
	if !r.launchSpec.Experiments.EnableMeasureMemoryMonitor {
		return nil
	}
	if err := r.measureMemoryMonitor(ctx, sysd); err != nil {
		return fmt.Errorf("failed to measure memory monitoring state: %v", err)
	}
	r.logger.Println("Successfully measured memory monitoring state.")
	return nil
}

// Run the container
// Container output will always be redirected to logger writer for now
func (r *ContainerRunner) Run(ctx context.Context) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()

if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
}
if err := r.fetchAndWriteToken(ctx); err != nil {
return fmt.Errorf("failed to fetch and write OIDC token: %v", err)
}

r.logger.Printf("EnableTestFeatureForImage is set to %v\n", r.launchSpec.Experiments.EnableTestFeatureForImage)
// create and start the TEE server behind the experiment
if r.launchSpec.Experiments.EnableOnDemandAttestation {
Expand All @@ -513,24 +550,21 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
defer teeServer.Shutdown(ctx)
}

if r.launchSpec.Experiments.EnableMemoryMonitoring {
// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
r.logger.Println("MemoryMonitoring is enabled by the VM operator")
s, err := systemctl.New()
if err != nil {
return fmt.Errorf("failed to create systemctl client: %v", err)
}
defer s.Close()

r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
if err := s.Start("node-problem-detector.service"); err != nil {
return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
}
r.logger.Println("node-problem-detector.service successfully started.")
} else {
r.logger.Println("MemoryMonitoring is disabled by the VM operator")
// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
if err := r.enableMemoryMonitoring(ctx); err != nil {
return err
}
} else {
r.logger.Println("MemoryMonitoring is disabled by the VM operator")
}

if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
}

if err := r.fetchAndWriteToken(ctx); err != nil {
return fmt.Errorf("failed to fetch and write OIDC token: %v", err)
}

var streamOpt cio.Opt
Expand Down
6 changes: 4 additions & 2 deletions launcher/image/test/test_memory_monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckMemoryMonitoringEnabled
entrypoint: 'bash'
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'node-problem-detector.service successfully started']
# Search a regex pattern that ensures memory monitoring is enabled and measured into COS event logs.
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'node-problem-detector.service successfully started.*Successfully measured memory monitoring state']
waitFor: ['CreateVMMemoryMemonitorEnabled']
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUpVMMemoryMonitorEnabled
Expand All @@ -47,7 +48,8 @@ steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckMemoryMonitoringDisabled
entrypoint: 'bash'
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-disable-${BUILD_ID}', '${_ZONE}', 'MemoryMonitoring is disabled by the VM operator']
# Search a regex pattern that ensures memory monitoring is disabled and measured into COS event logs.
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-disable-${BUILD_ID}', '${_ZONE}', 'MemoryMonitoring is disabled by the VM operator.*Successfully measured memory monitoring state']
waitFor: ['CreateVMMemoryMemonitorDisabled']
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUpVMMemoryMonitorDisabled
Expand Down
1 change: 1 addition & 0 deletions launcher/internal/experiments/experiments.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ type Experiments struct {
EnableOnDemandAttestation bool
EnableMemoryMonitoring bool
EnableSignedContainerCache bool
EnableMeasureMemoryMonitor bool
}

// New takes a filepath, opens the file, and calls ReadJsonInput with the contents
Expand Down
14 changes: 14 additions & 0 deletions launcher/internal/systemctl/systemctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
// Systemd abstracts the systemd operations used by the launcher.
// The concrete implementation is Systemctl, which talks to systemd over dbus.
type Systemd interface {
	// Start starts the named systemd unit.
	Start(string) error
	// Stop stops the named systemd unit.
	Stop(string) error
	// GetStatus returns the ActiveState of the named unit (e.g. "active", "inactive").
	GetStatus(context.Context, string) (string, error)
	// Close releases the underlying connection.
	Close()
}

Expand Down Expand Up @@ -42,6 +43,19 @@ func (s *Systemctl) Stop(unit string) error {
return runSystemdCmd(s.dbus.StopUnitContext, "stop", unit)
}

// GetStatus is the equivalent of `systemctl is-active $unit`.
// The status can be "active", "activating", "deactivating", "inactive" or "failed".
func (s *Systemctl) GetStatus(ctx context.Context, unit string) (string, error) {
	// ListUnitsByNames returns one entry per requested unit name.
	units, err := s.dbus.ListUnitsByNamesContext(ctx, []string{unit})
	if err != nil {
		return "", err
	}
	if n := len(units); n != 1 {
		return "", fmt.Errorf("want 1 unit from ListUnitsByNames, got %d", n)
	}
	return units[0].ActiveState, nil
}

// Close disconnects from dbus.
func (s *Systemctl) Close() {
	s.dbus.Close()
}

Expand Down
40 changes: 40 additions & 0 deletions launcher/internal/systemctl/systemctl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,43 @@ func TestRunSystmedCmd(t *testing.T) {
})
}
}

// TestGetStatus reads the `-.mount` which should exist on all systemd
// systems and ensures that one of its properties is valid.
func TestGetStatus(t *testing.T) {
	systemctl, err := New()
	if err != nil {
		// No systemd/dbus available (e.g. in a container); skip rather than fail.
		t.Skipf("Failed to create systemctl client: %v", err)
	}

	t.Cleanup(systemctl.Close)

	testCases := []struct {
		name string
		unit string
		want string
	}{
		{
			name: "success",
			unit: "-.mount", // `-.mount` should exist on all systemd systems.
			want: "active",
		},
		{
			name: "success with an inactive unit",
			unit: "node-problem-detector.service",
			want: "inactive",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := systemctl.GetStatus(context.Background(), tc.unit)
			if err != nil {
				// Fixed: report err here, not `got` — on error `got` is the empty
				// string, which hid the actual failure cause from the test log.
				t.Fatalf("failed to read status for unit [%s]: %v", tc.unit, err)
			}
			if got != tc.want {
				t.Errorf("GetStatus returned unexpected status for unit [%s], got %s, but want %s", tc.unit, got, tc.want)
			}
		})
	}
}

0 comments on commit bfba9f6

Please sign in to comment.