Skip to content

Commit

Permalink
Adding mem monitoring measurement
Browse files Browse the repository at this point in the history
  • Loading branch information
yawangwang committed Feb 1, 2024
1 parent fd156ad commit bfba9f6
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 26 deletions.
82 changes: 58 additions & 24 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,26 @@ func (r *ContainerRunner) measureContainerClaims(ctx context.Context) error {
return r.attestAgent.MeasureEvent(separator)
}

// measureMemoryMonitor will measure the current memory monitoring state into the COS
// eventlog in the AttestationAgent.
func (r *ContainerRunner) measureMemoryMonitor(ctx context.Context, systemd systemctl.Systemd) error {
	state, err := systemd.GetStatus(ctx, "node-problem-detector.service")
	if err != nil {
		return err
	}

	// Record 1 when the detector service is running (or coming up), 0 otherwise.
	var active uint8
	if state == "active" || state == "activating" {
		active = 1
	}

	return r.attestAgent.MeasureEvent(cel.CosTlv{EventType: cel.MemoryMonitorType, EventContent: []byte{active}})
}

// Retrieves the default OIDC token from the attestation service, and returns how long
// to wait before attempting to refresh it.
// The token file will be written to a tmp file and then renamed.
Expand Down Expand Up @@ -488,19 +508,36 @@ func defaultRetryPolicy() *backoff.ExponentialBackOff {
return expBack
}

// enableMemoryMonitoring will start node-problem-detector and measure the memory monitoring status if experiment flag is on.
func (r *ContainerRunner) enableMemoryMonitoring(ctx context.Context) error {
	r.logger.Println("MemoryMonitoring is enabled by the VM operator")

	sysd, err := systemctl.New()
	if err != nil {
		return fmt.Errorf("failed to create systemctl client: %v", err)
	}
	defer sysd.Close()

	r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
	if err := sysd.Start("node-problem-detector.service"); err != nil {
		return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
	}
	r.logger.Println("node-problem-detector.service successfully started.")

	// Measuring the state into the event log is gated behind its own experiment flag.
	if !r.launchSpec.Experiments.EnableMeasureMemoryMonitor {
		return nil
	}
	if err := r.measureMemoryMonitor(ctx, sysd); err != nil {
		return fmt.Errorf("failed to measure memory monitoring state: %v", err)
	}
	r.logger.Println("Successfully measured memory monitoring state.")
	return nil
}

// Run the container
// Container output will always be redirected to logger writer for now
func (r *ContainerRunner) Run(ctx context.Context) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()

if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
}
if err := r.fetchAndWriteToken(ctx); err != nil {
return fmt.Errorf("failed to fetch and write OIDC token: %v", err)
}

r.logger.Printf("EnableTestFeatureForImage is set to %v\n", r.launchSpec.Experiments.EnableTestFeatureForImage)
// create and start the TEE server behind the experiment
if r.launchSpec.Experiments.EnableOnDemandAttestation {
Expand All @@ -513,24 +550,21 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
defer teeServer.Shutdown(ctx)
}

if r.launchSpec.Experiments.EnableMemoryMonitoring {
// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
r.logger.Println("MemoryMonitoring is enabled by the VM operator")
s, err := systemctl.New()
if err != nil {
return fmt.Errorf("failed to create systemctl client: %v", err)
}
defer s.Close()

r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
if err := s.Start("node-problem-detector.service"); err != nil {
return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
}
r.logger.Println("node-problem-detector.service successfully started.")
} else {
r.logger.Println("MemoryMonitoring is disabled by the VM operator")
// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
if err := r.enableMemoryMonitoring(ctx); err != nil {
return err
}
} else {
r.logger.Println("MemoryMonitoring is disabled by the VM operator")
}

if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
}

if err := r.fetchAndWriteToken(ctx); err != nil {
return fmt.Errorf("failed to fetch and write OIDC token: %v", err)
}

var streamOpt cio.Opt
Expand Down
6 changes: 4 additions & 2 deletions launcher/image/test/test_memory_monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckMemoryMonitoringEnabled
entrypoint: 'bash'
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'node-problem-detector.service successfully started']
# Search a regex pattern that ensures memory monitoring is enabled and measured into COS event logs.
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'node-problem-detector.service successfully started.*Successfully measured memory monitoring state']
waitFor: ['CreateVMMemoryMemonitorEnabled']
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUpVMMemoryMonitorEnabled
Expand All @@ -47,7 +48,8 @@ steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckMemoryMonitoringDisabled
entrypoint: 'bash'
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-disable-${BUILD_ID}', '${_ZONE}', 'MemoryMonitoring is disabled by the VM operator']
# Search a regex pattern that ensures memory monitoring is disabled and measured into COS event logs.
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-disable-${BUILD_ID}', '${_ZONE}', 'MemoryMonitoring is disabled by the VM operator.*Successfully measured memory monitoring state']
waitFor: ['CreateVMMemoryMemonitorDisabled']
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUpVMMemoryMonitorDisabled
Expand Down
1 change: 1 addition & 0 deletions launcher/internal/experiments/experiments.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ type Experiments struct {
EnableOnDemandAttestation bool
EnableMemoryMonitoring bool
EnableSignedContainerCache bool
EnableMeasureMemoryMonitor bool
}

// New takes a filepath, opens the file, and calls ReadJsonInput with the contents
Expand Down
14 changes: 14 additions & 0 deletions launcher/internal/systemctl/systemctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
// Systemd abstracts the systemd operations used by the launcher.
// The concrete implementation is Systemctl, which talks to systemd over dbus.
type Systemd interface {
	// Start starts the named systemd unit.
	Start(string) error
	// Stop stops the named systemd unit.
	Stop(string) error
	// GetStatus returns the ActiveState of the named unit (e.g. "active", "inactive").
	GetStatus(context.Context, string) (string, error)
	// Close releases the underlying connection.
	Close()
}

Expand Down Expand Up @@ -42,6 +43,19 @@ func (s *Systemctl) Stop(unit string) error {
return runSystemdCmd(s.dbus.StopUnitContext, "stop", unit)
}

// GetStatus is the equivalent of `systemctl is-active $unit`.
// The status can be "active", "activating", "deactivating", "inactive" or "failed".
func (s *Systemctl) GetStatus(ctx context.Context, unit string) (string, error) {
	// ListUnitsByNames returns one entry per requested unit name.
	units, err := s.dbus.ListUnitsByNamesContext(ctx, []string{unit})
	if err != nil {
		return "", err
	}
	if n := len(units); n != 1 {
		return "", fmt.Errorf("want 1 unit from ListUnitsByNames, got %d", n)
	}
	return units[0].ActiveState, nil
}

// Close disconnects from dbus.
func (s *Systemctl) Close() {
	s.dbus.Close()
}

Expand Down
40 changes: 40 additions & 0 deletions launcher/internal/systemctl/systemctl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,43 @@ func TestRunSystmedCmd(t *testing.T) {
})
}
}

// TestGetStatus reads the `-.mount` which should exist on all systemd
// systems and ensures that one of its properties is valid.
func TestGetStatus(t *testing.T) {
	systemctl, err := New()
	if err != nil {
		// No systemd/dbus available (e.g. in a container); skip rather than fail.
		t.Skipf("Failed to create systemctl client: %v", err)
	}

	t.Cleanup(systemctl.Close)

	testCases := []struct {
		name string
		unit string
		want string
	}{
		{
			name: "success",
			unit: "-.mount", // `-.mount` should exist on all systemd systems.
			want: "active",
		},
		{
			name: "success with an inactive unit",
			unit: "node-problem-detector.service",
			want: "inactive",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := systemctl.GetStatus(context.Background(), tc.unit)
			if err != nil {
				// Fixed: report err here, not `got` — on error `got` is the empty
				// string, which hid the actual failure cause from the test log.
				t.Fatalf("failed to read status for unit [%s]: %v", tc.unit, err)
			}
			if got != tc.want {
				t.Errorf("GetStatus returned unexpected status for unit [%s], got %s, but want %s", tc.unit, got, tc.want)
			}
		})
	}
}

0 comments on commit bfba9f6

Please sign in to comment.