From 24d1922da1d2be9541d12c41d074387d6c2e1148 Mon Sep 17 00:00:00 2001
From: Christian Rohmann
Date: Thu, 22 Aug 2024 09:53:20 +0200
Subject: [PATCH 1/3] Use exit codes (of k6 archive + inspect) as sole
 indication for initialization success

1) Introduce an EmptyDir volume for temporary data.

   This removes the need for any mkdir and avoids writing to the container
   filesystem, allowing for e.g. `readOnlyRootFilesystem` security
   contexts ([1]).

2) Remove all output redirection and grepping for error-indicating strings
   in favor of using the exit codes of the commands to indicate success or
   failure.

   This approach is much cleaner with regard to changes in the output of
   k6 and the vast space of possible errors. It also reestablishes the
   clear contract of the `inspect` command to judge the provided config:
   `k6 inspect --execution-requirements` ([2,3]), while also printing
   warnings or the JSON, clearly indicates via `err` if there are issues
   with the provided tests, which is then converted into a non-zero RC
   ([4]).

[1] https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
[2] https://github.com/grafana/k6/blob/c212bd277aeedbf8395b917df11bd9be4da490a4/cmd/inspect.go#L34
[3] https://github.com/grafana/k6/blob/c212bd277aeedbf8395b917df11bd9be4da490a4/cmd/inspect.go#L64
[4] https://github.com/grafana/k6/blob/master/cmd/root.go#L90-L118

Fixes: #435
---
 pkg/resources/jobs/initializer.go      | 34 ++++++++++++--------------
 pkg/resources/jobs/initializer_test.go | 22 +++++++++++++++--
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/pkg/resources/jobs/initializer.go b/pkg/resources/jobs/initializer.go
index 64bb2de4..f93d5028 100644
--- a/pkg/resources/jobs/initializer.go
+++ b/pkg/resources/jobs/initializer.go
@@ -61,31 +61,29 @@ func NewInitializerJob(k6 v1alpha1.TestRunI, argLine string) (*batchv1.Job, erro
 	)
 
 	istioCommand, istioEnabled := newIstioCommand(k6.GetSpec().Scuttle.Enabled, []string{"sh", "-c"})
 	command := append(istioCommand, fmt.Sprintf(
-		// There can be several scenarios from k6 command here:
-		// a) script is correct and `k6 inspect` outputs JSON
-		// b) script is partially incorrect and `k6` outputs a warning log message and
-		// then a JSON
-		// c) script is incorrect and `k6` outputs an error log message
-		// Warnings at this point are not necessary (warning messages will re-appear in
-		// runner's logs and the user can see them there) so we need a pure JSON here
-		// without any additional messages in cases a) and b). In case c), output should
-		// contain error message and the Job is to exit with non-zero code.
-		//
-		// Due to some pecularities of k6 logging, to achieve the above behaviour,
-		// we need to use a workaround to store all log messages in temp file while
-		// printing JSON as usual. Then parse temp file only for errors, ignoring
-		// any other log messages.
-		// Related: https://github.com/grafana/k6-docs/issues/877
-		"mkdir -p $(dirname %s) && k6 archive %s -O %s %s 2> /tmp/k6logs && k6 inspect --execution-requirements %s 2> /tmp/k6logs ; ! cat /tmp/k6logs | grep 'level=error'",
-		archiveName, scriptName, archiveName, argLine,
-		archiveName))
+		"k6 archive %s -O %s %s && k6 inspect --execution-requirements %s",
+		scriptName, archiveName, argLine, archiveName))
 
 	env := append(newIstioEnvVar(k6.GetSpec().Scuttle, istioEnabled), k6.GetSpec().Initializer.Env...)
 
 	volumes := script.Volume()
+	// emptyDir to hold our temporary data
+	tmpVolume := corev1.Volume{
+		Name: "tmpdir",
+		VolumeSource: corev1.VolumeSource{
+			EmptyDir: &corev1.EmptyDirVolumeSource{},
+		},
+	}
+	volumes = append(volumes, tmpVolume)
 	volumes = append(volumes, k6.GetSpec().Initializer.Volumes...)
 
 	volumeMounts := script.VolumeMount()
+	// make /tmp an EmptyDir
+	tmpVolumeMount := corev1.VolumeMount{
+		Name:      "tmpdir",
+		MountPath: "/tmp",
+	}
+	volumeMounts = append(volumeMounts, tmpVolumeMount)
 	volumeMounts = append(volumeMounts, k6.GetSpec().Initializer.VolumeMounts...)
 
 	var zero32 int32
diff --git a/pkg/resources/jobs/initializer_test.go b/pkg/resources/jobs/initializer_test.go
index 868d13ef..ec03c39a 100644
--- a/pkg/resources/jobs/initializer_test.go
+++ b/pkg/resources/jobs/initializer_test.go
@@ -21,6 +21,24 @@ func TestNewInitializerJob(t *testing.T) {
 	automountServiceAccountToken := true
 	zero := int32(0)
 
+	volumes := script.Volume()
+	// emptyDir to hold our temporary data
+	tmpVolume := corev1.Volume{
+		Name: "tmpdir",
+		VolumeSource: corev1.VolumeSource{
+			EmptyDir: &corev1.EmptyDirVolumeSource{},
+		},
+	}
+	volumes = append(volumes, tmpVolume)
+
+	volumeMounts := script.VolumeMount()
+	// make /tmp an EmptyDir
+	tmpVolumeMount := corev1.VolumeMount{
+		Name:      "tmpdir",
+		MountPath: "/tmp",
+	}
+	volumeMounts = append(volumeMounts, tmpVolumeMount)
+
 	expectedOutcome := &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-initializer",
@@ -63,7 +81,7 @@ func TestNewInitializerJob(t *testing.T) {
 					Name: "k6",
 					Command: []string{
 						"sh", "-c",
-						"mkdir -p $(dirname /tmp/test.js.archived.tar) && k6 archive /test/test.js -O /tmp/test.js.archived.tar --out cloud 2> /tmp/k6logs && k6 inspect --execution-requirements /tmp/test.js.archived.tar 2> /tmp/k6logs ; ! cat /tmp/k6logs | grep 'level=error'",
+						"k6 archive /test/test.js -O /tmp/test.js.archived.tar --out cloud && k6 inspect --execution-requirements /tmp/test.js.archived.tar",
 					},
 					Env:     []corev1.EnvVar{},
 					EnvFrom: []corev1.EnvFromSource{
@@ -81,7 +99,7 @@ func TestNewInitializerJob(t *testing.T) {
 					SecurityContext: &corev1.SecurityContext{},
 				},
 			},
-			Volumes: script.Volume(),
+			Volumes: volumes,
 		},
 	},
 },

From 28685358f5659e63d2cd0297f86616bb639cb6ad Mon Sep 17 00:00:00 2001
From: Christian Rohmann
Date: Thu, 22 Aug 2024 13:24:02 +0200
Subject: [PATCH 2/3] Set TerminationMessagePolicy on Initializer to use logs
 as fallback

While not explicitly used yet, with the removal of the log fetching from
the Initializer (Job -> Pod -> Container), any potential source of
verbose information about an error might be lost.

The Kubernetes way to provide more detail than the exit code about the
reason a container terminated is the termination message (see [1]). By
default this message needs to be written explicitly to
`/dev/termination-log`, but there is also the option to use the last bit
of log output as a fallback source.
[1] https://kubernetes.io/docs/tasks/debug/debug-application/determine-reason-pod-failure/
---
 pkg/resources/jobs/initializer.go      | 21 +++++++++++----------
 pkg/resources/jobs/initializer_test.go |  9 +++++----
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/pkg/resources/jobs/initializer.go b/pkg/resources/jobs/initializer.go
index f93d5028..9db0da01 100644
--- a/pkg/resources/jobs/initializer.go
+++ b/pkg/resources/jobs/initializer.go
@@ -114,16 +114,17 @@ func NewInitializerJob(k6 v1alpha1.TestRunI, argLine string) (*batchv1.Job, erro
 			InitContainers: getInitContainers(k6.GetSpec().Initializer, script),
 			Containers: []corev1.Container{
 				{
-					Image:           image,
-					ImagePullPolicy: k6.GetSpec().Initializer.ImagePullPolicy,
-					Name:            "k6",
-					Command:         command,
-					Env:             env,
-					Resources:       k6.GetSpec().Initializer.Resources,
-					VolumeMounts:    volumeMounts,
-					EnvFrom:         k6.GetSpec().Initializer.EnvFrom,
-					Ports:           ports,
-					SecurityContext: &k6.GetSpec().Initializer.ContainerSecurityContext,
+					Image:                    image,
+					ImagePullPolicy:          k6.GetSpec().Initializer.ImagePullPolicy,
+					Name:                     "k6",
+					Command:                  command,
+					Env:                      env,
+					Resources:                k6.GetSpec().Initializer.Resources,
+					VolumeMounts:             volumeMounts,
+					EnvFrom:                  k6.GetSpec().Initializer.EnvFrom,
+					Ports:                    ports,
+					SecurityContext:          &k6.GetSpec().Initializer.ContainerSecurityContext,
+					TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
 				},
 			},
 			Volumes: volumes,
diff --git a/pkg/resources/jobs/initializer_test.go b/pkg/resources/jobs/initializer_test.go
index ec03c39a..10d7ac58 100644
--- a/pkg/resources/jobs/initializer_test.go
+++ b/pkg/resources/jobs/initializer_test.go
@@ -93,10 +93,11 @@ func TestNewInitializerJob(t *testing.T) {
 					},
 				},
 			},
-					Resources:       corev1.ResourceRequirements{},
-					VolumeMounts:    volumeMounts,
-					Ports:           []corev1.ContainerPort{{ContainerPort: 6565}},
-					SecurityContext: &corev1.SecurityContext{},
+					Resources:                corev1.ResourceRequirements{},
+					VolumeMounts:             volumeMounts,
+					Ports:                    []corev1.ContainerPort{{ContainerPort: 6565}},
+					SecurityContext:          &corev1.SecurityContext{},
+					TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
 				},
 			},
 			Volumes: volumes,

From 1fd1c56387246b0cb1a8dee87713dee899d5d924 Mon Sep 17 00:00:00 2001
From: Christian Rohmann
Date: Thu, 22 Aug 2024 13:12:08 +0200
Subject: [PATCH 3/3] Remove fetching of container logs from initializer

Following the focus on the exit code of the command used in the
initializer Job (Pod), this commit removes the fetching of the container
log. Only a basic JSON unmarshalling was applied to it, with no
interpretation of what it contained; this is either covered by
`k6 inspect` exiting with rc=0 or should be added to the initializer Job.

If further details about the failure reason of the initializer container
are required, the source should be the termination message (see [1]).
It could be used to enrich the TestRun CR to provide a higher level of
detail about the failure to the user.
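For illustration only (not part of this change), a minimal sketch of how
the controller could read that termination message, assuming the podList
already fetched in inspectTestRun and the logr.Logger passed into it; the
actual enrichment of the TestRun CR is left out, as no status field for
it exists yet:

    for _, cs := range podList.Items[0].Status.ContainerStatuses {
    	if cs.Name != "k6" || cs.State.Terminated == nil {
    		continue
    	}
    	if cs.State.Terminated.ExitCode != 0 {
    		// Message holds /dev/termination-log or, with
    		// FallbackToLogsOnError, the tail of the container log.
    		log.Info("initializer failed",
    			"exitCode", cs.State.Terminated.ExitCode,
    			"terminationMessage", cs.State.Terminated.Message)
    	}
    }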
[1] https://kubernetes.io/docs/tasks/debug/debug-application/determine-reason-pod-failure/#customizing-the-termination-message
---
 controllers/common.go | 55 +------------------------------------------
 1 file changed, 1 insertion(+), 54 deletions(-)

diff --git a/controllers/common.go b/controllers/common.go
index 078d96b5..55278fb2 100644
--- a/controllers/common.go
+++ b/controllers/common.go
@@ -1,24 +1,17 @@
 package controllers
 
 import (
-	"bytes"
 	"context"
-	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
-	"time"
 
 	"github.com/go-logr/logr"
 	"github.com/grafana/k6-operator/api/v1alpha1"
 	"github.com/grafana/k6-operator/pkg/cloud"
 	"github.com/grafana/k6-operator/pkg/testrun"
 	corev1 "k8s.io/api/core/v1"
-	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/types"
-	"k8s.io/client-go/kubernetes"
-	"k8s.io/client-go/rest"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -65,52 +58,6 @@ func inspectTestRun(ctx context.Context, log logr.Logger, k6 v1alpha1.TestRunI,
 		return
 	}
 
-	// Here we need to get the output of the pod
-	// pods/log is not currently supported by controller-runtime client and it is officially
-	// recommended to use REST client instead:
-	// https://github.com/kubernetes-sigs/controller-runtime/issues/1229
-
-	// TODO: if the below errors repeat several times, it'd be a real error case scenario.
-	// How likely is it? Should we track frequency of these errors here?
-	config, err := rest.InClusterConfig()
-	if err != nil {
-		log.Error(err, "unable to fetch in-cluster REST config")
-		returnErr = err
-		return
-	}
-
-	clientset, err := kubernetes.NewForConfig(config)
-	if err != nil {
-		log.Error(err, "unable to get access to clientset")
-		returnErr = err
-		return
-	}
-	req := clientset.CoreV1().Pods(k6.NamespacedName().Namespace).GetLogs(podList.Items[0].Name, &corev1.PodLogOptions{
-		Container: "k6",
-	})
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
-	defer cancel()
-
-	podLogs, err := req.Stream(ctx)
-	if err != nil {
-		log.Error(err, "unable to stream logs from the pod")
-		returnErr = err
-		return
-	}
-	defer podLogs.Close()
-
-	buf := new(bytes.Buffer)
-	_, returnErr = io.Copy(buf, podLogs)
-	if err != nil {
-		log.Error(err, "unable to copy logs from the pod")
-		return
-	}
-
-	if returnErr = json.Unmarshal(buf.Bytes(), &inspectOutput); returnErr != nil {
-		// this shouldn't normally happen but if it does, let's log output by default
-		log.Error(returnErr, fmt.Sprintf("unable to marshal: `%s`", buf.String()))
-	}
-
 	ready = true
 	return
 }
@@ -201,7 +148,7 @@ func (r *TestRunReconciler) hostnames(ctx context.Context, log logr.Logger, abor
 		err error
 	)
 
-	sl := &v1.ServiceList{}
+	sl := &corev1.ServiceList{}
 
 	if err = r.List(ctx, sl, opts); err != nil {
 		log.Error(err, "Could not list services")