Surface error results from TaskRun to BuildRun

According to ship 0024, this commit adds a mechanism to surface errors from TaskRun steps. The errors are published under `status.failure` in BuildRun iff underlying TaskRun has failed.
shipwright-io · Dec 14, 2021 · 3a2e93d · 3a2e93d
1 parent 2147597
commit 3a2e93d
Show file tree

Hide file tree

Showing 11 changed files with 357 additions and 26 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,5 @@
 # Temporary Build Files
 .DS_Store
-git
 shipwright-build-controller
 build/_output
 build/_test

diff --git a/deploy/crds/shipwright.io_buildruns.yaml b/deploy/crds/shipwright.io_buildruns.yaml
@@ -558,13 +558,32 @@ spec:
                   type: object
                 type: array
               failedAt:
-                description: FailedAt points to the resource where the BuildRun failed
+                description: 'Deprecated: FailedAt points to the resource where the
+                  BuildRun failed'
                 properties:
                   container:
                     type: string
                   pod:
                     type: string
                 type: object
+              failureDetails:
+                description: FailureDetails contains error details that are collected
+                  and surfaced from TaskRun
+                properties:
+                  location:
+                    description: FailedAt describes the location where the failure
+                      happened
+                    properties:
+                      container:
+                        type: string
+                      pod:
+                        type: string
+                    type: object
+                  message:
+                    type: string
+                  reason:
+                    type: string
+                type: object
               latestTaskRunRef:
                 description: "LatestTaskRunRef is the name of the TaskRun responsible
                   for executing this BuildRun. \n TODO: This should be called something

diff --git a/docs/buildrun.md b/docs/buildrun.md
@@ -269,10 +269,29 @@ _Note_: We heavily rely on the Tekton TaskRun [Conditions](https://github.com/te
 
 ### Understanding failed BuildRuns
 
-To make it easier for users to understand why did a BuildRun failed, users can infer from the `Status.FailedAt` field, the pod and container where the failure took place.
+[DEPRECATED] To make it easier for users to understand why did a BuildRun failed, users can infer from the `Status.FailedAt` field, the pod and container where the failure took place.
 
 In addition, the `Status.Conditions` will host under the `Message` field a compacted message containing the `kubectl` command to trigger, in order to retrieve the logs.
 
+Lastly, users can check `Status.FailureDetails` field, which includes the same information available in the `Status.FailedAt` field, 
+as well as a humanly-readable error message and reason.
+The message and reason are only included if the build strategy provides them.
+
+Example of failed BuildRun:
+
+```yaml 
+# [...]
+status:
+  # [...]
+  failureDetails:
+    location:
+      container: step-source-default
+      pod: baran-build-buildrun-gzmv5-b7wbf-pod-bbpqr
+    message: The source repository does not exist, or you have insufficient permission
+      to access it.
+    reason: GitRemotePrivate
+```
+
 ### Step Results in BuildRun Status
 
 After the successful completion of a `BuildRun`, the `.status` field contains the results (`.status.taskResults`) emitted from the `TaskRun` steps generate by the `BuildRun` controller as part of processing the `BuildRun`. These results contain valuable metadata for users, like the _image digest_ or the _commit sha_ of the source code used for building.

diff --git a/docs/buildstrategies.md b/docs/buildstrategies.md
@@ -296,10 +296,36 @@ status:
  # [...]
   output:
     digest: sha256:07626e3c7fdd28d5328a8d6df8d29cd3da760c7f5e2070b534f9b880ed093a53
-    size: "1989004"
+    size: 1989004
   # [...]
 ```
 
+Additionally, you can store error details for debugging purposes when a BuildRun fails using your strategy.
+
+| Result file                        | Description                                     |
+| ---------------------------------- | ----------------------------------------------- |
+| `$(results.shp-error-reason.path)` | File to store the error reason.          |
+| `$(results.shp-error-message.path)` | File to store the error message. |
+
+Reason is intended to be a one-word, CamelCase classification of the error source, with the first letter capitalized.
+Error details are only propagated if the build container terminates with a non-zero exit code.
+This information will be available in the `.status.failureDetails` field of the BuildRun. 
+
+```yaml 
+apiVersion: shipwright.io/v1alpha1
+kind: BuildRun
+# [...]
+status:
+  # [...]
+  failureDetails:
+    location:
+      container: step-source-default
+      pod: baran-build-buildrun-gzmv5-b7wbf-pod-bbpqr
+    message: The source repository does not exist, or you have insufficient permission
+      to access it.
+    reason: GitRemotePrivate
+```
+
 ## Steps Resource Definition
 
 All strategies steps can include a definition of resources(_limits and requests_) for CPU, memory and disk. For strategies with more than one step, each step(_container_) could require more resources than others. Strategy admins are free to define the values that they consider the best fit for each step. Also, identical strategies with the same steps that are only different in their name and step resources can be installed on the cluster to allow users to create a build with smaller and larger resource requirements.

diff --git a/pkg/apis/build/v1alpha1/buildrun_types.go b/pkg/apis/build/v1alpha1/buildrun_types.go
@@ -149,9 +149,13 @@ type BuildRunStatus struct {
 	// +optional
 	BuildSpec *BuildSpec `json:"buildSpec,omitempty"`
 
-	// FailedAt points to the resource where the BuildRun failed
+	// Deprecated: FailedAt points to the resource where the BuildRun failed
 	// +optional
 	FailedAt *FailedAt `json:"failedAt,omitempty"`
+
+	// FailureDetails contains error details that are collected and surfaced from TaskRun
+	// +optional
+	FailureDetails *FailureDetails `json:"failureDetails,omitempty"`
 }
 
 // FailedAt describes the location where the failure happened
@@ -160,6 +164,13 @@ type FailedAt struct {
 	Container string `json:"container,omitempty"`
 }
 
+// FailureDetails describes an error while building images
+type FailureDetails struct {
+	Reason   string    `json:"reason,omitempty"`
+	Message  string    `json:"message,omitempty"`
+	Location *FailedAt `json:"location,omitempty"`
+}
+
 // BuildRef can be used to refer to a specific instance of a Build.
 type BuildRef struct {
 	// Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names

diff --git a/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go
diff --git a/pkg/reconciler/buildrun/buildrun.go b/pkg/reconciler/buildrun/buildrun.go
@@ -306,6 +306,7 @@ func (r *ReconcileBuildRun) Reconcile(ctx context.Context, request reconcile.Req
 				return reconcile.Result{}, err
 			}
 
+			resources.UpdateBuildRunUsingTaskFailures(ctx, r.client, buildRun, lastTaskRun)
 			taskRunStatus := trCondition.Status
 
 			// check if we should delete the generated service account by checking the build run spec and that the task run is complete

diff --git a/pkg/reconciler/buildrun/resources/conditions.go b/pkg/reconciler/buildrun/resources/conditions.go
@@ -14,7 +14,6 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/types"
 
 	buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1"
 	"github.com/shipwright-io/build/pkg/ctxlog"
@@ -70,8 +69,8 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie
 
 	case v1beta1.TaskRunReasonFailed:
 		if taskRun.Status.CompletionTime != nil {
-			var pod corev1.Pod
-			if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil {
+			pod, failedContainer, err := extractFailedPodAndContainer(ctx, client, taskRun)
+			if err != nil {
 				// when trying to customize the Condition Message field, ensure the Message cover the case
 				// when a Pod is deleted.
 				// Note: this is an edge case, but not doing this prevent a BuildRun from being marked as Failed
@@ -83,26 +82,11 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie
 				return err
 			}
 
+			//lint:ignore SA1019 we want to give users some time to adopt to failureDetails
 			buildRun.Status.FailedAt = &buildv1alpha1.FailedAt{Pod: pod.Name}
 
-			// Since the container status list is not sorted, as a quick workaround mark all failed containers
-			var failures = make(map[string]struct{})
-			for _, containerStatus := range pod.Status.ContainerStatuses {
-				if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
-					failures[containerStatus.Name] = struct{}{}
-				}
-			}
-
-			// Find the first container that failed
-			var failedContainer *corev1.Container
-			for i, container := range pod.Spec.Containers {
-				if _, has := failures[container.Name]; has {
-					failedContainer = &pod.Spec.Containers[i]
-					break
-				}
-			}
-
 			if failedContainer != nil {
+			  	//lint:ignore SA1019 we want to give users some time to adopt to failureDetails
 				buildRun.Status.FailedAt.Container = failedContainer.Name
 				message = fmt.Sprintf("buildrun step %s failed in pod %s, for detailed information: kubectl --namespace %s logs %s --container=%s",
 					failedContainer.Name,

diff --git a/pkg/reconciler/buildrun/resources/failure_details.go b/pkg/reconciler/buildrun/resources/failure_details.go
@@ -0,0 +1,116 @@
+// Copyright The Shipwright Contributors
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package resources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"knative.dev/pkg/apis"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1"
+)
+
+const (
+	resultErrorMessage         = "error-message"
+	resultErrorReason          = "error-reason"
+	prefixedResultErrorReason  = prefixParamsResultsVolumes + "-" + resultErrorReason
+	prefixedResultErrorMessage = prefixParamsResultsVolumes + "-" + resultErrorMessage
+)
+
+// UpdateBuildRunUsingTaskFailures is extracting failures from taskRun steps and adding them to buildRun (mutates)
+func UpdateBuildRunUsingTaskFailures(ctx context.Context, client client.Client, buildRun *buildv1alpha1.BuildRun, taskRun *v1beta1.TaskRun) {
+	trCondition := taskRun.Status.GetCondition(apis.ConditionSucceeded)
+
+	// only extract failures when failing condition is present
+	if trCondition != nil && v1beta1.TaskRunReason(trCondition.Reason) == v1beta1.TaskRunReasonFailed {
+		buildRun.Status.FailureDetails = extractFailureDetails(ctx, client, taskRun)
+	}
+}
+
+func extractFailureReasonAndMessage(taskRun *v1beta1.TaskRun) (errorReason string, errorMessage string, hasReasonAndMessage bool) {
+	for _, step := range taskRun.Status.Steps {
+		message := step.Terminated.Message
+		var taskRunResults []v1beta1.PipelineResourceResult
+
+		if err := json.Unmarshal([]byte(message), &taskRunResults); err != nil {
+			continue
+		}
+
+		for _, result := range taskRunResults {
+			if result.Key == prefixedResultErrorMessage {
+				errorMessage = result.Value
+			}
+
+			if result.Key == prefixedResultErrorReason {
+				errorReason = result.Value
+			}
+		}
+	}
+
+	return errorReason, errorMessage, true
+}
+
+func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *v1beta1.TaskRun) (*v1.Pod, *v1.Container, error) {
+	var pod v1.Pod
+	if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil {
+		return nil, nil, err
+	}
+
+	failures := make(map[string]struct{})
+	// Find the names of all containers with failure status
+	for _, containerStatus := range pod.Status.ContainerStatuses {
+		if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
+			failures[containerStatus.Name] = struct{}{}
+		}
+	}
+
+	// Find the first container that has a failure status
+	var failedContainer *v1.Container
+	for i, container := range pod.Spec.Containers {
+		if _, has := failures[container.Name]; has {
+			failedContainer = &pod.Spec.Containers[i]
+			break
+		}
+	}
+
+	return &pod, failedContainer, nil
+}
+
+func extractFailureDetails(ctx context.Context, client client.Client, taskRun *v1beta1.TaskRun) (failure *buildv1alpha1.FailureDetails) {
+	failure = &buildv1alpha1.FailureDetails{}
+	failure.Location = &buildv1alpha1.FailedAt{Pod: taskRun.Status.PodName}
+
+	if reason, message, hasReasonAndMessage := extractFailureReasonAndMessage(taskRun); hasReasonAndMessage {
+		failure.Reason = reason
+		failure.Message = message
+	}
+
+	pod, container, _ := extractFailedPodAndContainer(ctx, client, taskRun)
+
+	if pod != nil && container != nil {
+		failure.Location.Pod = pod.Name
+		failure.Location.Container = container.Name
+	}
+
+	return failure
+}
+
+func getFailureDetailsTaskSpecResults() []v1beta1.TaskResult {
+	return []v1beta1.TaskResult{
+		{
+			Name:        fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorMessage),
+			Description: "The error description of the task run",
+		},
+		{
+			Name:        fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorReason),
+			Description: "The error reason of the task run",
+		},
+	}
+}