From d189b9050fada74882b846b1e08ee02a18c81916 Mon Sep 17 00:00:00 2001 From: Baran Dalgic Date: Tue, 2 Nov 2021 09:10:21 +0100 Subject: [PATCH] Surface error results from TaskRun to BuildRun According to ship 0024, this commit adds a mechanism to surface errors from TaskRun steps. The errors are published under `status.failure` in BuildRun iff underlying TaskRun has failed. Adds `pkg/reconciler/buildrun/resources/failures.go`, `pkg/reconciler/buildrun/resources/failures_test.go` that contains logic for extracting error reason and message. Modifies `pkg/reconciler/buildrun/resources/taskrun.go` by adding two new generic results for errors. Adds error location in error results This commit extracts the login in `conditions.go` that searches for the failed pod and its container. It merges the information of the error location (pod and container) into the error results. Fix tests Added docs for failure details --- .gitignore | 1 - deploy/crds/shipwright.io_buildruns.yaml | 18 ++- docs/buildrun.md | 21 ++- docs/buildstrategies.md | 28 +++- pkg/apis/build/v1alpha1/buildrun_types.go | 13 +- .../build/v1alpha1/zz_generated.deepcopy.go | 26 ++++ pkg/reconciler/buildrun/buildrun.go | 1 + .../buildrun/resources/conditions.go | 24 +--- .../buildrun/resources/failure_details.go | 116 ++++++++++++++++ .../resources/failure_details_test.go | 130 ++++++++++++++++++ pkg/reconciler/buildrun/resources/taskrun.go | 2 +- 11 files changed, 354 insertions(+), 26 deletions(-) create mode 100644 pkg/reconciler/buildrun/resources/failure_details.go create mode 100644 pkg/reconciler/buildrun/resources/failure_details_test.go diff --git a/.gitignore b/.gitignore index 2baa7086ea..b45618b7bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ # Temporary Build Files .DS_Store -git shipwright-build-controller build/_output build/_test diff --git a/deploy/crds/shipwright.io_buildruns.yaml b/deploy/crds/shipwright.io_buildruns.yaml index 42db45ba78..789df96db3 100644 --- a/deploy/crds/shipwright.io_buildruns.yaml +++ b/deploy/crds/shipwright.io_buildruns.yaml @@ -459,13 +459,29 @@ spec: type: object type: array failedAt: - description: FailedAt points to the resource where the BuildRun failed + description: 'DEPRECATED: FailedAt points to the resource where the BuildRun failed' properties: container: type: string pod: type: string type: object + failureDetails: + description: FailureDetails contains error details that are collected and surfaced from TaskRun + properties: + location: + description: FailedAt describes the location where the failure happened + properties: + container: + type: string + pod: + type: string + type: object + message: + type: string + reason: + type: string + type: object latestTaskRunRef: description: "LatestTaskRunRef is the name of the TaskRun responsible for executing this BuildRun. \n TODO: This should be called something like \"TaskRunName\"" type: string diff --git a/docs/buildrun.md b/docs/buildrun.md index 1e4d6c952b..143251a6e7 100644 --- a/docs/buildrun.md +++ b/docs/buildrun.md @@ -268,10 +268,29 @@ _Note_: We heavily rely on the Tekton TaskRun [Conditions](https://github.com/te ### Understanding failed BuildRuns -To make it easier for users to understand why did a BuildRun failed, users can infer from the `Status.FailedAt` field, the pod and container where the failure took place. +[DEPRECATED] To make it easier for users to understand why did a BuildRun failed, users can infer from the `Status.FailedAt` field, the pod and container where the failure took place. In addition, the `Status.Conditions` will host under the `Message` field a compacted message containing the `kubectl` command to trigger, in order to retrieve the logs. +Lastly, users can check `Status.FailureDetails` field, which includes the same information available in the `Status.FailedAt` field, +as well as a humanly-readable error message and reason. +The message and reason are only included if the build strategy provides them. + +Example of failed BuildRun: + +```yaml +# [...] +status: + # [...] + failureDetails: + location: + container: step-source-default + pod: baran-build-buildrun-gzmv5-b7wbf-pod-bbpqr + message: The source repository does not exist, or you have insufficient permission + to access it. + reason: GitRemotePrivate +``` + ### Step Results in BuildRun Status After the successful completion of a `BuildRun`, the `.status` field contains the results (`.status.taskResults`) emitted from the `TaskRun` steps generate by the `BuildRun` controller as part of processing the `BuildRun`. These results contain valuable metadata for users, like the _image digest_ or the _commit sha_ of the source code used for building. diff --git a/docs/buildstrategies.md b/docs/buildstrategies.md index a37a45ace5..3e86120ecc 100644 --- a/docs/buildstrategies.md +++ b/docs/buildstrategies.md @@ -296,10 +296,36 @@ status: # [...] output: digest: sha256:07626e3c7fdd28d5328a8d6df8d29cd3da760c7f5e2070b534f9b880ed093a53 - size: "1989004" + size: 1989004 # [...] ``` +Additionally, you can store error details for debugging purposes when a BuildRun fails using your strategy. + +| Result file | Description | +| ---------------------------------- | ----------------------------------------------- | +| `$(results.shp-error-reason.path)` | File to store the error reason. | +| `$(results.shp-error-message.path)` | File to store the error message. | + +Reason is intended to be a one-word, CamelCase classification of the error source, with the first letter capitalized. +Error details are only propagated if the build container terminates with a non-zero exit code. +This information will be available in the `.status.failureDetails` field of the BuildRun. + +```yaml +apiVersion: shipwright.io/v1alpha1 +kind: BuildRun +# [...] +status: + # [...] + failureDetails: + location: + container: step-source-default + pod: baran-build-buildrun-gzmv5-b7wbf-pod-bbpqr + message: The source repository does not exist, or you have insufficient permission + to access it. + reason: GitRemotePrivate +``` + ## Steps Resource Definition All strategies steps can include a definition of resources(_limits and requests_) for CPU, memory and disk. For strategies with more than one step, each step(_container_) could require more resources than others. Strategy admins are free to define the values that they consider the best fit for each step. Also, identical strategies with the same steps that are only different in their name and step resources can be installed on the cluster to allow users to create a build with smaller and larger resource requirements. diff --git a/pkg/apis/build/v1alpha1/buildrun_types.go b/pkg/apis/build/v1alpha1/buildrun_types.go index 2e144c1d90..1842c0e427 100644 --- a/pkg/apis/build/v1alpha1/buildrun_types.go +++ b/pkg/apis/build/v1alpha1/buildrun_types.go @@ -149,9 +149,13 @@ type BuildRunStatus struct { // +optional BuildSpec *BuildSpec `json:"buildSpec,omitempty"` - // FailedAt points to the resource where the BuildRun failed + // DEPRECATED: FailedAt points to the resource where the BuildRun failed // +optional FailedAt *FailedAt `json:"failedAt,omitempty"` + + // FailureDetails contains error details that are collected and surfaced from TaskRun + // +optional + FailureDetails *FailureDetails `json:"failureDetails,omitempty"` } // FailedAt describes the location where the failure happened @@ -160,6 +164,13 @@ type FailedAt struct { Container string `json:"container,omitempty"` } +// FailureDetails describes an error while building images +type FailureDetails struct { + Reason string `json:"reason,omitempty"` + Message string `json:"message,omitempty"` + Location *FailedAt `json:"location,omitempty"` +} + // BuildRef can be used to refer to a specific instance of a Build. type BuildRef struct { // Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names diff --git a/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go index a66bc797f7..c4220b3b0f 100644 --- a/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go @@ -246,6 +246,11 @@ func (in *BuildRunStatus) DeepCopyInto(out *BuildRunStatus) { *out = new(FailedAt) **out = **in } + if in.FailureDetails != nil { + in, out := &in.FailureDetails, &out.FailureDetails + *out = new(FailureDetails) + (*in).DeepCopyInto(*out) + } return } @@ -622,6 +627,27 @@ func (in *FailedAt) DeepCopy() *FailedAt { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FailureDetails) DeepCopyInto(out *FailureDetails) { + *out = *in + if in.Location != nil { + in, out := &in.Location, &out.Location + *out = new(FailedAt) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailureDetails. +func (in *FailureDetails) DeepCopy() *FailureDetails { + if in == nil { + return nil + } + out := new(FailureDetails) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GitSourceResult) DeepCopyInto(out *GitSourceResult) { *out = *in diff --git a/pkg/reconciler/buildrun/buildrun.go b/pkg/reconciler/buildrun/buildrun.go index 8ae3ce6d95..c9573da067 100644 --- a/pkg/reconciler/buildrun/buildrun.go +++ b/pkg/reconciler/buildrun/buildrun.go @@ -308,6 +308,7 @@ func (r *ReconcileBuildRun) Reconcile(request reconcile.Request) (reconcile.Resu return reconcile.Result{}, err } + resources.UpdateBuildRunUsingTaskFailures(ctx, r.client, buildRun, lastTaskRun) taskRunStatus := trCondition.Status // check if we should delete the generated service account by checking the build run spec and that the task run is complete diff --git a/pkg/reconciler/buildrun/resources/conditions.go b/pkg/reconciler/buildrun/resources/conditions.go index d20cd9e1c6..fa090aa655 100644 --- a/pkg/reconciler/buildrun/resources/conditions.go +++ b/pkg/reconciler/buildrun/resources/conditions.go @@ -14,7 +14,6 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1" "github.com/shipwright-io/build/pkg/ctxlog" @@ -70,8 +69,8 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie case v1beta1.TaskRunReasonFailed: if taskRun.Status.CompletionTime != nil { - var pod corev1.Pod - if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil { + pod, failedContainer, err := extractFailedPodAndContainer(ctx, client, taskRun) + if err != nil { // when trying to customize the Condition Message field, ensure the Message cover the case // when a Pod is deleted. // Note: this is an edge case, but not doing this prevent a BuildRun from being marked as Failed @@ -83,26 +82,11 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie return err } + //lint:ignore SA1019 we want to give users some time to adopt to failureDetails buildRun.Status.FailedAt = &buildv1alpha1.FailedAt{Pod: pod.Name} - // Since the container status list is not sorted, as a quick workaround mark all failed containers - var failures = make(map[string]struct{}) - for _, containerStatus := range pod.Status.ContainerStatuses { - if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 { - failures[containerStatus.Name] = struct{}{} - } - } - - // Find the first container that failed - var failedContainer *corev1.Container - for i, container := range pod.Spec.Containers { - if _, has := failures[container.Name]; has { - failedContainer = &pod.Spec.Containers[i] - break - } - } - if failedContainer != nil { + //lint:ignore SA1019 we want to give users some time to adopt to failureDetails buildRun.Status.FailedAt.Container = failedContainer.Name message = fmt.Sprintf("buildrun step %s failed in pod %s, for detailed information: kubectl --namespace %s logs %s --container=%s", failedContainer.Name, diff --git a/pkg/reconciler/buildrun/resources/failure_details.go b/pkg/reconciler/buildrun/resources/failure_details.go new file mode 100644 index 0000000000..e8fcb5e6ba --- /dev/null +++ b/pkg/reconciler/buildrun/resources/failure_details.go @@ -0,0 +1,116 @@ +// Copyright The Shipwright Contributors +// +// SPDX-License-Identifier: Apache-2.0 + +package resources + +import ( + "context" + "encoding/json" + "fmt" + "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "knative.dev/pkg/apis" + "sigs.k8s.io/controller-runtime/pkg/client" + + buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1" +) + +const ( + resultErrorMessage = "error-message" + resultErrorReason = "error-reason" + prefixedResultErrorReason = prefixParamsResultsVolumes + "-" + resultErrorReason + prefixedResultErrorMessage = prefixParamsResultsVolumes + "-" + resultErrorMessage +) + +// UpdateBuildRunUsingTaskFailures is extracting failures from taskRun steps and adding them to buildRun (mutates) +func UpdateBuildRunUsingTaskFailures(ctx context.Context, client client.Client, buildRun *buildv1alpha1.BuildRun, taskRun *v1beta1.TaskRun) { + trCondition := taskRun.Status.GetCondition(apis.ConditionSucceeded) + + // only extract failures when failing condition is present + if trCondition != nil && v1beta1.TaskRunReason(trCondition.Reason) == v1beta1.TaskRunReasonFailed { + buildRun.Status.FailureDetails = extractFailureDetails(ctx, client, taskRun) + } +} + +func extractFailureReasonAndMessage(taskRun *v1beta1.TaskRun) (errorReason string, errorMessage string, hasReasonAndMessage bool) { + for _, step := range taskRun.Status.Steps { + message := step.Terminated.Message + var taskRunResults []v1beta1.PipelineResourceResult + + if err := json.Unmarshal([]byte(message), &taskRunResults); err != nil { + continue + } + + for _, result := range taskRunResults { + if result.Key == prefixedResultErrorMessage { + errorMessage = result.Value + } + + if result.Key == prefixedResultErrorReason { + errorReason = result.Value + } + } + } + + return errorReason, errorMessage, true +} + +func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *v1beta1.TaskRun) (*v1.Pod, *v1.Container, error) { + var pod v1.Pod + if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil { + return nil, nil, err + } + + failures := make(map[string]struct{}) + // Find the names of all containers with failure status + for _, containerStatus := range pod.Status.ContainerStatuses { + if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 { + failures[containerStatus.Name] = struct{}{} + } + } + + // Find the first container that has a failure status + var failedContainer *v1.Container + for i, container := range pod.Spec.Containers { + if _, has := failures[container.Name]; has { + failedContainer = &pod.Spec.Containers[i] + break + } + } + + return &pod, failedContainer, nil +} + +func extractFailureDetails(ctx context.Context, client client.Client, taskRun *v1beta1.TaskRun) (failure *buildv1alpha1.FailureDetails) { + failure = &buildv1alpha1.FailureDetails{} + failure.Location = &buildv1alpha1.FailedAt{Pod: taskRun.Status.PodName} + + if reason, message, hasReasonAndMessage := extractFailureReasonAndMessage(taskRun); hasReasonAndMessage { + failure.Reason = reason + failure.Message = message + } + + pod, container, _ := extractFailedPodAndContainer(ctx, client, taskRun) + + if pod != nil && container != nil { + failure.Location.Pod = pod.Name + failure.Location.Container = container.Name + } + + return failure +} + +func getFailureDetailsTaskSpecResults() []v1beta1.TaskResult { + return []v1beta1.TaskResult{ + { + Name: fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorMessage), + Description: "The error description of the task run", + }, + { + Name: fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorReason), + Description: "The error reason of the task run", + }, + } +} diff --git a/pkg/reconciler/buildrun/resources/failure_details_test.go b/pkg/reconciler/buildrun/resources/failure_details_test.go new file mode 100644 index 0000000000..cc7cf1625a --- /dev/null +++ b/pkg/reconciler/buildrun/resources/failure_details_test.go @@ -0,0 +1,130 @@ +// Copyright The Shipwright Contributors +// +// SPDX-License-Identifier: Apache-2.0 + +package resources + +import ( + "context" + "encoding/json" + "fmt" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + pipelinev1beta1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1" + corev1 "k8s.io/api/core/v1" + "knative.dev/pkg/apis" + + + buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1" + buildfakes "github.com/shipwright-io/build/pkg/controller/fakes" +) + +var _ = Describe("Surfacing errors", func() { + Context("resources.UpdateBuildRunUsingTaskFailures", func() { + ctx := context.Background() + client := &buildfakes.FakeClient{} + + It("surfaces errors to BuildRun in failed TaskRun", func() { + redTaskRun := pipelinev1beta1.TaskRun{} + redTaskRun.Status.Conditions = append(redTaskRun.Status.Conditions, + apis.Condition{Type: apis.ConditionSucceeded, Reason: pipelinev1beta1.TaskRunReasonFailed.String()}) + failedStep := pipelinev1beta1.StepState{} + + errorReasonValue := "PullBaseImageFailed" + errorMessageValue := "Failed to pull the base image." + errorReasonKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorReason) + errorMessageKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorMessage) + + errorReason := pipelinev1beta1.PipelineResourceResult{Key: errorReasonKey, Value: errorReasonValue} + errorMessage := pipelinev1beta1.PipelineResourceResult{Key: errorMessageKey, Value: errorMessageValue} + unrelated := pipelinev1beta1.PipelineResourceResult{Key: "unrelated-resource-key", Value: "Unrelated resource value"} + + message, _ := json.Marshal([]pipelinev1beta1.PipelineResourceResult{errorReason, errorMessage, unrelated}) + + failedStep.Terminated = &corev1.ContainerStateTerminated{Message: string(message)} + + redTaskRun.Status.Steps = append(redTaskRun.Status.Steps, failedStep) + redBuild := buildv1alpha1.BuildRun{} + + UpdateBuildRunUsingTaskFailures(ctx, client, &redBuild, &redTaskRun) + + Expect(redBuild.Status.FailureDetails.Message).To(Equal(errorMessageValue)) + Expect(redBuild.Status.FailureDetails.Reason).To(Equal(errorReasonValue)) + }) + + It("does not surface unrelated Tekton resources if the TaskRun fails", func() { + redTaskRun := pipelinev1beta1.TaskRun{} + redTaskRun.Status.Conditions = append(redTaskRun.Status.Conditions, + apis.Condition{Type: apis.ConditionSucceeded, Reason: pipelinev1beta1.TaskRunReasonFailed.String()}) + failedStep := pipelinev1beta1.StepState{} + + unrelated := pipelinev1beta1.PipelineResourceResult{Key: "unrelated", Value: "unrelated"} + + message, _ := json.Marshal([]pipelinev1beta1.PipelineResourceResult{unrelated}) + + failedStep.Terminated = &corev1.ContainerStateTerminated{Message: string(message)} + + redTaskRun.Status.Steps = append(redTaskRun.Status.Steps, failedStep) + redBuild := buildv1alpha1.BuildRun{} + + UpdateBuildRunUsingTaskFailures(ctx, client, &redBuild, &redTaskRun) + + Expect(redBuild.Status.FailureDetails.Reason).To(BeEmpty()) + Expect(redBuild.Status.FailureDetails.Message).To(BeEmpty()) + }) + + It("does not surface error results if the container terminated without failure", func() { + greenTaskRun := pipelinev1beta1.TaskRun{} + greenTaskRun.Status.Conditions = append(greenTaskRun.Status.Conditions, + apis.Condition{Type: apis.ConditionSucceeded, Reason: pipelinev1beta1. TaskRunReasonSuccessful.String()}) + failedStep := pipelinev1beta1.StepState{} + + errorReasonValue := "PullBaseImageFailed" + errorMessageValue := "Failed to pull the base image." + errorReasonKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorReason) + errorMessageKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorMessage) + + errorReason := pipelinev1beta1.PipelineResourceResult{Key: errorReasonKey, Value: errorReasonValue} + errorMessage := pipelinev1beta1.PipelineResourceResult{Key: errorMessageKey, Value: errorMessageValue} + message, _ := json.Marshal([]pipelinev1beta1.PipelineResourceResult{ errorReason, errorMessage }) + + failedStep.Terminated = &corev1.ContainerStateTerminated{Message: string(message)} + + greenTaskRun.Status.Steps = append(greenTaskRun.Status.Steps, failedStep) + greenBuildRun := buildv1alpha1.BuildRun{} + + UpdateBuildRunUsingTaskFailures(ctx, client, &greenBuildRun, &greenTaskRun) + + Expect(greenBuildRun.Status.FailureDetails).To(BeNil()) + }) + + It("should not surface errors for a successful TaskRun", func() { + greenTaskRun := pipelinev1beta1.TaskRun{} + greenTaskRun.Status.Conditions = append(greenTaskRun.Status.Conditions, apis.Condition{Type: apis.ConditionSucceeded}) + greenBuildRun := buildv1alpha1.BuildRun{} + + UpdateBuildRunUsingTaskFailures(ctx, client, &greenBuildRun, &greenTaskRun) + + Expect(greenBuildRun.Status.FailureDetails).To(BeNil()) + }) + + It("should not surface errors if the TaskRun does not have a Succeeded condition", func() { + unfinishedTaskRun := pipelinev1beta1.TaskRun{} + unfinishedTaskRun.Status.Conditions = append(unfinishedTaskRun.Status.Conditions, apis.Condition{Type: apis.ConditionReady}) + unfinishedBuildRun := buildv1alpha1.BuildRun{} + + UpdateBuildRunUsingTaskFailures(ctx, client, &unfinishedBuildRun, &unfinishedTaskRun) + Expect(unfinishedBuildRun.Status.FailureDetails).To(BeNil()) + }) + + It("should not surface errors if the TaskRun is in progress", func() { + unknownTaskRun := pipelinev1beta1.TaskRun{} + unknownTaskRun.Status.Conditions = append(unknownTaskRun.Status.Conditions, apis.Condition{Type: apis.ConditionSucceeded, Reason: "random"}) + unknownBuildRun := buildv1alpha1.BuildRun{} + + UpdateBuildRunUsingTaskFailures(ctx, client, &unknownBuildRun, &unknownTaskRun) + Expect(unknownBuildRun.Status.FailureDetails).To(BeNil()) + }) + }) +}) diff --git a/pkg/reconciler/buildrun/resources/taskrun.go b/pkg/reconciler/buildrun/resources/taskrun.go index 6591ed6a91..a537b9b7ba 100644 --- a/pkg/reconciler/buildrun/resources/taskrun.go +++ b/pkg/reconciler/buildrun/resources/taskrun.go @@ -113,7 +113,7 @@ func GenerateTaskSpec( }, } - generatedTaskSpec.Results = getTaskSpecResults() + generatedTaskSpec.Results = append(getTaskSpecResults(), getFailureDetailsTaskSpecResults()...) if build.Spec.Builder != nil { InputBuilder := v1beta1.ParamSpec{