diff --git a/.gitignore b/.gitignore
index 2baa7086e..b45618b7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 # Temporary Build Files
 .DS_Store
-git
 shipwright-build-controller
 build/_output
 build/_test
diff --git a/deploy/crds/shipwright.io_buildruns.yaml b/deploy/crds/shipwright.io_buildruns.yaml
index 90dcac771..56342772f 100644
--- a/deploy/crds/shipwright.io_buildruns.yaml
+++ b/deploy/crds/shipwright.io_buildruns.yaml
@@ -558,13 +558,32 @@ spec:
                 type: object
               type: array
             failedAt:
-              description: FailedAt points to the resource where the BuildRun failed
+              description: 'Deprecated: FailedAt points to the resource where the
+                BuildRun failed'
               properties:
                 container:
                   type: string
                 pod:
                   type: string
               type: object
+            failureDetails:
+              description: FailureDetails contains error details that are collected
+                and surfaced from TaskRun
+              properties:
+                location:
+                  description: FailedAt describes the location where the failure
+                    happened
+                  properties:
+                    container:
+                      type: string
+                    pod:
+                      type: string
+                  type: object
+                message:
+                  type: string
+                reason:
+                  type: string
+              type: object
             latestTaskRunRef:
               description: "LatestTaskRunRef is the name of the TaskRun responsible
                 for executing this BuildRun. \n TODO: This should be called something
diff --git a/docs/buildrun.md b/docs/buildrun.md
index b378ee16f..88bd6557e 100644
--- a/docs/buildrun.md
+++ b/docs/buildrun.md
@@ -269,10 +269,29 @@ _Note_: We heavily rely on the Tekton TaskRun [Conditions](https://github.com/te
 
 ### Understanding failed BuildRuns
 
-To make it easier for users to understand why did a BuildRun failed, users can infer from the `Status.FailedAt` field, the pod and container where the failure took place.
+[DEPRECATED] To make it easier for users to understand why a BuildRun failed, users can infer from the `Status.FailedAt` field the pod and container where the failure took place.
 
 In addition, the `Status.Conditions` will host under the `Message` field a compacted message containing the `kubectl` command to trigger, in order to retrieve the logs.
 
+Lastly, users can check the `Status.FailureDetails` field, which includes the same information available in the `Status.FailedAt` field,
+as well as a human-readable error message and reason.
+The message and reason are only included if the build strategy provides them.
+
+Example of a failed BuildRun:
+
+```yaml
+# [...]
+status:
+  # [...]
+  failureDetails:
+    location:
+      container: step-source-default
+      pod: baran-build-buildrun-gzmv5-b7wbf-pod-bbpqr
+    message: The source repository does not exist, or you have insufficient permission
+      to access it.
+    reason: GitRemotePrivate
+```
+
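+For a quick check, the same details can, for example, be read with `kubectl get buildrun <name> --output jsonpath='{.status.failureDetails}'` (`<name>` is a placeholder for your BuildRun name).
+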
 ### Step Results in BuildRun Status
 
 After the successful completion of a `BuildRun`, the `.status` field contains the results (`.status.taskResults`) emitted from the `TaskRun` steps generate by the `BuildRun` controller as part of processing the `BuildRun`. These results contain valuable metadata for users, like the _image digest_ or the _commit sha_ of the source code used for building.
diff --git a/docs/buildstrategies.md b/docs/buildstrategies.md
index a37a45ace..3e86120ec 100644
--- a/docs/buildstrategies.md
+++ b/docs/buildstrategies.md
@@ -296,10 +296,36 @@ status:
   # [...]
   output:
     digest: sha256:07626e3c7fdd28d5328a8d6df8d29cd3da760c7f5e2070b534f9b880ed093a53
-    size: "1989004"
+    size: 1989004
   # [...]
 ```
 
+Additionally, when a BuildRun that uses your strategy fails, you can store error details for debugging purposes, as shown in the table and example below.
+
+| Result file                          | Description                      |
+| ------------------------------------ | -------------------------------- |
+| `$(results.shp-error-reason.path)`   | File to store the error reason.  |
+| `$(results.shp-error-message.path)`  | File to store the error message. |
+
+The reason is intended to be a one-word, CamelCase classification of the error source, with the first letter capitalized.
+Error details are only propagated if the build container terminates with a non-zero exit code.
+This information will be available in the `.status.failureDetails` field of the BuildRun.
+
+```yaml
+apiVersion: shipwright.io/v1alpha1
+kind: BuildRun
+# [...]
+status:
+  # [...]
+  failureDetails:
+    location:
+      container: step-source-default
+      pod: baran-build-buildrun-gzmv5-b7wbf-pod-bbpqr
+    message: The source repository does not exist, or you have insufficient permission
+      to access it.
+    reason: GitRemotePrivate
+```
+
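+As a minimal sketch, a strategy step could populate these result files right before it exits with a non-zero code. The step name, image, and the failing `do-the-build` command below are illustrative placeholders, not part of any shipped strategy:
+
+```yaml
+# [...]
+spec:
+  buildSteps:
+    - name: build-and-push
+      image: docker.io/library/bash:5  # illustrative image
+      command:
+        - bash
+      args:
+        - -c
+        - |
+          set -euo pipefail
+          # do-the-build stands in for the real build invocation
+          if ! do-the-build; then
+            printf 'BuildToolFailed' > "$(results.shp-error-reason.path)"
+            printf 'The build tool exited with an error, check the step logs for details.' > "$(results.shp-error-message.path)"
+            exit 1
+          fi
+```
+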
 ## Steps Resource Definition
 
 All strategies steps can include a definition of resources(_limits and requests_) for CPU, memory and disk. For strategies with more than one step, each step(_container_) could require more resources than others. Strategy admins are free to define the values that they consider the best fit for each step. Also, identical strategies with the same steps that are only different in their name and step resources can be installed on the cluster to allow users to create a build with smaller and larger resource requirements.
diff --git a/pkg/apis/build/v1alpha1/buildrun_types.go b/pkg/apis/build/v1alpha1/buildrun_types.go
index 2e144c1d9..d48aa67ec 100644
--- a/pkg/apis/build/v1alpha1/buildrun_types.go
+++ b/pkg/apis/build/v1alpha1/buildrun_types.go
@@ -149,9 +149,13 @@ type BuildRunStatus struct {
     // +optional
     BuildSpec *BuildSpec `json:"buildSpec,omitempty"`
 
-    // FailedAt points to the resource where the BuildRun failed
+    // Deprecated: FailedAt points to the resource where the BuildRun failed
     // +optional
     FailedAt *FailedAt `json:"failedAt,omitempty"`
+
+    // FailureDetails contains error details that are collected and surfaced from TaskRun
+    // +optional
+    FailureDetails *FailureDetails `json:"failureDetails,omitempty"`
 }
 
 // FailedAt describes the location where the failure happened
@@ -160,6 +164,13 @@ type FailedAt struct {
     Container string `json:"container,omitempty"`
 }
 
+// FailureDetails describes an error while building images
+type FailureDetails struct {
+    Reason   string    `json:"reason,omitempty"`
+    Message  string    `json:"message,omitempty"`
+    Location *FailedAt `json:"location,omitempty"`
+}
+
 // BuildRef can be used to refer to a specific instance of a Build.
 type BuildRef struct {
     // Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names
diff --git a/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go
index a66bc797f..c4220b3b0 100644
--- a/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/build/v1alpha1/zz_generated.deepcopy.go
@@ -246,6 +246,11 @@ func (in *BuildRunStatus) DeepCopyInto(out *BuildRunStatus) {
         *out = new(FailedAt)
         **out = **in
     }
+    if in.FailureDetails != nil {
+        in, out := &in.FailureDetails, &out.FailureDetails
+        *out = new(FailureDetails)
+        (*in).DeepCopyInto(*out)
+    }
     return
 }
 
@@ -622,6 +627,27 @@ func (in *FailedAt) DeepCopy() *FailedAt {
     return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *FailureDetails) DeepCopyInto(out *FailureDetails) {
+    *out = *in
+    if in.Location != nil {
+        in, out := &in.Location, &out.Location
+        *out = new(FailedAt)
+        **out = **in
+    }
+    return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailureDetails.
+func (in *FailureDetails) DeepCopy() *FailureDetails {
+    if in == nil {
+        return nil
+    }
+    out := new(FailureDetails)
+    in.DeepCopyInto(out)
+    return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *GitSourceResult) DeepCopyInto(out *GitSourceResult) {
     *out = *in
diff --git a/pkg/reconciler/buildrun/buildrun.go b/pkg/reconciler/buildrun/buildrun.go
index 71454e6c3..79c1d7295 100644
--- a/pkg/reconciler/buildrun/buildrun.go
+++ b/pkg/reconciler/buildrun/buildrun.go
@@ -306,6 +306,7 @@ func (r *ReconcileBuildRun) Reconcile(ctx context.Context, request reconcile.Req
                 return reconcile.Result{}, err
             }
 
+            resources.UpdateBuildRunUsingTaskFailures(ctx, r.client, buildRun, lastTaskRun)
             taskRunStatus := trCondition.Status
 
             // check if we should delete the generated service account by checking the build run spec and that the task run is complete
diff --git a/pkg/reconciler/buildrun/resources/conditions.go b/pkg/reconciler/buildrun/resources/conditions.go
index d20cd9e1c..fa090aa65 100644
--- a/pkg/reconciler/buildrun/resources/conditions.go
+++ b/pkg/reconciler/buildrun/resources/conditions.go
@@ -14,7 +14,6 @@ import (
     corev1 "k8s.io/api/core/v1"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/apimachinery/pkg/types"
 
     buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1"
     "github.com/shipwright-io/build/pkg/ctxlog"
@@ -70,8 +69,8 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie
 
     case v1beta1.TaskRunReasonFailed:
         if taskRun.Status.CompletionTime != nil {
-            var pod corev1.Pod
-            if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil {
+            pod, failedContainer, err := extractFailedPodAndContainer(ctx, client, taskRun)
+            if err != nil {
                 // when trying to customize the Condition Message field, ensure the Message cover the case
                 // when a Pod is deleted.
                 // Note: this is an edge case, but not doing this prevent a BuildRun from being marked as Failed
@@ -83,26 +82,11 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie
                 return err
             }
 
+            //lint:ignore SA1019 we want to give users some time to adapt to failureDetails
            buildRun.Status.FailedAt = &buildv1alpha1.FailedAt{Pod: pod.Name}
 
-            // Since the container status list is not sorted, as a quick workaround mark all failed containers
-            var failures = make(map[string]struct{})
-            for _, containerStatus := range pod.Status.ContainerStatuses {
-                if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
-                    failures[containerStatus.Name] = struct{}{}
-                }
-            }
-
-            // Find the first container that failed
-            var failedContainer *corev1.Container
-            for i, container := range pod.Spec.Containers {
-                if _, has := failures[container.Name]; has {
-                    failedContainer = &pod.Spec.Containers[i]
-                    break
-                }
-            }
-
             if failedContainer != nil {
+                //lint:ignore SA1019 we want to give users some time to adapt to failureDetails
                 buildRun.Status.FailedAt.Container = failedContainer.Name
                 message = fmt.Sprintf("buildrun step %s failed in pod %s, for detailed information: kubectl --namespace %s logs %s --container=%s",
                     failedContainer.Name,
diff --git a/pkg/reconciler/buildrun/resources/failure_details.go b/pkg/reconciler/buildrun/resources/failure_details.go
new file mode 100644
index 000000000..e8fcb5e6b
--- /dev/null
+++ b/pkg/reconciler/buildrun/resources/failure_details.go
@@ -0,0 +1,116 @@
+// Copyright The Shipwright Contributors
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package resources
+
+import (
+    "context"
+    "encoding/json"
+    "fmt"
+
+    "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
+    v1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/types"
+    "knative.dev/pkg/apis"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+
+    buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1"
+)
+
+const (
+    resultErrorMessage = "error-message"
+    resultErrorReason  = "error-reason"
+
+    prefixedResultErrorReason  = prefixParamsResultsVolumes + "-" + resultErrorReason
+    prefixedResultErrorMessage = prefixParamsResultsVolumes + "-" + resultErrorMessage
+)
+
+// UpdateBuildRunUsingTaskFailures extracts failure details from the TaskRun steps and adds them to the BuildRun (mutates buildRun)
+func UpdateBuildRunUsingTaskFailures(ctx context.Context, client client.Client, buildRun *buildv1alpha1.BuildRun, taskRun *v1beta1.TaskRun) {
+    trCondition := taskRun.Status.GetCondition(apis.ConditionSucceeded)
+
+    // only extract failures when the failed condition is present
+    if trCondition != nil && v1beta1.TaskRunReason(trCondition.Reason) == v1beta1.TaskRunReasonFailed {
+        buildRun.Status.FailureDetails = extractFailureDetails(ctx, client, taskRun)
+    }
+}
+
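+// extractFailureReasonAndMessage reads the strategy-provided error reason and message, if any, from the
+// Tekton results that each step serializes as JSON into its termination message.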
+func extractFailureReasonAndMessage(taskRun *v1beta1.TaskRun) (errorReason string, errorMessage string, hasReasonAndMessage bool) {
+    for _, step := range taskRun.Status.Steps {
+        if step.Terminated == nil {
+            // the step has no termination message to inspect
+            continue
+        }
+
+        message := step.Terminated.Message
+        var taskRunResults []v1beta1.PipelineResourceResult
+
+        if err := json.Unmarshal([]byte(message), &taskRunResults); err != nil {
+            continue
+        }
+
+        for _, result := range taskRunResults {
+            if result.Key == prefixedResultErrorMessage {
+                errorMessage = result.Value
+            }
+
+            if result.Key == prefixedResultErrorReason {
+                errorReason = result.Value
+            }
+        }
+    }
+
+    return errorReason, errorMessage, true
+}
+
+func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *v1beta1.TaskRun) (*v1.Pod, *v1.Container, error) {
+    var pod v1.Pod
+    if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil {
+        return nil, nil, err
+    }
+
+    failures := make(map[string]struct{})
+    // Find the names of all containers with a failure status
+    for _, containerStatus := range pod.Status.ContainerStatuses {
+        if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
+            failures[containerStatus.Name] = struct{}{}
+        }
+    }
+
+    // Find the first container that has a failure status
+    var failedContainer *v1.Container
+    for i, container := range pod.Spec.Containers {
+        if _, has := failures[container.Name]; has {
+            failedContainer = &pod.Spec.Containers[i]
+            break
+        }
+    }
+
+    return &pod, failedContainer, nil
+}
+
+func extractFailureDetails(ctx context.Context, client client.Client, taskRun *v1beta1.TaskRun) (failure *buildv1alpha1.FailureDetails) {
+    failure = &buildv1alpha1.FailureDetails{}
+    failure.Location = &buildv1alpha1.FailedAt{Pod: taskRun.Status.PodName}
+
+    if reason, message, hasReasonAndMessage := extractFailureReasonAndMessage(taskRun); hasReasonAndMessage {
+        failure.Reason = reason
+        failure.Message = message
+    }
+
+    pod, container, _ := extractFailedPodAndContainer(ctx, client, taskRun)
+
+    if pod != nil && container != nil {
+        failure.Location.Pod = pod.Name
+        failure.Location.Container = container.Name
+    }
+
+    return failure
+}
+
+// getFailureDetailsTaskSpecResults returns the Tekton task results in which strategy steps can store an error reason and message
+func getFailureDetailsTaskSpecResults() []v1beta1.TaskResult {
+    return []v1beta1.TaskResult{
+        {
+            Name:        fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorMessage),
+            Description: "The error description of the task run",
+        },
+        {
+            Name:        fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorReason),
+            Description: "The error reason of the task run",
+        },
+    }
+}
diff --git a/pkg/reconciler/buildrun/resources/failure_details_test.go b/pkg/reconciler/buildrun/resources/failure_details_test.go
new file mode 100644
index 000000000..cc7cf1625
--- /dev/null
+++ b/pkg/reconciler/buildrun/resources/failure_details_test.go
@@ -0,0 +1,130 @@
+// Copyright The Shipwright Contributors
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package resources
+
+import (
+    "context"
+    "encoding/json"
+    "fmt"
+
+    . "github.com/onsi/ginkgo"
+    . "github.com/onsi/gomega"
+    pipelinev1beta1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
+    corev1 "k8s.io/api/core/v1"
+    "knative.dev/pkg/apis"
+
+    buildv1alpha1 "github.com/shipwright-io/build/pkg/apis/build/v1alpha1"
+    buildfakes "github.com/shipwright-io/build/pkg/controller/fakes"
+)
+
+var _ = Describe("Surfacing errors", func() {
+    Context("resources.UpdateBuildRunUsingTaskFailures", func() {
+        ctx := context.Background()
+        client := &buildfakes.FakeClient{}
+
+        It("surfaces errors to BuildRun in failed TaskRun", func() {
+            redTaskRun := pipelinev1beta1.TaskRun{}
+            redTaskRun.Status.Conditions = append(redTaskRun.Status.Conditions,
+                apis.Condition{Type: apis.ConditionSucceeded, Reason: pipelinev1beta1.TaskRunReasonFailed.String()})
+            failedStep := pipelinev1beta1.StepState{}
+
+            errorReasonValue := "PullBaseImageFailed"
+            errorMessageValue := "Failed to pull the base image."
+            errorReasonKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorReason)
+            errorMessageKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorMessage)
+
+            errorReason := pipelinev1beta1.PipelineResourceResult{Key: errorReasonKey, Value: errorReasonValue}
+            errorMessage := pipelinev1beta1.PipelineResourceResult{Key: errorMessageKey, Value: errorMessageValue}
+            unrelated := pipelinev1beta1.PipelineResourceResult{Key: "unrelated-resource-key", Value: "Unrelated resource value"}
+
+            message, _ := json.Marshal([]pipelinev1beta1.PipelineResourceResult{errorReason, errorMessage, unrelated})
+
+            failedStep.Terminated = &corev1.ContainerStateTerminated{Message: string(message)}
+
+            redTaskRun.Status.Steps = append(redTaskRun.Status.Steps, failedStep)
+            redBuild := buildv1alpha1.BuildRun{}
+
+            UpdateBuildRunUsingTaskFailures(ctx, client, &redBuild, &redTaskRun)
+
+            Expect(redBuild.Status.FailureDetails.Message).To(Equal(errorMessageValue))
+            Expect(redBuild.Status.FailureDetails.Reason).To(Equal(errorReasonValue))
+        })
+
+        It("does not surface unrelated Tekton resources if the TaskRun fails", func() {
+            redTaskRun := pipelinev1beta1.TaskRun{}
+            redTaskRun.Status.Conditions = append(redTaskRun.Status.Conditions,
+                apis.Condition{Type: apis.ConditionSucceeded, Reason: pipelinev1beta1.TaskRunReasonFailed.String()})
+            failedStep := pipelinev1beta1.StepState{}
+
+            unrelated := pipelinev1beta1.PipelineResourceResult{Key: "unrelated", Value: "unrelated"}
+
+            message, _ := json.Marshal([]pipelinev1beta1.PipelineResourceResult{unrelated})
+
+            failedStep.Terminated = &corev1.ContainerStateTerminated{Message: string(message)}
+
+            redTaskRun.Status.Steps = append(redTaskRun.Status.Steps, failedStep)
+            redBuild := buildv1alpha1.BuildRun{}
+
+            UpdateBuildRunUsingTaskFailures(ctx, client, &redBuild, &redTaskRun)
+
+            Expect(redBuild.Status.FailureDetails.Reason).To(BeEmpty())
+            Expect(redBuild.Status.FailureDetails.Message).To(BeEmpty())
+        })
+
+        It("does not surface error results if the container terminated without failure", func() {
+            greenTaskRun := pipelinev1beta1.TaskRun{}
+            greenTaskRun.Status.Conditions = append(greenTaskRun.Status.Conditions,
+                apis.Condition{Type: apis.ConditionSucceeded, Reason: pipelinev1beta1.TaskRunReasonSuccessful.String()})
+            failedStep := pipelinev1beta1.StepState{}
+
+            errorReasonValue := "PullBaseImageFailed"
+            errorMessageValue := "Failed to pull the base image."
+            errorReasonKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorReason)
+            errorMessageKey := fmt.Sprintf("%s-%s", prefixParamsResultsVolumes, resultErrorMessage)
+
+            errorReason := pipelinev1beta1.PipelineResourceResult{Key: errorReasonKey, Value: errorReasonValue}
+            errorMessage := pipelinev1beta1.PipelineResourceResult{Key: errorMessageKey, Value: errorMessageValue}
+            message, _ := json.Marshal([]pipelinev1beta1.PipelineResourceResult{errorReason, errorMessage})
+
+            failedStep.Terminated = &corev1.ContainerStateTerminated{Message: string(message)}
+
+            greenTaskRun.Status.Steps = append(greenTaskRun.Status.Steps, failedStep)
+            greenBuildRun := buildv1alpha1.BuildRun{}
+
+            UpdateBuildRunUsingTaskFailures(ctx, client, &greenBuildRun, &greenTaskRun)
+
+            Expect(greenBuildRun.Status.FailureDetails).To(BeNil())
+        })
+
+        It("should not surface errors for a successful TaskRun", func() {
+            greenTaskRun := pipelinev1beta1.TaskRun{}
+            greenTaskRun.Status.Conditions = append(greenTaskRun.Status.Conditions, apis.Condition{Type: apis.ConditionSucceeded})
+            greenBuildRun := buildv1alpha1.BuildRun{}
+
+            UpdateBuildRunUsingTaskFailures(ctx, client, &greenBuildRun, &greenTaskRun)
+
+            Expect(greenBuildRun.Status.FailureDetails).To(BeNil())
+        })
+
+        It("should not surface errors if the TaskRun does not have a Succeeded condition", func() {
+            unfinishedTaskRun := pipelinev1beta1.TaskRun{}
+            unfinishedTaskRun.Status.Conditions = append(unfinishedTaskRun.Status.Conditions, apis.Condition{Type: apis.ConditionReady})
+            unfinishedBuildRun := buildv1alpha1.BuildRun{}
+
+            UpdateBuildRunUsingTaskFailures(ctx, client, &unfinishedBuildRun, &unfinishedTaskRun)
+
+            Expect(unfinishedBuildRun.Status.FailureDetails).To(BeNil())
+        })
+
+        It("should not surface errors if the TaskRun is in progress", func() {
+            unknownTaskRun := pipelinev1beta1.TaskRun{}
+            unknownTaskRun.Status.Conditions = append(unknownTaskRun.Status.Conditions, apis.Condition{Type: apis.ConditionSucceeded, Reason: "random"})
+            unknownBuildRun := buildv1alpha1.BuildRun{}
+
+            UpdateBuildRunUsingTaskFailures(ctx, client, &unknownBuildRun, &unknownTaskRun)
+
+            Expect(unknownBuildRun.Status.FailureDetails).To(BeNil())
+        })
+    })
+})
diff --git a/pkg/reconciler/buildrun/resources/taskrun.go b/pkg/reconciler/buildrun/resources/taskrun.go
index 6591ed6a9..a537b9b7b 100644
--- a/pkg/reconciler/buildrun/resources/taskrun.go
+++ b/pkg/reconciler/buildrun/resources/taskrun.go
@@ -113,7 +113,7 @@ func GenerateTaskSpec(
         },
     }
 
-    generatedTaskSpec.Results = getTaskSpecResults()
+    generatedTaskSpec.Results = append(getTaskSpecResults(), getFailureDetailsTaskSpecResults()...)
 
     if build.Spec.Builder != nil {
         InputBuilder := v1beta1.ParamSpec{