From 0b8eafd82827a11ddd51d4c0e03174e2eb75cb88 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 2 Mar 2023 13:51:09 -0500 Subject: [PATCH] handle `FSM.Apply` errors in `raftApply` (#16287) The signature of the `raftApply` function requires that the caller unwrap the first returned value (the response from `FSM.Apply`) to see if it's an error. This puts the burden on the caller to remember to check two different places for errors, and we've done so inconsistently. Update `raftApply` to do the unwrapping for us and return any `FSM.Apply` error as the error value. Similar work was done in Consul in https://github.com/hashicorp/consul/pull/9991. This eliminates some boilerplate and surfaces a few minor bugs in the process: * job deregistrations of already-GC'd jobs were still emitting evals * reconcile job summaries did not return scheduler errors * node updates did not report errors associated with inconsistent service discovery or CSI plugin states Note that although _most_ of the `FSM.Apply` functions return only errors (which makes it tempting to remove the first return value entirely), there are a few that return `bool` for some reason and Variables relies on the response value for proper CAS checking. 
--- .changelog/16287.txt | 11 ++++++ api/jobs_test.go | 2 +- api/nodes_test.go | 1 + nomad/csi_endpoint.go | 38 ++++-------------- nomad/drainer_shims.go | 19 ++------- nomad/job_endpoint.go | 23 +++++------ nomad/job_endpoint_test.go | 54 ++++---------------------- nomad/namespace_endpoint.go | 14 +------ nomad/operator_endpoint.go | 6 +-- nomad/periodic.go | 5 +-- nomad/rpc.go | 13 +++++-- nomad/service_registration_endpoint.go | 14 +------ 12 files changed, 56 insertions(+), 144 deletions(-) create mode 100644 .changelog/16287.txt diff --git a/.changelog/16287.txt b/.changelog/16287.txt new file mode 100644 index 000000000000..5496bb2b1db6 --- /dev/null +++ b/.changelog/16287.txt @@ -0,0 +1,11 @@ +```release-note:bug +server: Fixed a bug where deregistering a job that was already garbage collected would create a new evaluation +``` + +```release-note:bug +server: Fixed a bug where the `system reconcile summaries` command and API would not return any scheduler-related errors +``` + +```release-note:bug +server: Fixed a bug where node updates that produced errors from service discovery or CSI plugin updates were not logged +``` diff --git a/api/jobs_test.go b/api/jobs_test.go index 860928f44056..b3beda480105 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -1552,7 +1552,7 @@ func TestJobs_Deregister(t *testing.T) { must.NoError(t, err) assertWriteMeta(t, wm) - // Attempting delete on non-existing job returns an error + // Attempting delete on non-existing job does not return an error _, _, err = jobs.Deregister("nope", false, nil) must.NoError(t, err) diff --git a/api/nodes_test.go b/api/nodes_test.go index c8a658a69e08..2d1b4101eb5e 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -13,6 +13,7 @@ import ( ) func queryNodeList(t *testing.T, nodes *Nodes) ([]*NodeListStub, *QueryMeta) { + t.Helper() var ( nodeListStub []*NodeListStub queryMeta *QueryMeta diff --git a/nomad/csi_endpoint.go b/nomad/csi_endpoint.go index 36b2c9bf7aa5..7b05de670e53 
100644 --- a/nomad/csi_endpoint.go +++ b/nomad/csi_endpoint.go @@ -378,14 +378,11 @@ func (v *CSIVolume) Register(args *structs.CSIVolumeRegisterRequest, reply *stru } } - resp, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, args) + _, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "register") return err } - if respErr, ok := resp.(error); ok { - return respErr - } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) @@ -415,14 +412,11 @@ func (v *CSIVolume) Deregister(args *structs.CSIVolumeDeregisterRequest, reply * return fmt.Errorf("missing volume IDs") } - resp, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, args) + _, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "deregister") return err } - if respErr, ok := resp.(error); ok { - return respErr - } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) @@ -470,14 +464,11 @@ func (v *CSIVolume) Claim(args *structs.CSIVolumeClaimRequest, reply *structs.CS args.NodeID = alloc.NodeID } - resp, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, args) + _, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "claim") return err } - if respErr, ok := resp.(error); ok { - return respErr - } if isNewClaim { // if this is a new claim, add a Volume and PublishContext from the @@ -937,14 +928,11 @@ func (v *CSIVolume) checkpointClaim(vol *structs.CSIVolume, claim *structs.CSIVo Namespace: vol.Namespace, }, } - resp, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, req) + _, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, req) if err != nil { v.logger.Error("csi raft apply failed", "error", err) return err } - if 
respErr, ok := resp.(error); ok { - return respErr - } vol.ModifyIndex = index return nil } @@ -1025,13 +1013,10 @@ func (v *CSIVolume) Create(args *structs.CSIVolumeCreateRequest, reply *structs. } } - resp, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, regArgs) + _, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, regArgs) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "register") - return err - } - if respErr, ok := resp.(error); ok { - multierror.Append(&mErr, respErr) + multierror.Append(&mErr, err) } err = mErr.ErrorOrNil() @@ -1120,14 +1105,11 @@ func (v *CSIVolume) Delete(args *structs.CSIVolumeDeleteRequest, reply *structs. VolumeIDs: args.VolumeIDs, WriteRequest: args.WriteRequest, } - resp, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, deregArgs) + _, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, deregArgs) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "deregister") return err } - if respErr, ok := resp.(error); ok { - return respErr - } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) @@ -1570,16 +1552,12 @@ func (v *CSIPlugin) Delete(args *structs.CSIPluginDeleteRequest, reply *structs. 
return fmt.Errorf("missing plugin ID") } - resp, index, err := v.srv.raftApply(structs.CSIPluginDeleteRequestType, args) + _, index, err := v.srv.raftApply(structs.CSIPluginDeleteRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "delete") return err } - if respErr, ok := resp.(error); ok { - return respErr - } - reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) return nil diff --git a/nomad/drainer_shims.go b/nomad/drainer_shims.go index 1df9b9aa47e6..2186d606862a 100644 --- a/nomad/drainer_shims.go +++ b/nomad/drainer_shims.go @@ -28,8 +28,8 @@ func (d drainerShim) NodesDrainComplete(nodes []string, event *structs.NodeEvent } } - resp, index, err := d.s.raftApply(structs.BatchNodeUpdateDrainRequestType, args) - return d.convertApplyErrors(resp, index, err) + _, index, err := d.s.raftApply(structs.BatchNodeUpdateDrainRequestType, args) + return index, err } func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) { @@ -38,19 +38,6 @@ func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.Des Evals: evals, WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, } - resp, index, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) - return d.convertApplyErrors(resp, index, err) -} - -// convertApplyErrors parses the results of a raftApply and returns the index at -// which it was applied and any error that occurred. Raft Apply returns two -// separate errors, Raft library errors and user returned errors from the FSM. -// This helper, joins the errors by inspecting the applyResponse for an error. 
-func (d drainerShim) convertApplyErrors(applyResp interface{}, index uint64, err error) (uint64, error) { - if applyResp != nil { - if fsmErr, ok := applyResp.(error); ok && fsmErr != nil { - return index, fsmErr - } - } + _, index, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) return index, err } diff --git a/nomad/job_endpoint.go b/nomad/job_endpoint.go index 2af34f0856be..17c73cc0d515 100644 --- a/nomad/job_endpoint.go +++ b/nomad/job_endpoint.go @@ -368,13 +368,9 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis args.Deployment = j.multiregionCreateDeployment(job, eval) // Commit this update via Raft - fsmErr, index, err := j.srv.raftApply(structs.JobRegisterRequestType, args) - if err, ok := fsmErr.(error); ok && err != nil { - j.logger.Error("registering job failed", "error", err, "fsm", true) - return err - } + _, index, err := j.srv.raftApply(structs.JobRegisterRequestType, args) if err != nil { - j.logger.Error("registering job failed", "error", err, "raft", true) + j.logger.Error("registering job failed", "error", err) return err } @@ -812,6 +808,9 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD if err != nil { return err } + if job == nil { + return nil + } var eval *structs.Evaluation @@ -820,7 +819,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD now := time.Now().UnixNano() // If the job is periodic or parameterized, we don't create an eval. - if job == nil || !(job.IsPeriodic() || job.IsParameterized()) { + if !(job.IsPeriodic() || job.IsParameterized()) { // The evaluation priority is determined by several factors. It // defaults to the job default priority and is overridden by the @@ -829,7 +828,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD // If the user supplied an eval priority override, we subsequently // use this. 
priority := structs.JobDefaultPriority - if job != nil { + if job.Priority > 0 { priority = job.Priority } if args.EvalPriority > 0 { @@ -1930,13 +1929,9 @@ func (j *Job) Dispatch(args *structs.JobDispatchRequest, reply *structs.JobDispa } // Commit this update via Raft - fsmErr, jobCreateIndex, err := j.srv.raftApply(structs.JobRegisterRequestType, regReq) - if err, ok := fsmErr.(error); ok && err != nil { - j.logger.Error("dispatched job register failed", "error", err, "fsm", true) - return err - } + _, jobCreateIndex, err := j.srv.raftApply(structs.JobRegisterRequestType, regReq) if err != nil { - j.logger.Error("dispatched job register failed", "error", err, "raft", true) + j.logger.Error("dispatched job register failed", "error", err) return err } diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 3a82367fb635..de9bedb78180 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -3413,13 +3413,11 @@ func TestJobEndpoint_Deregister_ACL(t *testing.T) { require.NotZero(eval.CreateTime) require.NotZero(eval.ModifyTime) - // Deregistration is not idempotent, produces a new eval after the job is - // deregistered. TODO(langmartin) make it idempotent. 
+ // Deregistration is idempotent var validResp2 structs.JobDeregisterResponse err = msgpackrpc.CallWithCodec(codec, "Job.Deregister", req, &validResp2) - require.NoError(err) - require.NotEqual("", validResp2.EvalID) - require.NotEqual(validResp.EvalID, validResp2.EvalID) + must.NoError(t, err) + must.Eq(t, "", validResp2.EvalID) } func TestJobEndpoint_Deregister_Nonexistent(t *testing.T) { @@ -3442,51 +3440,15 @@ func TestJobEndpoint_Deregister_Nonexistent(t *testing.T) { }, } var resp2 structs.JobDeregisterResponse - if err := msgpackrpc.CallWithCodec(codec, "Job.Deregister", dereg, &resp2); err != nil { - t.Fatalf("err: %v", err) - } - if resp2.JobModifyIndex == 0 { - t.Fatalf("bad index: %d", resp2.Index) - } + must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Deregister", dereg, &resp2)) + must.Eq(t, 0, resp2.JobModifyIndex, must.Sprint("expected no modify index")) // Lookup the evaluation state := s1.fsm.State() ws := memdb.NewWatchSet() - eval, err := state.EvalByID(ws, resp2.EvalID) - if err != nil { - t.Fatalf("err: %v", err) - } - if eval == nil { - t.Fatalf("expected eval") - } - if eval.CreateIndex != resp2.EvalCreateIndex { - t.Fatalf("index mis-match") - } - - if eval.Priority != structs.JobDefaultPriority { - t.Fatalf("bad: %#v", eval) - } - if eval.Type != structs.JobTypeService { - t.Fatalf("bad: %#v", eval) - } - if eval.TriggeredBy != structs.EvalTriggerJobDeregister { - t.Fatalf("bad: %#v", eval) - } - if eval.JobID != jobID { - t.Fatalf("bad: %#v", eval) - } - if eval.JobModifyIndex != resp2.JobModifyIndex { - t.Fatalf("bad: %#v", eval) - } - if eval.Status != structs.EvalStatusPending { - t.Fatalf("bad: %#v", eval) - } - if eval.CreateTime == 0 { - t.Fatalf("eval CreateTime is unset: %#v", eval) - } - if eval.ModifyTime == 0 { - t.Fatalf("eval ModifyTime is unset: %#v", eval) - } + eval, err := state.EvalsByJob(ws, structs.DefaultNamespace, jobID) + must.NoError(t, err) + must.Nil(t, eval) } func TestJobEndpoint_Deregister_EvalPriority(t 
*testing.T) { diff --git a/nomad/namespace_endpoint.go b/nomad/namespace_endpoint.go index 351405d3c0e1..526bfb5ebdb1 100644 --- a/nomad/namespace_endpoint.go +++ b/nomad/namespace_endpoint.go @@ -47,16 +47,11 @@ func (n *Namespace) UpsertNamespaces(args *structs.NamespaceUpsertRequest, } // Update via Raft - out, index, err := n.srv.raftApply(structs.NamespaceUpsertRequestType, args) + _, index, err := n.srv.raftApply(structs.NamespaceUpsertRequestType, args) if err != nil { return err } - // Check if there was an error when applying. - if err, ok := out.(error); ok && err != nil { - return err - } - // Update the index reply.Index = index return nil @@ -105,16 +100,11 @@ func (n *Namespace) DeleteNamespaces(args *structs.NamespaceDeleteRequest, reply } // Update via Raft - out, index, err := n.srv.raftApply(structs.NamespaceDeleteRequestType, args) + _, index, err := n.srv.raftApply(structs.NamespaceDeleteRequestType, args) if err != nil { return err } - // Check if there was an error when applying. - if err, ok := out.(error); ok && err != nil { - return err - } - // Update the index reply.Index = index return nil diff --git a/nomad/operator_endpoint.go b/nomad/operator_endpoint.go index 43786cb10a7b..4ee26c7ac4d7 100644 --- a/nomad/operator_endpoint.go +++ b/nomad/operator_endpoint.go @@ -258,9 +258,6 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe op.logger.Error("failed applying AutoPilot configuration", "error", err) return err } - if respErr, ok := resp.(error); ok { - return respErr - } // Check if the return type is a bool. if respBool, ok := resp.(bool); ok { @@ -325,9 +322,8 @@ func (op *Operator) SchedulerSetConfiguration(args *structs.SchedulerSetConfigRe if err != nil { op.logger.Error("failed applying Scheduler configuration", "error", err) return err - } else if respErr, ok := resp.(error); ok { - return respErr } + // If CAS request, raft returns a boolean indicating if the update was applied. 
// Otherwise, assume success reply.Updated = true diff --git a/nomad/periodic.go b/nomad/periodic.go index 5b7aa5fdddda..0f47f57e7ba3 100644 --- a/nomad/periodic.go +++ b/nomad/periodic.go @@ -68,10 +68,7 @@ func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) { Namespace: job.Namespace, }, } - fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req) - if err, ok := fsmErr.(error); ok && err != nil { - return nil, err - } + _, index, err := s.raftApply(structs.JobRegisterRequestType, req) if err != nil { return nil, err } diff --git a/nomad/rpc.go b/nomad/rpc.go index 0dcf434b3cf9..500c4d72ba56 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -765,9 +765,10 @@ func (s *Server) raftApplyFuture(t structs.MessageType, msg interface{}) (raft.A // raftApplyFn is the function signature for applying a msg to Raft type raftApplyFn func(t structs.MessageType, msg interface{}) (interface{}, uint64, error) -// raftApply is used to encode a message, run it through raft, and return -// the FSM response along with any errors -func (s *Server) raftApply(t structs.MessageType, msg interface{}) (interface{}, uint64, error) { +// raftApply is used to encode a message, run it through raft, and return the +// FSM response along with any errors. If the FSM.Apply response is an error it +// will be returned as the error return value with a nil response. 
+func (s *Server) raftApply(t structs.MessageType, msg any) (any, uint64, error) { future, err := s.raftApplyFuture(t, msg) if err != nil { return nil, 0, err @@ -775,7 +776,11 @@ func (s *Server) raftApply(t structs.MessageType, msg interface{}) (interface{}, if err := future.Error(); err != nil { return nil, 0, err } - return future.Response(), future.Index(), nil + resp := future.Response() + if err, ok := resp.(error); ok && err != nil { + return nil, future.Index(), err + } + return resp, future.Index(), nil } // setQueryMeta is used to populate the QueryMeta data for an RPC call diff --git a/nomad/service_registration_endpoint.go b/nomad/service_registration_endpoint.go index 5a6bff543b5c..33d7d92ec6b2 100644 --- a/nomad/service_registration_endpoint.go +++ b/nomad/service_registration_endpoint.go @@ -79,16 +79,11 @@ func (s *ServiceRegistration) Upsert( } // Update via Raft. - out, index, err := s.srv.raftApply(structs.ServiceRegistrationUpsertRequestType, args) + _, index, err := s.srv.raftApply(structs.ServiceRegistrationUpsertRequestType, args) if err != nil { return err } - // Check if the FSM response, which is an interface, contains an error. - if err, ok := out.(error); ok && err != nil { - return err - } - // Update the index. There is no need to floor this as we are writing to // state and therefore will get a non-zero index response. reply.Index = index @@ -152,16 +147,11 @@ func (s *ServiceRegistration) DeleteByID( } // Update via Raft. - out, index, err := s.srv.raftApply(structs.ServiceRegistrationDeleteByIDRequestType, args) + _, index, err := s.srv.raftApply(structs.ServiceRegistrationDeleteByIDRequestType, args) if err != nil { return err } - // Check if the FSM response, which is an interface, contains an error. - if err, ok := out.(error); ok && err != nil { - return err - } - // Update the index. There is no need to floor this as we are writing to // state and therefore will get a non-zero index response. reply.Index = index