From b6fbf3b6d623deec6544b9affb742ee51b7fdc1d Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Fri, 28 Jul 2023 00:19:34 +0000
Subject: [PATCH 01/14] Add ADR for etcd snapshot CRD migration

Signed-off-by: Brad Davidson
---
 docs/adrs/etcd-snapshot-cr.md | 53 +++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 docs/adrs/etcd-snapshot-cr.md

diff --git a/docs/adrs/etcd-snapshot-cr.md b/docs/adrs/etcd-snapshot-cr.md
new file mode 100644
index 000000000000..87b3a47a0574
--- /dev/null
+++ b/docs/adrs/etcd-snapshot-cr.md
@@ -0,0 +1,53 @@
+# Store etcd snapshot metadata in a Custom Resource
+
+Date: 2023-07-27
+
+## Status
+
+Accepted
+
+## Context
+
+K3s currently stores a list of etcd snapshots and associated metadata in a ConfigMap. Other downstream
+projects and controllers consume the content of this ConfigMap in order to present cluster administrators with
+a list of snapshots that can be restored.
+
+On clusters with more than a handful of nodes, even with reasonable snapshot intervals and retention periods, the snapshot
+list ConfigMap frequently reaches the maximum size allowed by Kubernetes (1 MiB) and fails to store any additional information.
+The snapshots are still created, but they cannot be discovered by users or accessed by tools that consume information
+from the ConfigMap.
+
+When this occurs, the K3s service log shows errors such as:
+```
+level=error msg="failed to save local snapshot data to configmap: ConfigMap \"k3s-etcd-snapshots\" is invalid: []: Too long: must have at most 1048576 bytes"
+```
+
+Reference:
+* https://github.com/rancher/rke2/issues/4495
+* https://github.com/k3s-io/k3s/blob/36645e7311e9bdbbf2adb79ecd8bd68556bc86f6/pkg/etcd/etcd.go#L1503-L1516
+
+### Existing Work
+
+Rancher already has a `rke.cattle.io/v1 ETCDSnapshot` Custom Resource that contains the same information after it's been
+imported by the management cluster:
+* https://github.com/rancher/rancher/blob/027246f77f03b82660dc2e91df6bf2cd549163f0/pkg/apis/rke.cattle.io/v1/etcd.go#L48-L74
+
+It is unlikely that we would want to use this custom resource in its current package; we may be able to negotiate moving
+it into a neutral project for use by both projects.
+
+## Decision
+
+1. Instead of populating snapshots into a ConfigMap using the JSON serialization of the private `snapshotFile` type, K3s
+   will manage creation of a new Custom Resource Definition with similar fields.
+2. Metadata on each snapshot will be stored in a distinct Custom Resource.
+3. The new Custom Resource will be cluster-scoped, as etcd and its snapshots are a cluster-level resource.
+4. Downstream consumers of etcd snapshot lists will migrate to watching the Custom Resource, instead of the ConfigMap.
+5. K3s will observe a three-minor-version transition period, during which both the new Custom Resource and the existing
+   ConfigMap will be used.
+6. During the transition period, older snapshot metadata may be removed from the ConfigMap while those snapshots still
+   exist and are referenced by new Custom Resources, if the ConfigMap exceeds a preset size or key count limit.
+
+## Consequences
+
+* Snapshot metadata will no longer be lost when the number of snapshots exceeds what can be stored in the ConfigMap.
+* There will be some additional complexity in managing the new Custom Resource, and in working with other projects to migrate to using it.
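To make the decision above concrete, here is a minimal sketch of what the migration implies for a downstream consumer: listing snapshots from the proposed cluster-scoped Custom Resource rather than reading the `k3s-etcd-snapshots` ConfigMap. The group/version/resource matches the `etcdsnapshotfiles` resource registered later in this series; the use of the dynamic client and the kubeconfig path are illustrative assumptions, not part of the patches.

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumed kubeconfig location; any admin kubeconfig for the cluster works.
	config, err := clientcmd.BuildConfigFromFlags("", "/etc/rancher/k3s/k3s.yaml")
	if err != nil {
		panic(err)
	}
	client, err := dynamic.NewForConfig(config)
	if err != nil {
		panic(err)
	}
	// The plural resource name matches ETCDSnapshotFileResourceName added in patch 03.
	gvr := schema.GroupVersionResource{Group: "k3s.cattle.io", Version: "v1", Resource: "etcdsnapshotfiles"}
	// Cluster-scoped, per decision 3: no Namespace() qualifier, unlike the
	// ConfigMap that is read from the kube-system namespace today.
	snapshots, err := client.Resource(gvr).List(context.Background(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for _, s := range snapshots.Items {
		fmt.Println(s.GetName())
	}
}
```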
From 96188cf47ea31391341ecccd954cbc9552906d34 Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Mon, 14 Aug 2023 20:53:48 +0000
Subject: [PATCH 02/14] Minor updates as per design review discussion

Signed-off-by: Brad Davidson
---
 docs/adrs/etcd-snapshot-cr.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/adrs/etcd-snapshot-cr.md b/docs/adrs/etcd-snapshot-cr.md
index 87b3a47a0574..369cbdba64d5 100644
--- a/docs/adrs/etcd-snapshot-cr.md
+++ b/docs/adrs/etcd-snapshot-cr.md
@@ -22,6 +22,8 @@
 level=error msg="failed to save local snapshot data to configmap: ConfigMap \"k3s-etcd-snapshots\" is invalid: []: Too long: must have at most 1048576 bytes"
 ```
 
+A side-effect of this is that snapshot metadata is lost if the ConfigMap cannot be updated, as the ConfigMap is the only place it is stored.
+
 Reference:
 * https://github.com/rancher/rke2/issues/4495
 * https://github.com/k3s-io/k3s/blob/36645e7311e9bdbbf2adb79ecd8bd68556bc86f6/pkg/etcd/etcd.go#L1503-L1516
@@ -41,10 +43,12 @@ it into a neutral project for use by both projects.
    will manage creation of a new Custom Resource Definition with similar fields.
 2. Metadata on each snapshot will be stored in a distinct Custom Resource.
 3. The new Custom Resource will be cluster-scoped, as etcd and its snapshots are a cluster-level resource.
-4. Downstream consumers of etcd snapshot lists will migrate to watching the Custom Resource, instead of the ConfigMap.
-5. K3s will observe a three-minor-version transition period, during which both the new Custom Resource and the existing
+4. Snapshot metadata will also be written alongside snapshot files created on disk and/or uploaded to S3. The metadata
+   files will have the same basename as their corresponding snapshot file.
+5. Downstream consumers of etcd snapshot lists will migrate to watching Custom Resource types, instead of the ConfigMap.
+6. K3s will observe a three-minor-version transition period, during which both the new Custom Resources and the existing
    ConfigMap will be used.
-6. During the transition period, older snapshot metadata may be removed from the ConfigMap while those snapshots still
+7. During the transition period, older snapshot metadata may be removed from the ConfigMap while those snapshots still
    exist and are referenced by new Custom Resources, if the ConfigMap exceeds a preset size or key count limit.
 
 ## Consequences

From 4583892b71ce4825c4c4d4adb213857be487b80d Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Fri, 8 Sep 2023 16:35:14 +0000
Subject: [PATCH 03/14] Add new CRD for etcd snapshots

Also adds a hack go script to print the embedded CRDs, for developer use.
Signed-off-by: Brad Davidson
---
 hack/crdgen.go | 13 ++
 pkg/apis/k3s.cattle.io/v1/types.go | 89 ++++++++-
 .../k3s.cattle.io/v1/zz_generated_deepcopy.go | 165 ++++++++++++++++
 .../v1/zz_generated_list_types.go | 17 ++
 .../k3s.cattle.io/v1/zz_generated_register.go | 5 +-
 pkg/codegen/main.go | 1 +
 pkg/crd/crds.go | 21 +-
 .../k3s.cattle.io/v1/etcdsnapshotfile.go | 184 ++++++++++++++++++
 .../v1/fake/fake_etcdsnapshotfile.go | 132 +++++++++++++
 .../v1/fake/fake_k3s.cattle.io_client.go | 4 +
 .../k3s.cattle.io/v1/generated_expansion.go | 2 +
 .../k3s.cattle.io/v1/k3s.cattle.io_client.go | 5 +
 .../k3s.cattle.io/v1/etcdsnapshotfile.go | 161 +++++++++++++++
 .../controllers/k3s.cattle.io/v1/interface.go | 5 +
 14 files changed, 795 insertions(+), 9 deletions(-)
 create mode 100644 hack/crdgen.go
 create mode 100644 pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/etcdsnapshotfile.go
 create mode 100644 pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_etcdsnapshotfile.go
 create mode 100644 pkg/generated/controllers/k3s.cattle.io/v1/etcdsnapshotfile.go

diff --git a/hack/crdgen.go b/hack/crdgen.go
new file mode 100644
index 000000000000..fed1083d0b38
--- /dev/null
+++ b/hack/crdgen.go
@@ -0,0 +1,13 @@
+package main
+
+import (
+	"os"
+
+	k3scrd "github.com/k3s-io/k3s/pkg/crd"
+	_ "github.com/k3s-io/k3s/pkg/generated/controllers/k3s.cattle.io/v1"
+	"github.com/rancher/wrangler/pkg/crd"
+)
+
+func main() {
+	crd.Print(os.Stdout, k3scrd.List())
+}
diff --git a/pkg/apis/k3s.cattle.io/v1/types.go b/pkg/apis/k3s.cattle.io/v1/types.go
index 79ae04f77a0e..c52e8eee518b 100644
--- a/pkg/apis/k3s.cattle.io/v1/types.go
+++ b/pkg/apis/k3s.cattle.io/v1/types.go
@@ -1,20 +1,105 @@
 package v1
 
 import (
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 // +genclient
+// +genclient:noStatus
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
 
+// Addon is used to track application of a manifest file on disk. It mostly exists so that the wrangler DesiredSet
+// Apply controller has an object to track as the owner, and ensure that all created resources are tracked when the
+// manifest is modified or removed.
 type Addon struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
 
+	// Spec provides information about the on-disk manifest backing this resource.
 	Spec AddonSpec `json:"spec,omitempty"`
 }
 
 type AddonSpec struct {
-	Source   string `json:"source,omitempty"`
-	Checksum string `json:"checksum,omitempty"`
+	// Source is the path on disk to the manifest file that this Addon tracks.
+	Source string `json:"source,omitempty" column:""`
+	// Checksum is the SHA256 checksum of the most recently successfully applied manifest file.
+	Checksum string `json:"checksum,omitempty" column:""`
+}
+
+// +genclient
+// +genclient:nonNamespaced
+// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
+
+// ETCDSnapshotFile tracks a point-in-time snapshot of the etcd datastore.
+type ETCDSnapshotFile struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Spec defines properties of an etcd snapshot file
+	Spec ETCDSnapshotSpec `json:"spec,omitempty"`
+	// Status represents current information about a snapshot.
+	Status ETCDSnapshotStatus `json:"status,omitempty"`
+}
+
+// ETCDSnapshotSpec describes an etcd snapshot file
+type ETCDSnapshotSpec struct {
+	// SnapshotName contains the base name of the snapshot file. CLI actions that act
+	// on snapshots stored locally or within a pre-configured S3 bucket and
+	// prefix usually take the snapshot name as their argument.
+	SnapshotName string `json:"snapshotName" column:""`
+	// NodeName contains the name of the node that took the snapshot.
+	NodeName string `json:"nodeName" column:"name=Node"`
+	// Location is the absolute file:// or s3:// URI address of the snapshot.
+	Location string `json:"location" column:""`
+	// Metadata contains a point-in-time snapshot of the contents of the
+	// k3s-etcd-snapshot-extra-metadata ConfigMap's data field, at the time the
+	// snapshot was taken. This is intended to contain data about cluster state
+	// that may be important for an external system to have available when restoring
+	// the snapshot.
+	Metadata map[string]string `json:"metadata,omitempty"`
+	// S3 contains extra metadata about the S3 storage system holding the
+	// snapshot. This is guaranteed to be set for all snapshots uploaded to S3.
+	// If not specified, the snapshot was not uploaded to S3.
+	S3 *ETCDSnapshotS3 `json:"s3,omitempty"`
+}
+
+// ETCDSnapshotS3 holds information about the S3 storage system holding the snapshot.
+type ETCDSnapshotS3 struct {
+	// Endpoint is the host or host:port of the S3 service
+	Endpoint string `json:"endpoint,omitempty"`
+	// EndpointCA is the path on disk to the S3 service's trusted CA list. Leave empty to use the OS CA bundle.
+	EndpointCA string `json:"endpointCA,omitempty"`
+	// SkipSSLVerify is true if TLS certificate verification is disabled
+	SkipSSLVerify bool `json:"skipSSLVerify,omitempty"`
+	// Bucket is the bucket holding the snapshot
+	Bucket string `json:"bucket,omitempty"`
+	// Region is the region of the S3 service
+	Region string `json:"region,omitempty"`
+	// Prefix is the prefix in which the snapshot file is stored.
+	Prefix string `json:"prefix,omitempty"`
+	// Insecure is true if the S3 service uses HTTP instead of HTTPS
+	Insecure bool `json:"insecure,omitempty"`
+}
+
+// ETCDSnapshotStatus is the status of the ETCDSnapshotFile object.
+type ETCDSnapshotStatus struct {
+	// Size is the size of the snapshot file, in bytes. If not specified, the snapshot failed.
+	Size *resource.Quantity `json:"size,omitempty" column:""`
+	// CreationTime is the timestamp when the snapshot was taken by etcd.
+	CreationTime *metav1.Time `json:"creationTime,omitempty" column:""`
+	// ReadyToUse indicates that the snapshot is available to be restored.
+	ReadyToUse *bool `json:"readyToUse,omitempty"`
+	// Error is the last observed error during snapshot creation, if any.
+	// If the snapshot is retried, this field will be cleared on success.
+	Error *ETCDSnapshotError `json:"error,omitempty"`
+}
+
+// ETCDSnapshotError describes an error encountered during snapshot creation.
+type ETCDSnapshotError struct {
+	// Time is the timestamp when the error was encountered.
+	Time *metav1.Time `json:"time,omitempty"`
+	// Message is a string detailing the encountered error during snapshot creation if specified.
+	// NOTE: message may be logged, and it should not contain sensitive information.
+ Message *string `json:"message,omitempty"` } diff --git a/pkg/apis/k3s.cattle.io/v1/zz_generated_deepcopy.go b/pkg/apis/k3s.cattle.io/v1/zz_generated_deepcopy.go index 69011aa78d22..1679c1e7fff5 100644 --- a/pkg/apis/k3s.cattle.io/v1/zz_generated_deepcopy.go +++ b/pkg/apis/k3s.cattle.io/v1/zz_generated_deepcopy.go @@ -100,3 +100,168 @@ func (in *AddonSpec) DeepCopy() *AddonSpec { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCDSnapshotError) DeepCopyInto(out *ETCDSnapshotError) { + *out = *in + if in.Time != nil { + in, out := &in.Time, &out.Time + *out = (*in).DeepCopy() + } + if in.Message != nil { + in, out := &in.Message, &out.Message + *out = new(string) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDSnapshotError. +func (in *ETCDSnapshotError) DeepCopy() *ETCDSnapshotError { + if in == nil { + return nil + } + out := new(ETCDSnapshotError) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCDSnapshotFile) DeepCopyInto(out *ETCDSnapshotFile) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDSnapshotFile. +func (in *ETCDSnapshotFile) DeepCopy() *ETCDSnapshotFile { + if in == nil { + return nil + } + out := new(ETCDSnapshotFile) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ETCDSnapshotFile) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCDSnapshotFileList) DeepCopyInto(out *ETCDSnapshotFileList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ETCDSnapshotFile, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDSnapshotFileList. +func (in *ETCDSnapshotFileList) DeepCopy() *ETCDSnapshotFileList { + if in == nil { + return nil + } + out := new(ETCDSnapshotFileList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ETCDSnapshotFileList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCDSnapshotS3) DeepCopyInto(out *ETCDSnapshotS3) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDSnapshotS3. +func (in *ETCDSnapshotS3) DeepCopy() *ETCDSnapshotS3 { + if in == nil { + return nil + } + out := new(ETCDSnapshotS3) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ETCDSnapshotSpec) DeepCopyInto(out *ETCDSnapshotSpec) { + *out = *in + if in.Metadata != nil { + in, out := &in.Metadata, &out.Metadata + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.S3 != nil { + in, out := &in.S3, &out.S3 + *out = new(ETCDSnapshotS3) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDSnapshotSpec. +func (in *ETCDSnapshotSpec) DeepCopy() *ETCDSnapshotSpec { + if in == nil { + return nil + } + out := new(ETCDSnapshotSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCDSnapshotStatus) DeepCopyInto(out *ETCDSnapshotStatus) { + *out = *in + if in.Size != nil { + in, out := &in.Size, &out.Size + x := (*in).DeepCopy() + *out = &x + } + if in.CreationTime != nil { + in, out := &in.CreationTime, &out.CreationTime + *out = (*in).DeepCopy() + } + if in.ReadyToUse != nil { + in, out := &in.ReadyToUse, &out.ReadyToUse + *out = new(bool) + **out = **in + } + if in.Error != nil { + in, out := &in.Error, &out.Error + *out = new(ETCDSnapshotError) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDSnapshotStatus. +func (in *ETCDSnapshotStatus) DeepCopy() *ETCDSnapshotStatus { + if in == nil { + return nil + } + out := new(ETCDSnapshotStatus) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/apis/k3s.cattle.io/v1/zz_generated_list_types.go b/pkg/apis/k3s.cattle.io/v1/zz_generated_list_types.go index 52955028638c..c00d6ac70ae7 100644 --- a/pkg/apis/k3s.cattle.io/v1/zz_generated_list_types.go +++ b/pkg/apis/k3s.cattle.io/v1/zz_generated_list_types.go @@ -40,3 +40,20 @@ func NewAddon(namespace, name string, obj Addon) *Addon { obj.Namespace = namespace return &obj } + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// ETCDSnapshotFileList is a list of ETCDSnapshotFile resources +type ETCDSnapshotFileList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata"` + + Items []ETCDSnapshotFile `json:"items"` +} + +func NewETCDSnapshotFile(namespace, name string, obj ETCDSnapshotFile) *ETCDSnapshotFile { + obj.APIVersion, obj.Kind = SchemeGroupVersion.WithKind("ETCDSnapshotFile").ToAPIVersionAndKind() + obj.Name = name + obj.Namespace = namespace + return &obj +} diff --git a/pkg/apis/k3s.cattle.io/v1/zz_generated_register.go b/pkg/apis/k3s.cattle.io/v1/zz_generated_register.go index 80e8f3629831..90761711f75d 100644 --- a/pkg/apis/k3s.cattle.io/v1/zz_generated_register.go +++ b/pkg/apis/k3s.cattle.io/v1/zz_generated_register.go @@ -28,7 +28,8 @@ import ( ) var ( - AddonResourceName = "addons" + AddonResourceName = "addons" + ETCDSnapshotFileResourceName = "etcdsnapshotfiles" ) // SchemeGroupVersion is group version used to register these objects @@ -54,6 +55,8 @@ func addKnownTypes(scheme *runtime.Scheme) error { scheme.AddKnownTypes(SchemeGroupVersion, &Addon{}, &AddonList{}, + &ETCDSnapshotFile{}, + &ETCDSnapshotFileList{}, ) metav1.AddToGroupVersion(scheme, SchemeGroupVersion) return nil diff --git a/pkg/codegen/main.go b/pkg/codegen/main.go index e9b6e370de65..afb2d622ec2b 100644 --- a/pkg/codegen/main.go +++ b/pkg/codegen/main.go @@ -74,6 +74,7 @@ func main() { "k3s.cattle.io": { Types: []interface{}{ v1.Addon{}, + v1.ETCDSnapshotFile{}, }, GenerateTypes: true, GenerateClients: true, 
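As a reading aid for the types and generated helpers added above, the following is a hedged sketch (not part of the patch) of a fully populated ETCDSnapshotFile, built with the generated NewETCDSnapshotFile constructor; the snapshot name, node name, path, and size are made-up example values, and the on-disk location assumes the default k3s data directory.

```go
package main

import (
	"fmt"
	"time"

	k3sv1 "github.com/k3s-io/k3s/pkg/apis/k3s.cattle.io/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	now := metav1.NewTime(time.Now())
	ready := true
	// The resource is cluster-scoped (+genclient:nonNamespaced), so the
	// namespace argument to the generated constructor is left empty.
	snap := k3sv1.NewETCDSnapshotFile("", "etcd-snapshot-node1-1695000000", k3sv1.ETCDSnapshotFile{
		Spec: k3sv1.ETCDSnapshotSpec{
			SnapshotName: "etcd-snapshot-node1-1695000000",
			NodeName:     "node1",
			Location:     "file:///var/lib/rancher/k3s/server/db/snapshots/etcd-snapshot-node1-1695000000",
			// Point-in-time copy of the k3s-etcd-snapshot-extra-metadata ConfigMap data field.
			Metadata: map[string]string{"example-key": "example-value"},
		},
		Status: k3sv1.ETCDSnapshotStatus{
			Size:         resource.NewQuantity(16777216, resource.BinarySI),
			CreationTime: &now,
			ReadyToUse:   &ready,
		},
	})
	fmt.Printf("%s on %s: %s\n", snap.Spec.SnapshotName, snap.Spec.NodeName, snap.Status.Size)
}
```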
diff --git a/pkg/crd/crds.go b/pkg/crd/crds.go index 634f555087ea..0a1a918dbe24 100644 --- a/pkg/crd/crds.go +++ b/pkg/crd/crds.go @@ -6,10 +6,19 @@ import ( ) func List() []crd.CRD { - addon := crd.NamespacedType("Addon.k3s.cattle.io/v1"). - WithSchemaFromStruct(v1.Addon{}). - WithColumn("Source", ".spec.source"). - WithColumn("Checksum", ".spec.checksum") - - return []crd.CRD{addon} + addon := v1.Addon{} + etcdSnapshotFile := v1.ETCDSnapshotFile{} + return []crd.CRD{ + crd.NamespacedType("Addon.k3s.cattle.io/v1"). + WithSchemaFromStruct(addon). + WithColumn("Source", ".spec.source"). + WithColumn("Checksum", ".spec.checksum"), + crd.NonNamespacedType("ETCDSnapshotFile.k3s.cattle.io/v1"). + WithSchemaFromStruct(etcdSnapshotFile). + WithColumn("SnapshotName", ".spec.snapshotName"). + WithColumn("Node", ".spec.nodeName"). + WithColumn("Location", ".spec.location"). + WithColumn("Size", ".status.size"). + WithColumn("CreationTime", ".status.creationTime"), + } } diff --git a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/etcdsnapshotfile.go b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/etcdsnapshotfile.go new file mode 100644 index 000000000000..148cd2af8340 --- /dev/null +++ b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/etcdsnapshotfile.go @@ -0,0 +1,184 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by main. DO NOT EDIT. + +package v1 + +import ( + "context" + "time" + + v1 "github.com/k3s-io/k3s/pkg/apis/k3s.cattle.io/v1" + scheme "github.com/k3s-io/k3s/pkg/generated/clientset/versioned/scheme" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// ETCDSnapshotFilesGetter has a method to return a ETCDSnapshotFileInterface. +// A group's client should implement this interface. +type ETCDSnapshotFilesGetter interface { + ETCDSnapshotFiles() ETCDSnapshotFileInterface +} + +// ETCDSnapshotFileInterface has methods to work with ETCDSnapshotFile resources. 
+type ETCDSnapshotFileInterface interface { + Create(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.CreateOptions) (*v1.ETCDSnapshotFile, error) + Update(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.UpdateOptions) (*v1.ETCDSnapshotFile, error) + UpdateStatus(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.UpdateOptions) (*v1.ETCDSnapshotFile, error) + Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error + Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.ETCDSnapshotFile, error) + List(ctx context.Context, opts metav1.ListOptions) (*v1.ETCDSnapshotFileList, error) + Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.ETCDSnapshotFile, err error) + ETCDSnapshotFileExpansion +} + +// eTCDSnapshotFiles implements ETCDSnapshotFileInterface +type eTCDSnapshotFiles struct { + client rest.Interface +} + +// newETCDSnapshotFiles returns a ETCDSnapshotFiles +func newETCDSnapshotFiles(c *K3sV1Client) *eTCDSnapshotFiles { + return &eTCDSnapshotFiles{ + client: c.RESTClient(), + } +} + +// Get takes name of the eTCDSnapshotFile, and returns the corresponding eTCDSnapshotFile object, and an error if there is any. +func (c *eTCDSnapshotFiles) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.ETCDSnapshotFile, err error) { + result = &v1.ETCDSnapshotFile{} + err = c.client.Get(). + Resource("etcdsnapshotfiles"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(ctx). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of ETCDSnapshotFiles that match those selectors. +func (c *eTCDSnapshotFiles) List(ctx context.Context, opts metav1.ListOptions) (result *v1.ETCDSnapshotFileList, err error) { + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + result = &v1.ETCDSnapshotFileList{} + err = c.client.Get(). + Resource("etcdsnapshotfiles"). + VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Do(ctx). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested eTCDSnapshotFiles. +func (c *eTCDSnapshotFiles) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) { + var timeout time.Duration + if opts.TimeoutSeconds != nil { + timeout = time.Duration(*opts.TimeoutSeconds) * time.Second + } + opts.Watch = true + return c.client.Get(). + Resource("etcdsnapshotfiles"). + VersionedParams(&opts, scheme.ParameterCodec). + Timeout(timeout). + Watch(ctx) +} + +// Create takes the representation of a eTCDSnapshotFile and creates it. Returns the server's representation of the eTCDSnapshotFile, and an error, if there is any. +func (c *eTCDSnapshotFiles) Create(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.CreateOptions) (result *v1.ETCDSnapshotFile, err error) { + result = &v1.ETCDSnapshotFile{} + err = c.client.Post(). + Resource("etcdsnapshotfiles"). + VersionedParams(&opts, scheme.ParameterCodec). + Body(eTCDSnapshotFile). + Do(ctx). + Into(result) + return +} + +// Update takes the representation of a eTCDSnapshotFile and updates it. 
Returns the server's representation of the eTCDSnapshotFile, and an error, if there is any. +func (c *eTCDSnapshotFiles) Update(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.UpdateOptions) (result *v1.ETCDSnapshotFile, err error) { + result = &v1.ETCDSnapshotFile{} + err = c.client.Put(). + Resource("etcdsnapshotfiles"). + Name(eTCDSnapshotFile.Name). + VersionedParams(&opts, scheme.ParameterCodec). + Body(eTCDSnapshotFile). + Do(ctx). + Into(result) + return +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *eTCDSnapshotFiles) UpdateStatus(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.UpdateOptions) (result *v1.ETCDSnapshotFile, err error) { + result = &v1.ETCDSnapshotFile{} + err = c.client.Put(). + Resource("etcdsnapshotfiles"). + Name(eTCDSnapshotFile.Name). + SubResource("status"). + VersionedParams(&opts, scheme.ParameterCodec). + Body(eTCDSnapshotFile). + Do(ctx). + Into(result) + return +} + +// Delete takes name of the eTCDSnapshotFile and deletes it. Returns an error if one occurs. +func (c *eTCDSnapshotFiles) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + return c.client.Delete(). + Resource("etcdsnapshotfiles"). + Name(name). + Body(&opts). + Do(ctx). + Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *eTCDSnapshotFiles) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error { + var timeout time.Duration + if listOpts.TimeoutSeconds != nil { + timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second + } + return c.client.Delete(). + Resource("etcdsnapshotfiles"). + VersionedParams(&listOpts, scheme.ParameterCodec). + Timeout(timeout). + Body(&opts). + Do(ctx). + Error() +} + +// Patch applies the patch and returns the patched eTCDSnapshotFile. +func (c *eTCDSnapshotFiles) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.ETCDSnapshotFile, err error) { + result = &v1.ETCDSnapshotFile{} + err = c.client.Patch(pt). + Resource("etcdsnapshotfiles"). + Name(name). + SubResource(subresources...). + VersionedParams(&opts, scheme.ParameterCodec). + Body(data). + Do(ctx). + Into(result) + return +} diff --git a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_etcdsnapshotfile.go b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_etcdsnapshotfile.go new file mode 100644 index 000000000000..b4ad567c34d4 --- /dev/null +++ b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_etcdsnapshotfile.go @@ -0,0 +1,132 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by main. DO NOT EDIT. 
+ +package fake + +import ( + "context" + + v1 "github.com/k3s-io/k3s/pkg/apis/k3s.cattle.io/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeETCDSnapshotFiles implements ETCDSnapshotFileInterface +type FakeETCDSnapshotFiles struct { + Fake *FakeK3sV1 +} + +var etcdsnapshotfilesResource = v1.SchemeGroupVersion.WithResource("etcdsnapshotfiles") + +var etcdsnapshotfilesKind = v1.SchemeGroupVersion.WithKind("ETCDSnapshotFile") + +// Get takes name of the eTCDSnapshotFile, and returns the corresponding eTCDSnapshotFile object, and an error if there is any. +func (c *FakeETCDSnapshotFiles) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.ETCDSnapshotFile, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootGetAction(etcdsnapshotfilesResource, name), &v1.ETCDSnapshotFile{}) + if obj == nil { + return nil, err + } + return obj.(*v1.ETCDSnapshotFile), err +} + +// List takes label and field selectors, and returns the list of ETCDSnapshotFiles that match those selectors. +func (c *FakeETCDSnapshotFiles) List(ctx context.Context, opts metav1.ListOptions) (result *v1.ETCDSnapshotFileList, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootListAction(etcdsnapshotfilesResource, etcdsnapshotfilesKind, opts), &v1.ETCDSnapshotFileList{}) + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1.ETCDSnapshotFileList{ListMeta: obj.(*v1.ETCDSnapshotFileList).ListMeta} + for _, item := range obj.(*v1.ETCDSnapshotFileList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested eTCDSnapshotFiles. +func (c *FakeETCDSnapshotFiles) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewRootWatchAction(etcdsnapshotfilesResource, opts)) +} + +// Create takes the representation of a eTCDSnapshotFile and creates it. Returns the server's representation of the eTCDSnapshotFile, and an error, if there is any. +func (c *FakeETCDSnapshotFiles) Create(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.CreateOptions) (result *v1.ETCDSnapshotFile, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootCreateAction(etcdsnapshotfilesResource, eTCDSnapshotFile), &v1.ETCDSnapshotFile{}) + if obj == nil { + return nil, err + } + return obj.(*v1.ETCDSnapshotFile), err +} + +// Update takes the representation of a eTCDSnapshotFile and updates it. Returns the server's representation of the eTCDSnapshotFile, and an error, if there is any. +func (c *FakeETCDSnapshotFiles) Update(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.UpdateOptions) (result *v1.ETCDSnapshotFile, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootUpdateAction(etcdsnapshotfilesResource, eTCDSnapshotFile), &v1.ETCDSnapshotFile{}) + if obj == nil { + return nil, err + } + return obj.(*v1.ETCDSnapshotFile), err +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). 
+func (c *FakeETCDSnapshotFiles) UpdateStatus(ctx context.Context, eTCDSnapshotFile *v1.ETCDSnapshotFile, opts metav1.UpdateOptions) (*v1.ETCDSnapshotFile, error) { + obj, err := c.Fake. + Invokes(testing.NewRootUpdateSubresourceAction(etcdsnapshotfilesResource, "status", eTCDSnapshotFile), &v1.ETCDSnapshotFile{}) + if obj == nil { + return nil, err + } + return obj.(*v1.ETCDSnapshotFile), err +} + +// Delete takes name of the eTCDSnapshotFile and deletes it. Returns an error if one occurs. +func (c *FakeETCDSnapshotFiles) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewRootDeleteActionWithOptions(etcdsnapshotfilesResource, name, opts), &v1.ETCDSnapshotFile{}) + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeETCDSnapshotFiles) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error { + action := testing.NewRootDeleteCollectionAction(etcdsnapshotfilesResource, listOpts) + + _, err := c.Fake.Invokes(action, &v1.ETCDSnapshotFileList{}) + return err +} + +// Patch applies the patch and returns the patched eTCDSnapshotFile. +func (c *FakeETCDSnapshotFiles) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.ETCDSnapshotFile, err error) { + obj, err := c.Fake. + Invokes(testing.NewRootPatchSubresourceAction(etcdsnapshotfilesResource, name, pt, data, subresources...), &v1.ETCDSnapshotFile{}) + if obj == nil { + return nil, err + } + return obj.(*v1.ETCDSnapshotFile), err +} diff --git a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_k3s.cattle.io_client.go b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_k3s.cattle.io_client.go index 562baa963898..7167f94bf941 100644 --- a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_k3s.cattle.io_client.go +++ b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/fake/fake_k3s.cattle.io_client.go @@ -32,6 +32,10 @@ func (c *FakeK3sV1) Addons(namespace string) v1.AddonInterface { return &FakeAddons{c, namespace} } +func (c *FakeK3sV1) ETCDSnapshotFiles() v1.ETCDSnapshotFileInterface { + return &FakeETCDSnapshotFiles{c} +} + // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. func (c *FakeK3sV1) RESTClient() rest.Interface { diff --git a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/generated_expansion.go b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/generated_expansion.go index 1b681d3f1fe3..d152245a2913 100644 --- a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/generated_expansion.go +++ b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/generated_expansion.go @@ -19,3 +19,5 @@ limitations under the License. 
package v1 type AddonExpansion interface{} + +type ETCDSnapshotFileExpansion interface{} diff --git a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/k3s.cattle.io_client.go b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/k3s.cattle.io_client.go index a1e0d1fbafa6..77bd599332e5 100644 --- a/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/k3s.cattle.io_client.go +++ b/pkg/generated/clientset/versioned/typed/k3s.cattle.io/v1/k3s.cattle.io_client.go @@ -29,6 +29,7 @@ import ( type K3sV1Interface interface { RESTClient() rest.Interface AddonsGetter + ETCDSnapshotFilesGetter } // K3sV1Client is used to interact with features provided by the k3s.cattle.io group. @@ -40,6 +41,10 @@ func (c *K3sV1Client) Addons(namespace string) AddonInterface { return newAddons(c, namespace) } +func (c *K3sV1Client) ETCDSnapshotFiles() ETCDSnapshotFileInterface { + return newETCDSnapshotFiles(c) +} + // NewForConfig creates a new K3sV1Client for the given config. // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), // where httpClient was generated with rest.HTTPClientFor(c). diff --git a/pkg/generated/controllers/k3s.cattle.io/v1/etcdsnapshotfile.go b/pkg/generated/controllers/k3s.cattle.io/v1/etcdsnapshotfile.go new file mode 100644 index 000000000000..8358d1ae3b12 --- /dev/null +++ b/pkg/generated/controllers/k3s.cattle.io/v1/etcdsnapshotfile.go @@ -0,0 +1,161 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by main. DO NOT EDIT. + +package v1 + +import ( + "context" + "time" + + v1 "github.com/k3s-io/k3s/pkg/apis/k3s.cattle.io/v1" + "github.com/rancher/wrangler/pkg/apply" + "github.com/rancher/wrangler/pkg/condition" + "github.com/rancher/wrangler/pkg/generic" + "github.com/rancher/wrangler/pkg/kv" + "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +// ETCDSnapshotFileController interface for managing ETCDSnapshotFile resources. +type ETCDSnapshotFileController interface { + generic.NonNamespacedControllerInterface[*v1.ETCDSnapshotFile, *v1.ETCDSnapshotFileList] +} + +// ETCDSnapshotFileClient interface for managing ETCDSnapshotFile resources in Kubernetes. +type ETCDSnapshotFileClient interface { + generic.NonNamespacedClientInterface[*v1.ETCDSnapshotFile, *v1.ETCDSnapshotFileList] +} + +// ETCDSnapshotFileCache interface for retrieving ETCDSnapshotFile resources in memory. 
+type ETCDSnapshotFileCache interface { + generic.NonNamespacedCacheInterface[*v1.ETCDSnapshotFile] +} + +type ETCDSnapshotFileStatusHandler func(obj *v1.ETCDSnapshotFile, status v1.ETCDSnapshotStatus) (v1.ETCDSnapshotStatus, error) + +type ETCDSnapshotFileGeneratingHandler func(obj *v1.ETCDSnapshotFile, status v1.ETCDSnapshotStatus) ([]runtime.Object, v1.ETCDSnapshotStatus, error) + +func RegisterETCDSnapshotFileStatusHandler(ctx context.Context, controller ETCDSnapshotFileController, condition condition.Cond, name string, handler ETCDSnapshotFileStatusHandler) { + statusHandler := &eTCDSnapshotFileStatusHandler{ + client: controller, + condition: condition, + handler: handler, + } + controller.AddGenericHandler(ctx, name, generic.FromObjectHandlerToHandler(statusHandler.sync)) +} + +func RegisterETCDSnapshotFileGeneratingHandler(ctx context.Context, controller ETCDSnapshotFileController, apply apply.Apply, + condition condition.Cond, name string, handler ETCDSnapshotFileGeneratingHandler, opts *generic.GeneratingHandlerOptions) { + statusHandler := &eTCDSnapshotFileGeneratingHandler{ + ETCDSnapshotFileGeneratingHandler: handler, + apply: apply, + name: name, + gvk: controller.GroupVersionKind(), + } + if opts != nil { + statusHandler.opts = *opts + } + controller.OnChange(ctx, name, statusHandler.Remove) + RegisterETCDSnapshotFileStatusHandler(ctx, controller, condition, name, statusHandler.Handle) +} + +type eTCDSnapshotFileStatusHandler struct { + client ETCDSnapshotFileClient + condition condition.Cond + handler ETCDSnapshotFileStatusHandler +} + +func (a *eTCDSnapshotFileStatusHandler) sync(key string, obj *v1.ETCDSnapshotFile) (*v1.ETCDSnapshotFile, error) { + if obj == nil { + return obj, nil + } + + origStatus := obj.Status.DeepCopy() + obj = obj.DeepCopy() + newStatus, err := a.handler(obj, obj.Status) + if err != nil { + // Revert to old status on error + newStatus = *origStatus.DeepCopy() + } + + if a.condition != "" { + if errors.IsConflict(err) { + a.condition.SetError(&newStatus, "", nil) + } else { + a.condition.SetError(&newStatus, "", err) + } + } + if !equality.Semantic.DeepEqual(origStatus, &newStatus) { + if a.condition != "" { + // Since status has changed, update the lastUpdatedTime + a.condition.LastUpdated(&newStatus, time.Now().UTC().Format(time.RFC3339)) + } + + var newErr error + obj.Status = newStatus + newObj, newErr := a.client.UpdateStatus(obj) + if err == nil { + err = newErr + } + if newErr == nil { + obj = newObj + } + } + return obj, err +} + +type eTCDSnapshotFileGeneratingHandler struct { + ETCDSnapshotFileGeneratingHandler + apply apply.Apply + opts generic.GeneratingHandlerOptions + gvk schema.GroupVersionKind + name string +} + +func (a *eTCDSnapshotFileGeneratingHandler) Remove(key string, obj *v1.ETCDSnapshotFile) (*v1.ETCDSnapshotFile, error) { + if obj != nil { + return obj, nil + } + + obj = &v1.ETCDSnapshotFile{} + obj.Namespace, obj.Name = kv.RSplit(key, "/") + obj.SetGroupVersionKind(a.gvk) + + return nil, generic.ConfigureApplyForObject(a.apply, obj, &a.opts). + WithOwner(obj). + WithSetID(a.name). + ApplyObjects() +} + +func (a *eTCDSnapshotFileGeneratingHandler) Handle(obj *v1.ETCDSnapshotFile, status v1.ETCDSnapshotStatus) (v1.ETCDSnapshotStatus, error) { + if !obj.DeletionTimestamp.IsZero() { + return status, nil + } + + objs, newStatus, err := a.ETCDSnapshotFileGeneratingHandler(obj, status) + if err != nil { + return newStatus, err + } + + return newStatus, generic.ConfigureApplyForObject(a.apply, obj, &a.opts). + WithOwner(obj). 
+ WithSetID(a.name). + ApplyObjects(objs...) +} diff --git a/pkg/generated/controllers/k3s.cattle.io/v1/interface.go b/pkg/generated/controllers/k3s.cattle.io/v1/interface.go index 656a013c2f4d..6a80c591acb4 100644 --- a/pkg/generated/controllers/k3s.cattle.io/v1/interface.go +++ b/pkg/generated/controllers/k3s.cattle.io/v1/interface.go @@ -32,6 +32,7 @@ func init() { type Interface interface { Addon() AddonController + ETCDSnapshotFile() ETCDSnapshotFileController } func New(controllerFactory controller.SharedControllerFactory) Interface { @@ -47,3 +48,7 @@ type version struct { func (v *version) Addon() AddonController { return generic.NewController[*v1.Addon, *v1.AddonList](schema.GroupVersionKind{Group: "k3s.cattle.io", Version: "v1", Kind: "Addon"}, "addons", true, v.controllerFactory) } + +func (v *version) ETCDSnapshotFile() ETCDSnapshotFileController { + return generic.NewNonNamespacedController[*v1.ETCDSnapshotFile, *v1.ETCDSnapshotFileList](schema.GroupVersionKind{Group: "k3s.cattle.io", Version: "v1", Kind: "ETCDSnapshotFile"}, "etcdsnapshotfiles", v.controllerFactory) +} From cb08a45012288d9471faa6fd0761cab5d2864e2b Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Thu, 28 Sep 2023 00:28:03 +0000 Subject: [PATCH 04/14] Move etcd snapshot code into separate file Signed-off-by: Brad Davidson --- pkg/etcd/etcd.go | 869 +---------------------------------------- pkg/etcd/snapshot.go | 897 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 898 insertions(+), 868 deletions(-) create mode 100644 pkg/etcd/snapshot.go diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index cb21a0a3dfa3..eb3e5cdac7dd 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -1,23 +1,18 @@ package etcd import ( - "archive/zip" "bytes" "context" "crypto/tls" - "encoding/base64" "encoding/json" "fmt" - "io" "io/fs" - "math/rand" "net" "net/http" "net/url" "os" "path/filepath" "regexp" - "runtime" "sort" "strconv" "strings" @@ -33,7 +28,6 @@ import ( "github.com/k3s-io/k3s/pkg/version" "github.com/k3s-io/kine/pkg/client" endpoint2 "github.com/k3s-io/kine/pkg/endpoint" - "github.com/minio/minio-go/v7" cp "github.com/otiai10/copy" "github.com/pkg/errors" certutil "github.com/rancher/dynamiclistener/cert" @@ -47,12 +41,9 @@ import ( "go.etcd.io/etcd/etcdutl/v3/snapshot" "go.uber.org/zap" "golang.org/x/sync/semaphore" - v1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilnet "k8s.io/apimachinery/pkg/util/net" "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/util/retry" ) const ( @@ -71,9 +62,7 @@ const ( defaultKeepAliveTime = 30 * time.Second defaultKeepAliveTimeout = 10 * time.Second - maxBackupRetention = 5 - maxConcurrentSnapshots = 1 - compressedExtension = ".zip" + maxBackupRetention = 5 ) var ( @@ -81,22 +70,6 @@ var ( // AddressKey will contain the value of api addresses list AddressKey = version.Program + "/apiaddresses" - snapshotExtraMetadataConfigMapName = version.Program + "-etcd-snapshot-extra-metadata" - snapshotConfigMapName = version.Program + "-etcd-snapshots" - - // snapshotDataBackoff will retry at increasing steps for up to ~30 seconds. - // If the ConfigMap update fails, the list won't be reconciled again until next time - // the server starts, so we should be fairly persistent in retrying. 
- snapshotDataBackoff = wait.Backoff{ - Steps: 9, - Duration: 10 * time.Millisecond, - Factor: 3.0, - Jitter: 0.1, - } - - // cronLogger wraps logrus's Printf output as cron-compatible logger - cronLogger = cron.VerbosePrintfLogger(logrus.StandardLogger()) - NodeNameAnnotation = "etcd." + version.Program + ".cattle.io/node-name" NodeAddressAnnotation = "etcd." + version.Program + ".cattle.io/node-address" @@ -1252,803 +1225,6 @@ members: return clientURLs, memberList, nil } -// snapshotDir ensures that the snapshot directory exists, and then returns its path. -func snapshotDir(config *config.Control, create bool) (string, error) { - if config.EtcdSnapshotDir == "" { - // we have to create the snapshot dir if we are using - // the default snapshot dir if it doesn't exist - defaultSnapshotDir := filepath.Join(config.DataDir, "db", "snapshots") - s, err := os.Stat(defaultSnapshotDir) - if err != nil { - if create && os.IsNotExist(err) { - if err := os.MkdirAll(defaultSnapshotDir, 0700); err != nil { - return "", err - } - return defaultSnapshotDir, nil - } - return "", err - } - if s.IsDir() { - return defaultSnapshotDir, nil - } - } - return config.EtcdSnapshotDir, nil -} - -// preSnapshotSetup checks to see if the necessary components are in place -// to perform an Etcd snapshot. This is necessary primarily for on-demand -// snapshots since they're performed before normal Etcd setup is completed. -func (e *ETCD) preSnapshotSetup(ctx context.Context) error { - if e.snapshotSem == nil { - e.snapshotSem = semaphore.NewWeighted(maxConcurrentSnapshots) - } - return nil -} - -// compressSnapshot compresses the given snapshot and provides the -// caller with the path to the file. -func (e *ETCD) compressSnapshot(snapshotDir, snapshotName, snapshotPath string) (string, error) { - logrus.Info("Compressing etcd snapshot file: " + snapshotName) - - zippedSnapshotName := snapshotName + compressedExtension - zipPath := filepath.Join(snapshotDir, zippedSnapshotName) - - zf, err := os.Create(zipPath) - if err != nil { - return "", err - } - defer zf.Close() - - zipWriter := zip.NewWriter(zf) - defer zipWriter.Close() - - uncompressedPath := filepath.Join(snapshotDir, snapshotName) - fileToZip, err := os.Open(uncompressedPath) - if err != nil { - os.Remove(zipPath) - return "", err - } - defer fileToZip.Close() - - info, err := fileToZip.Stat() - if err != nil { - os.Remove(zipPath) - return "", err - } - - header, err := zip.FileInfoHeader(info) - if err != nil { - os.Remove(zipPath) - return "", err - } - - header.Name = snapshotName - header.Method = zip.Deflate - header.Modified = time.Now() - - writer, err := zipWriter.CreateHeader(header) - if err != nil { - os.Remove(zipPath) - return "", err - } - _, err = io.Copy(writer, fileToZip) - - return zipPath, err -} - -// decompressSnapshot decompresses the given snapshot and provides the caller -// with the full path to the uncompressed snapshot. 
-func (e *ETCD) decompressSnapshot(snapshotDir, snapshotFile string) (string, error) { - logrus.Info("Decompressing etcd snapshot file: " + snapshotFile) - - r, err := zip.OpenReader(snapshotFile) - if err != nil { - return "", err - } - defer r.Close() - - var decompressed *os.File - for _, sf := range r.File { - decompressed, err = os.OpenFile(strings.Replace(sf.Name, compressedExtension, "", -1), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, sf.Mode()) - if err != nil { - return "", err - } - defer decompressed.Close() - - ss, err := sf.Open() - if err != nil { - return "", err - } - defer ss.Close() - - if _, err := io.Copy(decompressed, ss); err != nil { - os.Remove("") - return "", err - } - } - - return decompressed.Name(), nil -} - -// Snapshot attempts to save a new snapshot to the configured directory, and then clean up any old and failed -// snapshots in excess of the retention limits. This method is used in the internal cron snapshot -// system as well as used to do on-demand snapshots. -func (e *ETCD) Snapshot(ctx context.Context) error { - if err := e.preSnapshotSetup(ctx); err != nil { - return err - } - if !e.snapshotSem.TryAcquire(maxConcurrentSnapshots) { - return fmt.Errorf("%d snapshots already in progress", maxConcurrentSnapshots) - } - defer e.snapshotSem.Release(maxConcurrentSnapshots) - - // make sure the core.Factory is initialized before attempting to add snapshot metadata - var extraMetadata string - if e.config.Runtime.Core == nil { - logrus.Debugf("Cannot retrieve extra metadata from %s ConfigMap: runtime core not ready", snapshotExtraMetadataConfigMapName) - } else { - logrus.Debugf("Attempting to retrieve extra metadata from %s ConfigMap", snapshotExtraMetadataConfigMapName) - if snapshotExtraMetadataConfigMap, err := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotExtraMetadataConfigMapName, metav1.GetOptions{}); err != nil { - logrus.Debugf("Error encountered attempting to retrieve extra metadata from %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) - } else { - if m, err := json.Marshal(snapshotExtraMetadataConfigMap.Data); err != nil { - logrus.Debugf("Error attempting to marshal extra metadata contained in %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) - } else { - logrus.Debugf("Setting extra metadata from %s ConfigMap", snapshotExtraMetadataConfigMapName) - logrus.Tracef("Marshalled extra metadata in %s ConfigMap was: %s", snapshotExtraMetadataConfigMapName, string(m)) - extraMetadata = base64.StdEncoding.EncodeToString(m) - } - } - } - - endpoints := getEndpoints(e.config) - var client *clientv3.Client - var err error - - // Use the internal client if possible, or create a new one - // if run from the CLI. - if e.client != nil { - client = e.client - } else { - client, err = getClient(ctx, e.config, endpoints...) 
- if err != nil { - return err - } - defer client.Close() - } - - status, err := client.Status(ctx, endpoints[0]) - if err != nil { - return errors.Wrap(err, "failed to check etcd status for snapshot") - } - - if status.IsLearner { - logrus.Warnf("Unable to take snapshot: not supported for learner") - return nil - } - - snapshotDir, err := snapshotDir(e.config, true) - if err != nil { - return errors.Wrap(err, "failed to get the snapshot dir") - } - - cfg, err := getClientConfig(ctx, e.config) - if err != nil { - return errors.Wrap(err, "failed to get config for etcd snapshot") - } - - nodeName := os.Getenv("NODE_NAME") - now := time.Now() - snapshotName := fmt.Sprintf("%s-%s-%d", e.config.EtcdSnapshotName, nodeName, now.Unix()) - snapshotPath := filepath.Join(snapshotDir, snapshotName) - - logrus.Infof("Saving etcd snapshot to %s", snapshotPath) - - var sf *snapshotFile - - lg, err := logutil.CreateDefaultZapLogger(zap.InfoLevel) - if err != nil { - return err - } - - if err := snapshot.NewV3(lg).Save(ctx, *cfg, snapshotPath); err != nil { - sf = &snapshotFile{ - Name: snapshotName, - Location: "", - Metadata: extraMetadata, - NodeName: nodeName, - CreatedAt: &metav1.Time{ - Time: now, - }, - Status: failedSnapshotStatus, - Message: base64.StdEncoding.EncodeToString([]byte(err.Error())), - Size: 0, - Compressed: e.config.EtcdSnapshotCompress, - } - logrus.Errorf("Failed to take etcd snapshot: %v", err) - if err := e.addSnapshotData(*sf); err != nil { - return errors.Wrap(err, "failed to save local snapshot failure data to configmap") - } - } - - if e.config.EtcdSnapshotCompress { - zipPath, err := e.compressSnapshot(snapshotDir, snapshotName, snapshotPath) - if err != nil { - return err - } - if err := os.Remove(snapshotPath); err != nil { - return err - } - snapshotPath = zipPath - logrus.Info("Compressed snapshot: " + snapshotPath) - } - - // If the snapshot attempt was successful, sf will be nil as we did not set it. 
- if sf == nil { - f, err := os.Stat(snapshotPath) - if err != nil { - return errors.Wrap(err, "unable to retrieve snapshot information from local snapshot") - } - sf = &snapshotFile{ - Name: f.Name(), - Metadata: extraMetadata, - Location: "file://" + snapshotPath, - NodeName: nodeName, - CreatedAt: &metav1.Time{ - Time: f.ModTime(), - }, - Status: successfulSnapshotStatus, - Size: f.Size(), - Compressed: e.config.EtcdSnapshotCompress, - } - - if err := e.addSnapshotData(*sf); err != nil { - return errors.Wrap(err, "failed to save local snapshot data to configmap") - } - if err := snapshotRetention(e.config.EtcdSnapshotRetention, e.config.EtcdSnapshotName, snapshotDir); err != nil { - return errors.Wrap(err, "failed to apply local snapshot retention policy") - } - - if e.config.EtcdS3 { - logrus.Infof("Saving etcd snapshot %s to S3", snapshotName) - // Set sf to nil so that we can attempt to now upload the snapshot to S3 if needed - sf = nil - if err := e.initS3IfNil(ctx); err != nil { - logrus.Warnf("Unable to initialize S3 client: %v", err) - sf = &snapshotFile{ - Name: filepath.Base(snapshotPath), - Metadata: extraMetadata, - NodeName: "s3", - CreatedAt: &metav1.Time{ - Time: now, - }, - Message: base64.StdEncoding.EncodeToString([]byte(err.Error())), - Size: 0, - Status: failedSnapshotStatus, - S3: &s3Config{ - Endpoint: e.config.EtcdS3Endpoint, - EndpointCA: e.config.EtcdS3EndpointCA, - SkipSSLVerify: e.config.EtcdS3SkipSSLVerify, - Bucket: e.config.EtcdS3BucketName, - Region: e.config.EtcdS3Region, - Folder: e.config.EtcdS3Folder, - Insecure: e.config.EtcdS3Insecure, - }, - } - } - // sf should be nil if we were able to successfully initialize the S3 client. - if sf == nil { - sf, err = e.s3.upload(ctx, snapshotPath, extraMetadata, now) - if err != nil { - return err - } - logrus.Infof("S3 upload complete for %s", snapshotName) - if err := e.s3.snapshotRetention(ctx); err != nil { - return errors.Wrap(err, "failed to apply s3 snapshot retention policy") - } - } - if err := e.addSnapshotData(*sf); err != nil { - return errors.Wrap(err, "failed to save snapshot data to configmap") - } - } - } - - return e.ReconcileSnapshotData(ctx) -} - -type s3Config struct { - Endpoint string `json:"endpoint,omitempty"` - EndpointCA string `json:"endpointCA,omitempty"` - SkipSSLVerify bool `json:"skipSSLVerify,omitempty"` - Bucket string `json:"bucket,omitempty"` - Region string `json:"region,omitempty"` - Folder string `json:"folder,omitempty"` - Insecure bool `json:"insecure,omitempty"` -} - -type snapshotStatus string - -const ( - successfulSnapshotStatus snapshotStatus = "successful" - failedSnapshotStatus snapshotStatus = "failed" -) - -// snapshotFile represents a single snapshot and it's -// metadata. -type snapshotFile struct { - Name string `json:"name"` - // Location contains the full path of the snapshot. For - // local paths, the location will be prefixed with "file://". - Location string `json:"location,omitempty"` - Metadata string `json:"metadata,omitempty"` - Message string `json:"message,omitempty"` - NodeName string `json:"nodeName,omitempty"` - CreatedAt *metav1.Time `json:"createdAt,omitempty"` - Size int64 `json:"size,omitempty"` - Status snapshotStatus `json:"status,omitempty"` - S3 *s3Config `json:"s3Config,omitempty"` - Compressed bool `json:"compressed"` -} - -// listLocalSnapshots provides a list of the currently stored -// snapshots on disk along with their relevant -// metadata. 
-func (e *ETCD) listLocalSnapshots() (map[string]snapshotFile, error) { - snapshots := make(map[string]snapshotFile) - snapshotDir, err := snapshotDir(e.config, true) - if err != nil { - return snapshots, errors.Wrap(err, "failed to get the snapshot dir") - } - - dirEntries, err := os.ReadDir(snapshotDir) - if err != nil { - return nil, err - } - - nodeName := os.Getenv("NODE_NAME") - - for _, de := range dirEntries { - file, err := de.Info() - if err != nil { - return nil, err - } - sf := snapshotFile{ - Name: file.Name(), - Location: "file://" + filepath.Join(snapshotDir, file.Name()), - NodeName: nodeName, - CreatedAt: &metav1.Time{ - Time: file.ModTime(), - }, - Size: file.Size(), - Status: successfulSnapshotStatus, - } - sfKey := generateSnapshotConfigMapKey(sf) - snapshots[sfKey] = sf - } - - return snapshots, nil -} - -// listS3Snapshots provides a list of currently stored -// snapshots in S3 along with their relevant -// metadata. -func (e *ETCD) listS3Snapshots(ctx context.Context) (map[string]snapshotFile, error) { - snapshots := make(map[string]snapshotFile) - - if e.config.EtcdS3 { - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - if err := e.initS3IfNil(ctx); err != nil { - return nil, err - } - - var loo minio.ListObjectsOptions - if e.config.EtcdS3Folder != "" { - loo = minio.ListObjectsOptions{ - Prefix: e.config.EtcdS3Folder, - Recursive: true, - } - } - - objects := e.s3.client.ListObjects(ctx, e.config.EtcdS3BucketName, loo) - - for obj := range objects { - if obj.Err != nil { - return nil, obj.Err - } - if obj.Size == 0 { - continue - } - - ca, err := time.Parse(time.RFC3339, obj.LastModified.Format(time.RFC3339)) - if err != nil { - return nil, err - } - - sf := snapshotFile{ - Name: filepath.Base(obj.Key), - NodeName: "s3", - CreatedAt: &metav1.Time{ - Time: ca, - }, - Size: obj.Size, - S3: &s3Config{ - Endpoint: e.config.EtcdS3Endpoint, - EndpointCA: e.config.EtcdS3EndpointCA, - SkipSSLVerify: e.config.EtcdS3SkipSSLVerify, - Bucket: e.config.EtcdS3BucketName, - Region: e.config.EtcdS3Region, - Folder: e.config.EtcdS3Folder, - Insecure: e.config.EtcdS3Insecure, - }, - Status: successfulSnapshotStatus, - } - sfKey := generateSnapshotConfigMapKey(sf) - snapshots[sfKey] = sf - } - } - return snapshots, nil -} - -// initS3IfNil initializes the S3 client -// if it hasn't yet been initialized. -func (e *ETCD) initS3IfNil(ctx context.Context) error { - if e.s3 == nil { - s3, err := NewS3(ctx, e.config) - if err != nil { - return err - } - e.s3 = s3 - } - - return nil -} - -// PruneSnapshots performs a retention run with the given -// retention duration and removes expired snapshots. -func (e *ETCD) PruneSnapshots(ctx context.Context) error { - snapshotDir, err := snapshotDir(e.config, false) - if err != nil { - return errors.Wrap(err, "failed to get the snapshot dir") - } - if err := snapshotRetention(e.config.EtcdSnapshotRetention, e.config.EtcdSnapshotName, snapshotDir); err != nil { - logrus.Errorf("Error applying snapshot retention policy: %v", err) - } - - if e.config.EtcdS3 { - if err := e.initS3IfNil(ctx); err != nil { - logrus.Warnf("Unable to initialize S3 client during prune: %v", err) - } else { - if err := e.s3.snapshotRetention(ctx); err != nil { - logrus.Errorf("Error applying S3 snapshot retention policy: %v", err) - } - } - } - - return e.ReconcileSnapshotData(ctx) -} - -// ListSnapshots is an exported wrapper method that wraps an -// unexported method of the same name. 
-func (e *ETCD) ListSnapshots(ctx context.Context) (map[string]snapshotFile, error) { - if e.config.EtcdS3 { - return e.listS3Snapshots(ctx) - } - return e.listLocalSnapshots() -} - -// deleteSnapshots removes the given snapshots from -// either local storage or S3. -func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error { - snapshotDir, err := snapshotDir(e.config, false) - if err != nil { - return errors.Wrap(err, "failed to get the snapshot dir") - } - - if e.config.EtcdS3 { - logrus.Info("Removing the given etcd snapshot(s) from S3") - logrus.Debugf("Removing the given etcd snapshot(s) from S3: %v", snapshots) - - if e.initS3IfNil(ctx); err != nil { - return err - } - - objectsCh := make(chan minio.ObjectInfo) - - ctx, cancel := context.WithTimeout(ctx, e.config.EtcdS3Timeout) - defer cancel() - - go func() { - defer close(objectsCh) - - opts := minio.ListObjectsOptions{ - Recursive: true, - } - - for obj := range e.s3.client.ListObjects(ctx, e.config.EtcdS3BucketName, opts) { - if obj.Err != nil { - logrus.Error(obj.Err) - return - } - - // iterate through the given snapshots and only - // add them to the channel for remove if they're - // actually found from the bucket listing. - for _, snapshot := range snapshots { - if snapshot == obj.Key { - objectsCh <- obj - } - } - } - }() - - err = func() error { - for { - select { - case <-ctx.Done(): - logrus.Errorf("Unable to delete snapshot: %v", ctx.Err()) - return e.ReconcileSnapshotData(ctx) - case <-time.After(time.Millisecond * 100): - continue - case err, ok := <-e.s3.client.RemoveObjects(ctx, e.config.EtcdS3BucketName, objectsCh, minio.RemoveObjectsOptions{}): - if err.Err != nil { - logrus.Errorf("Unable to delete snapshot: %v", err.Err) - } - if !ok { - return e.ReconcileSnapshotData(ctx) - } - } - } - }() - if err != nil { - return err - } - } - - logrus.Info("Removing the given locally stored etcd snapshot(s)") - logrus.Debugf("Attempting to remove the given locally stored etcd snapshot(s): %v", snapshots) - - for _, s := range snapshots { - // check if the given snapshot exists. If it does, - // remove it, otherwise continue. - sf := filepath.Join(snapshotDir, s) - if _, err := os.Stat(sf); os.IsNotExist(err) { - logrus.Infof("Snapshot %s, does not exist", s) - continue - } - if err := os.Remove(sf); err != nil { - return err - } - logrus.Debug("Removed snapshot ", s) - } - - return e.ReconcileSnapshotData(ctx) -} - -// AddSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata -// available at the time. -func (e *ETCD) addSnapshotData(sf snapshotFile) error { - return retry.OnError(snapshotDataBackoff, func(err error) bool { - return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) - }, func() error { - // make sure the core.Factory is initialized. There can - // be a race between this core code startup. 
- for e.config.Runtime.Core == nil { - runtime.Gosched() - } - snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) - - sfKey := generateSnapshotConfigMapKey(sf) - marshalledSnapshotFile, err := json.Marshal(sf) - if err != nil { - return err - } - if apierrors.IsNotFound(getErr) { - cm := v1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: snapshotConfigMapName, - Namespace: metav1.NamespaceSystem, - }, - Data: map[string]string{sfKey: string(marshalledSnapshotFile)}, - } - _, err := e.config.Runtime.Core.Core().V1().ConfigMap().Create(&cm) - return err - } - - if snapshotConfigMap.Data == nil { - snapshotConfigMap.Data = make(map[string]string) - } - - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshotFile) - - _, err = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) - return err - }) -} - -func generateSnapshotConfigMapKey(sf snapshotFile) string { - name := invalidKeyChars.ReplaceAllString(sf.Name, "_") - if sf.NodeName == "s3" { - return "s3-" + name - } - return "local-" + name -} - -// ReconcileSnapshotData reconciles snapshot data in the snapshot ConfigMap. -// It will reconcile snapshot data from disk locally always, and if S3 is enabled, will attempt to list S3 snapshots -// and reconcile snapshots from S3. Notably, -func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error { - logrus.Infof("Reconciling etcd snapshot data in %s ConfigMap", snapshotConfigMapName) - defer logrus.Infof("Reconciliation of snapshot data in %s ConfigMap complete", snapshotConfigMapName) - return retry.OnError(retry.DefaultBackoff, func(err error) bool { - return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) - }, func() error { - // make sure the core.Factory is initialize. There can - // be a race between this core code startup. - for e.config.Runtime.Core == nil { - runtime.Gosched() - } - - logrus.Debug("core.Factory is initialized") - - snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) - if apierrors.IsNotFound(getErr) { - // Can't reconcile what doesn't exist. - return errors.New("No snapshot configmap found") - } - - logrus.Debugf("Attempting to reconcile etcd snapshot data for configmap generation %d", snapshotConfigMap.Generation) - - // if the snapshot config map data is nil, no need to reconcile. - if snapshotConfigMap.Data == nil { - return nil - } - - snapshotFiles, err := e.listLocalSnapshots() - if err != nil { - return err - } - - // s3ListSuccessful is set to true if we are successful at listing snapshots from S3 to eliminate accidental - // clobbering of S3 snapshots in the configmap due to misconfigured S3 credentials/details - s3ListSuccessful := false - - if e.config.EtcdS3 { - if s3Snapshots, err := e.listS3Snapshots(ctx); err != nil { - logrus.Errorf("error retrieving S3 snapshots for reconciliation: %v", err) - } else { - for k, v := range s3Snapshots { - snapshotFiles[k] = v - } - s3ListSuccessful = true - } - } - - nodeName := os.Getenv("NODE_NAME") - - // deletedSnapshots is a map[string]string where key is the configmap key and the value is the marshalled snapshot file - // it will be populated below with snapshots that are either from S3 or on the local node. 
Notably, deletedSnapshots will - // not contain snapshots that are in the "failed" status - deletedSnapshots := make(map[string]string) - // failedSnapshots is a slice of unmarshaled snapshot files sourced from the configmap - // These are stored unmarshaled so we can sort based on name. - var failedSnapshots []snapshotFile - var failedS3Snapshots []snapshotFile - - // remove entries for this node and s3 (if S3 is enabled) only - for k, v := range snapshotConfigMap.Data { - var sf snapshotFile - if err := json.Unmarshal([]byte(v), &sf); err != nil { - return err - } - if (sf.NodeName == nodeName || (sf.NodeName == "s3" && s3ListSuccessful)) && sf.Status != failedSnapshotStatus { - // Only delete the snapshot if the snapshot was not failed - // sf.Status != FailedSnapshotStatus is intentional, as it is possible we are reconciling snapshots stored from older versions that did not set status - deletedSnapshots[generateSnapshotConfigMapKey(sf)] = v // store a copy of the snapshot - delete(snapshotConfigMap.Data, k) - } else if sf.Status == failedSnapshotStatus && sf.NodeName == nodeName && e.config.EtcdSnapshotRetention >= 1 { - // Handle locally failed snapshots. - failedSnapshots = append(failedSnapshots, sf) - delete(snapshotConfigMap.Data, k) - } else if sf.Status == failedSnapshotStatus && e.config.EtcdS3 && sf.NodeName == "s3" && strings.HasPrefix(sf.Name, e.config.EtcdSnapshotName+"-"+nodeName) && e.config.EtcdSnapshotRetention >= 1 { - // If we're operating against S3, we can clean up failed S3 snapshots that failed on this node. - failedS3Snapshots = append(failedS3Snapshots, sf) - delete(snapshotConfigMap.Data, k) - } - } - - // Apply the failed snapshot retention policy to locally failed snapshots - if len(failedSnapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 { - sort.Slice(failedSnapshots, func(i, j int) bool { - return failedSnapshots[i].Name > failedSnapshots[j].Name - }) - - var keepCount int - if e.config.EtcdSnapshotRetention >= len(failedSnapshots) { - keepCount = len(failedSnapshots) - } else { - keepCount = e.config.EtcdSnapshotRetention - } - for _, dfs := range failedSnapshots[:keepCount] { - sfKey := generateSnapshotConfigMapKey(dfs) - marshalledSnapshot, err := json.Marshal(dfs) - if err != nil { - logrus.Errorf("unable to marshal snapshot to store in configmap %v", err) - } else { - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) - } - } - } - - // Apply the failed snapshot retention policy to the S3 snapshots - if len(failedS3Snapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 { - sort.Slice(failedS3Snapshots, func(i, j int) bool { - return failedS3Snapshots[i].Name > failedS3Snapshots[j].Name - }) - - var keepCount int - if e.config.EtcdSnapshotRetention >= len(failedS3Snapshots) { - keepCount = len(failedS3Snapshots) - } else { - keepCount = e.config.EtcdSnapshotRetention - } - for _, dfs := range failedS3Snapshots[:keepCount] { - sfKey := generateSnapshotConfigMapKey(dfs) - marshalledSnapshot, err := json.Marshal(dfs) - if err != nil { - logrus.Errorf("unable to marshal snapshot to store in configmap %v", err) - } else { - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) - } - } - } - - // save the local entries to the ConfigMap if they are still on disk or in S3. 
- for _, snapshot := range snapshotFiles { - var sf snapshotFile - sfKey := generateSnapshotConfigMapKey(snapshot) - if v, ok := deletedSnapshots[sfKey]; ok { - // use the snapshot file we have from the existing configmap, and unmarshal it so we can manipulate it - if err := json.Unmarshal([]byte(v), &sf); err != nil { - logrus.Errorf("error unmarshaling snapshot file: %v", err) - // use the snapshot with info we sourced from disk/S3 (will be missing metadata, but something is better than nothing) - sf = snapshot - } - } else { - sf = snapshot - } - - sf.Status = successfulSnapshotStatus // if the snapshot is on disk or in S3, it was successful. - - marshalledSnapshot, err := json.Marshal(sf) - if err != nil { - logrus.Warnf("unable to marshal snapshot metadata %s to store in configmap, received error: %v", sf.Name, err) - } else { - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) - } - } - - logrus.Debugf("Updating snapshot ConfigMap (%s) with %d entries", snapshotConfigMapName, len(snapshotConfigMap.Data)) - _, err = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) - return err - }) -} - -// setSnapshotFunction schedules snapshots at the configured interval. -func (e *ETCD) setSnapshotFunction(ctx context.Context) { - skipJob := cron.SkipIfStillRunning(cronLogger) - e.cron.AddJob(e.config.EtcdSnapshotCron, skipJob(cron.FuncJob(func() { - // Add a small amount of jitter to the actual snapshot execution. On clusters with multiple servers, - // having all the nodes take a snapshot at the exact same time can lead to excessive retry thrashing - // when updating the snapshot list configmap. - time.Sleep(time.Duration(rand.Float64() * float64(snapshotJitterMax))) - if err := e.Snapshot(ctx); err != nil { - logrus.Error(err) - } - }))) -} - // Restore performs a restore of the ETCD datastore from // the given snapshot path. This operation exists upon // completion. @@ -2102,49 +1278,6 @@ func (e *ETCD) Restore(ctx context.Context) error { }) } -// snapshotRetention iterates through the snapshots and removes the oldest -// leaving the desired number of snapshots. 
-func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string) error { - if retention < 1 { - return nil - } - - logrus.Infof("Applying local snapshot retention policy: retention: %d, snapshotPrefix: %s, directory: %s", retention, snapshotPrefix, snapshotDir) - - var snapshotFiles []os.FileInfo - if err := filepath.Walk(snapshotDir, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if strings.HasPrefix(info.Name(), snapshotPrefix) { - snapshotFiles = append(snapshotFiles, info) - } - return nil - }); err != nil { - return err - } - if len(snapshotFiles) <= retention { - return nil - } - sort.Slice(snapshotFiles, func(firstSnapshot, secondSnapshot int) bool { - // it takes the name from the snapshot file ex: etcd-snapshot-example-{date}, makes the split using "-" to find the date, takes the date and sort by date - firstSnapshotName, secondSnapshotName := strings.Split(snapshotFiles[firstSnapshot].Name(), "-"), strings.Split(snapshotFiles[secondSnapshot].Name(), "-") - firstSnapshotDate, secondSnapshotDate := firstSnapshotName[len(firstSnapshotName)-1], secondSnapshotName[len(secondSnapshotName)-1] - return firstSnapshotDate < secondSnapshotDate - }) - - delCount := len(snapshotFiles) - retention - for _, df := range snapshotFiles[:delCount] { - snapshotPath := filepath.Join(snapshotDir, df.Name()) - logrus.Infof("Removing local snapshot %s", snapshotPath) - if err := os.Remove(snapshotPath); err != nil { - return err - } - } - - return nil -} - // backupDirWithRetention will move the dir to a backup dir // and will keep only maxBackupRetention of dirs. func backupDirWithRetention(dir string, maxBackupRetention int) (string, error) { diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go new file mode 100644 index 000000000000..e1a12e9e44ea --- /dev/null +++ b/pkg/etcd/snapshot.go @@ -0,0 +1,897 @@ +package etcd + +import ( + "archive/zip" + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "math/rand" + "os" + "path/filepath" + "runtime" + "sort" + "strings" + "time" + + "github.com/k3s-io/k3s/pkg/daemons/config" + "github.com/k3s-io/k3s/pkg/version" + "github.com/minio/minio-go/v7" + "github.com/pkg/errors" + "github.com/robfig/cron/v3" + "github.com/sirupsen/logrus" + "go.etcd.io/etcd/client/pkg/v3/logutil" + clientv3 "go.etcd.io/etcd/client/v3" + "go.etcd.io/etcd/etcdutl/v3/snapshot" + "go.uber.org/zap" + "golang.org/x/sync/semaphore" + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" +) + +const ( + maxConcurrentSnapshots = 1 + compressedExtension = ".zip" +) + +var ( + snapshotExtraMetadataConfigMapName = version.Program + "-etcd-snapshot-extra-metadata" + snapshotConfigMapName = version.Program + "-etcd-snapshots" + + // snapshotDataBackoff will retry at increasing steps for up to ~30 seconds. + // If the ConfigMap update fails, the list won't be reconciled again until next time + // the server starts, so we should be fairly persistent in retrying. + snapshotDataBackoff = wait.Backoff{ + Steps: 9, + Duration: 10 * time.Millisecond, + Factor: 3.0, + Jitter: 0.1, + } + + // cronLogger wraps logrus's Printf output as cron-compatible logger + cronLogger = cron.VerbosePrintfLogger(logrus.StandardLogger()) +) + +// snapshotDir ensures that the snapshot directory exists, and then returns its path. 
+func snapshotDir(config *config.Control, create bool) (string, error) {
+	if config.EtcdSnapshotDir == "" {
+		// we have to create the default snapshot dir if it doesn't exist
+		defaultSnapshotDir := filepath.Join(config.DataDir, "db", "snapshots")
+		s, err := os.Stat(defaultSnapshotDir)
+		if err != nil {
+			if create && os.IsNotExist(err) {
+				if err := os.MkdirAll(defaultSnapshotDir, 0700); err != nil {
+					return "", err
+				}
+				return defaultSnapshotDir, nil
+			}
+			return "", err
+		}
+		if s.IsDir() {
+			return defaultSnapshotDir, nil
+		}
+	}
+	return config.EtcdSnapshotDir, nil
+}
+
+// preSnapshotSetup checks to see if the necessary components are in place
+// to perform an etcd snapshot. This is necessary primarily for on-demand
+// snapshots since they're performed before normal etcd setup is completed.
+func (e *ETCD) preSnapshotSetup(ctx context.Context) error {
+	if e.snapshotSem == nil {
+		e.snapshotSem = semaphore.NewWeighted(maxConcurrentSnapshots)
+	}
+	return nil
+}
+
+// compressSnapshot compresses the given snapshot and provides the
+// caller with the path to the file.
+func (e *ETCD) compressSnapshot(snapshotDir, snapshotName, snapshotPath string) (string, error) {
+	logrus.Info("Compressing etcd snapshot file: " + snapshotName)
+
+	zippedSnapshotName := snapshotName + compressedExtension
+	zipPath := filepath.Join(snapshotDir, zippedSnapshotName)
+
+	zf, err := os.Create(zipPath)
+	if err != nil {
+		return "", err
+	}
+	defer zf.Close()
+
+	zipWriter := zip.NewWriter(zf)
+	defer zipWriter.Close()
+
+	uncompressedPath := filepath.Join(snapshotDir, snapshotName)
+	fileToZip, err := os.Open(uncompressedPath)
+	if err != nil {
+		os.Remove(zipPath)
+		return "", err
+	}
+	defer fileToZip.Close()
+
+	info, err := fileToZip.Stat()
+	if err != nil {
+		os.Remove(zipPath)
+		return "", err
+	}
+
+	header, err := zip.FileInfoHeader(info)
+	if err != nil {
+		os.Remove(zipPath)
+		return "", err
+	}
+
+	header.Name = snapshotName
+	header.Method = zip.Deflate
+	header.Modified = time.Now()
+
+	writer, err := zipWriter.CreateHeader(header)
+	if err != nil {
+		os.Remove(zipPath)
+		return "", err
+	}
+	_, err = io.Copy(writer, fileToZip)
+
+	return zipPath, err
+}
+
+// decompressSnapshot decompresses the given snapshot and provides the caller
+// with the full path to the uncompressed snapshot.
+func (e *ETCD) decompressSnapshot(snapshotDir, snapshotFile string) (string, error) {
+	logrus.Info("Decompressing etcd snapshot file: " + snapshotFile)
+
+	r, err := zip.OpenReader(snapshotFile)
+	if err != nil {
+		return "", err
+	}
+	defer r.Close()
+
+	var decompressed *os.File
+	for _, sf := range r.File {
+		decompressed, err = os.OpenFile(strings.Replace(sf.Name, compressedExtension, "", -1), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, sf.Mode())
+		if err != nil {
+			return "", err
+		}
+		defer decompressed.Close()
+
+		ss, err := sf.Open()
+		if err != nil {
+			return "", err
+		}
+		defer ss.Close()
+
+		if _, err := io.Copy(decompressed, ss); err != nil {
+			os.Remove(decompressed.Name())
+			return "", err
+		}
+	}
+
+	return decompressed.Name(), nil
+}
+
+// Snapshot attempts to save a new snapshot to the configured directory, and then clean up any old and failed
+// snapshots in excess of the retention limits. This method is used in the internal cron snapshot
+// system as well as for on-demand snapshots.
+func (e *ETCD) Snapshot(ctx context.Context) error { + if err := e.preSnapshotSetup(ctx); err != nil { + return err + } + if !e.snapshotSem.TryAcquire(maxConcurrentSnapshots) { + return fmt.Errorf("%d snapshots already in progress", maxConcurrentSnapshots) + } + defer e.snapshotSem.Release(maxConcurrentSnapshots) + + // make sure the core.Factory is initialized before attempting to add snapshot metadata + var extraMetadata string + if e.config.Runtime.Core == nil { + logrus.Debugf("Cannot retrieve extra metadata from %s ConfigMap: runtime core not ready", snapshotExtraMetadataConfigMapName) + } else { + logrus.Debugf("Attempting to retrieve extra metadata from %s ConfigMap", snapshotExtraMetadataConfigMapName) + if snapshotExtraMetadataConfigMap, err := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotExtraMetadataConfigMapName, metav1.GetOptions{}); err != nil { + logrus.Debugf("Error encountered attempting to retrieve extra metadata from %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) + } else { + if m, err := json.Marshal(snapshotExtraMetadataConfigMap.Data); err != nil { + logrus.Debugf("Error attempting to marshal extra metadata contained in %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) + } else { + logrus.Debugf("Setting extra metadata from %s ConfigMap", snapshotExtraMetadataConfigMapName) + logrus.Tracef("Marshalled extra metadata in %s ConfigMap was: %s", snapshotExtraMetadataConfigMapName, string(m)) + extraMetadata = base64.StdEncoding.EncodeToString(m) + } + } + } + + endpoints := getEndpoints(e.config) + var client *clientv3.Client + var err error + + // Use the internal client if possible, or create a new one + // if run from the CLI. + if e.client != nil { + client = e.client + } else { + client, err = getClient(ctx, e.config, endpoints...) 
+ if err != nil { + return err + } + defer client.Close() + } + + status, err := client.Status(ctx, endpoints[0]) + if err != nil { + return errors.Wrap(err, "failed to check etcd status for snapshot") + } + + if status.IsLearner { + logrus.Warnf("Unable to take snapshot: not supported for learner") + return nil + } + + snapshotDir, err := snapshotDir(e.config, true) + if err != nil { + return errors.Wrap(err, "failed to get the snapshot dir") + } + + cfg, err := getClientConfig(ctx, e.config) + if err != nil { + return errors.Wrap(err, "failed to get config for etcd snapshot") + } + + nodeName := os.Getenv("NODE_NAME") + now := time.Now() + snapshotName := fmt.Sprintf("%s-%s-%d", e.config.EtcdSnapshotName, nodeName, now.Unix()) + snapshotPath := filepath.Join(snapshotDir, snapshotName) + + logrus.Infof("Saving etcd snapshot to %s", snapshotPath) + + var sf *snapshotFile + + lg, err := logutil.CreateDefaultZapLogger(zap.InfoLevel) + if err != nil { + return err + } + + if err := snapshot.NewV3(lg).Save(ctx, *cfg, snapshotPath); err != nil { + sf = &snapshotFile{ + Name: snapshotName, + Location: "", + Metadata: extraMetadata, + NodeName: nodeName, + CreatedAt: &metav1.Time{ + Time: now, + }, + Status: failedSnapshotStatus, + Message: base64.StdEncoding.EncodeToString([]byte(err.Error())), + Size: 0, + Compressed: e.config.EtcdSnapshotCompress, + } + logrus.Errorf("Failed to take etcd snapshot: %v", err) + if err := e.addSnapshotData(*sf); err != nil { + return errors.Wrap(err, "failed to save local snapshot failure data to configmap") + } + } + + if e.config.EtcdSnapshotCompress { + zipPath, err := e.compressSnapshot(snapshotDir, snapshotName, snapshotPath) + if err != nil { + return err + } + if err := os.Remove(snapshotPath); err != nil { + return err + } + snapshotPath = zipPath + logrus.Info("Compressed snapshot: " + snapshotPath) + } + + // If the snapshot attempt was successful, sf will be nil as we did not set it. 
+	if sf == nil {
+		f, err := os.Stat(snapshotPath)
+		if err != nil {
+			return errors.Wrap(err, "unable to retrieve snapshot information from local snapshot")
+		}
+		sf = &snapshotFile{
+			Name:     f.Name(),
+			Metadata: extraMetadata,
+			Location: "file://" + snapshotPath,
+			NodeName: nodeName,
+			CreatedAt: &metav1.Time{
+				Time: f.ModTime(),
+			},
+			Status:     successfulSnapshotStatus,
+			Size:       f.Size(),
+			Compressed: e.config.EtcdSnapshotCompress,
+		}
+
+		if err := e.addSnapshotData(*sf); err != nil {
+			return errors.Wrap(err, "failed to save local snapshot data to configmap")
+		}
+		if err := snapshotRetention(e.config.EtcdSnapshotRetention, e.config.EtcdSnapshotName, snapshotDir); err != nil {
+			return errors.Wrap(err, "failed to apply local snapshot retention policy")
+		}
+
+		if e.config.EtcdS3 {
+			logrus.Infof("Saving etcd snapshot %s to S3", snapshotName)
+			// Set sf to nil so that we can now attempt to upload the snapshot to S3 if needed
+			sf = nil
+			if err := e.initS3IfNil(ctx); err != nil {
+				logrus.Warnf("Unable to initialize S3 client: %v", err)
+				sf = &snapshotFile{
+					Name:     filepath.Base(snapshotPath),
+					Metadata: extraMetadata,
+					NodeName: "s3",
+					CreatedAt: &metav1.Time{
+						Time: now,
+					},
+					Message: base64.StdEncoding.EncodeToString([]byte(err.Error())),
+					Size:    0,
+					Status:  failedSnapshotStatus,
+					S3: &s3Config{
+						Endpoint:      e.config.EtcdS3Endpoint,
+						EndpointCA:    e.config.EtcdS3EndpointCA,
+						SkipSSLVerify: e.config.EtcdS3SkipSSLVerify,
+						Bucket:        e.config.EtcdS3BucketName,
+						Region:        e.config.EtcdS3Region,
+						Folder:        e.config.EtcdS3Folder,
+						Insecure:      e.config.EtcdS3Insecure,
+					},
+				}
+			}
+			// sf should be nil if we were able to successfully initialize the S3 client.
+			if sf == nil {
+				sf, err = e.s3.upload(ctx, snapshotPath, extraMetadata, now)
+				if err != nil {
+					return err
+				}
+				logrus.Infof("S3 upload complete for %s", snapshotName)
+				if err := e.s3.snapshotRetention(ctx); err != nil {
+					return errors.Wrap(err, "failed to apply s3 snapshot retention policy")
+				}
+			}
+			if err := e.addSnapshotData(*sf); err != nil {
+				return errors.Wrap(err, "failed to save snapshot data to configmap")
+			}
+		}
+	}
+
+	return e.ReconcileSnapshotData(ctx)
+}
+
+type s3Config struct {
+	Endpoint      string `json:"endpoint,omitempty"`
+	EndpointCA    string `json:"endpointCA,omitempty"`
+	SkipSSLVerify bool   `json:"skipSSLVerify,omitempty"`
+	Bucket        string `json:"bucket,omitempty"`
+	Region        string `json:"region,omitempty"`
+	Folder        string `json:"folder,omitempty"`
+	Insecure      bool   `json:"insecure,omitempty"`
+}
+
+type snapshotStatus string
+
+const (
+	successfulSnapshotStatus snapshotStatus = "successful"
+	failedSnapshotStatus     snapshotStatus = "failed"
+)
+
+// snapshotFile represents a single snapshot and its
+// metadata.
+type snapshotFile struct {
+	Name string `json:"name"`
+	// Location contains the full path of the snapshot. For
+	// local paths, the location will be prefixed with "file://".
+	Location   string         `json:"location,omitempty"`
+	Metadata   string         `json:"metadata,omitempty"`
+	Message    string         `json:"message,omitempty"`
+	NodeName   string         `json:"nodeName,omitempty"`
+	CreatedAt  *metav1.Time   `json:"createdAt,omitempty"`
+	Size       int64          `json:"size,omitempty"`
+	Status     snapshotStatus `json:"status,omitempty"`
+	S3         *s3Config      `json:"s3Config,omitempty"`
+	Compressed bool           `json:"compressed"`
+}
+
+// listLocalSnapshots provides a list of the currently stored
+// snapshots on disk along with their relevant
+// metadata.
+func (e *ETCD) listLocalSnapshots() (map[string]snapshotFile, error) { + snapshots := make(map[string]snapshotFile) + snapshotDir, err := snapshotDir(e.config, true) + if err != nil { + return snapshots, errors.Wrap(err, "failed to get the snapshot dir") + } + + dirEntries, err := os.ReadDir(snapshotDir) + if err != nil { + return nil, err + } + + nodeName := os.Getenv("NODE_NAME") + + for _, de := range dirEntries { + file, err := de.Info() + if err != nil { + return nil, err + } + sf := snapshotFile{ + Name: file.Name(), + Location: "file://" + filepath.Join(snapshotDir, file.Name()), + NodeName: nodeName, + CreatedAt: &metav1.Time{ + Time: file.ModTime(), + }, + Size: file.Size(), + Status: successfulSnapshotStatus, + } + sfKey := generateSnapshotConfigMapKey(sf) + snapshots[sfKey] = sf + } + + return snapshots, nil +} + +// listS3Snapshots provides a list of currently stored +// snapshots in S3 along with their relevant +// metadata. +func (e *ETCD) listS3Snapshots(ctx context.Context) (map[string]snapshotFile, error) { + snapshots := make(map[string]snapshotFile) + + if e.config.EtcdS3 { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + if err := e.initS3IfNil(ctx); err != nil { + return nil, err + } + + var loo minio.ListObjectsOptions + if e.config.EtcdS3Folder != "" { + loo = minio.ListObjectsOptions{ + Prefix: e.config.EtcdS3Folder, + Recursive: true, + } + } + + objects := e.s3.client.ListObjects(ctx, e.config.EtcdS3BucketName, loo) + + for obj := range objects { + if obj.Err != nil { + return nil, obj.Err + } + if obj.Size == 0 { + continue + } + + ca, err := time.Parse(time.RFC3339, obj.LastModified.Format(time.RFC3339)) + if err != nil { + return nil, err + } + + sf := snapshotFile{ + Name: filepath.Base(obj.Key), + NodeName: "s3", + CreatedAt: &metav1.Time{ + Time: ca, + }, + Size: obj.Size, + S3: &s3Config{ + Endpoint: e.config.EtcdS3Endpoint, + EndpointCA: e.config.EtcdS3EndpointCA, + SkipSSLVerify: e.config.EtcdS3SkipSSLVerify, + Bucket: e.config.EtcdS3BucketName, + Region: e.config.EtcdS3Region, + Folder: e.config.EtcdS3Folder, + Insecure: e.config.EtcdS3Insecure, + }, + Status: successfulSnapshotStatus, + } + sfKey := generateSnapshotConfigMapKey(sf) + snapshots[sfKey] = sf + } + } + return snapshots, nil +} + +// initS3IfNil initializes the S3 client +// if it hasn't yet been initialized. +func (e *ETCD) initS3IfNil(ctx context.Context) error { + if e.s3 == nil { + s3, err := NewS3(ctx, e.config) + if err != nil { + return err + } + e.s3 = s3 + } + + return nil +} + +// PruneSnapshots performs a retention run with the given +// retention duration and removes expired snapshots. +func (e *ETCD) PruneSnapshots(ctx context.Context) error { + snapshotDir, err := snapshotDir(e.config, false) + if err != nil { + return errors.Wrap(err, "failed to get the snapshot dir") + } + if err := snapshotRetention(e.config.EtcdSnapshotRetention, e.config.EtcdSnapshotName, snapshotDir); err != nil { + logrus.Errorf("Error applying snapshot retention policy: %v", err) + } + + if e.config.EtcdS3 { + if err := e.initS3IfNil(ctx); err != nil { + logrus.Warnf("Unable to initialize S3 client during prune: %v", err) + } else { + if err := e.s3.snapshotRetention(ctx); err != nil { + logrus.Errorf("Error applying S3 snapshot retention policy: %v", err) + } + } + } + + return e.ReconcileSnapshotData(ctx) +} + +// ListSnapshots is an exported wrapper method that wraps an +// unexported method of the same name. 
+func (e *ETCD) ListSnapshots(ctx context.Context) (map[string]snapshotFile, error) {
+	if e.config.EtcdS3 {
+		return e.listS3Snapshots(ctx)
+	}
+	return e.listLocalSnapshots()
+}
+
+// DeleteSnapshots removes the given snapshots from
+// either local storage or S3.
+func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error {
+	snapshotDir, err := snapshotDir(e.config, false)
+	if err != nil {
+		return errors.Wrap(err, "failed to get the snapshot dir")
+	}
+
+	if e.config.EtcdS3 {
+		logrus.Info("Removing the given etcd snapshot(s) from S3")
+		logrus.Debugf("Removing the given etcd snapshot(s) from S3: %v", snapshots)
+
+		if err := e.initS3IfNil(ctx); err != nil {
+			return err
+		}
+
+		objectsCh := make(chan minio.ObjectInfo)
+
+		ctx, cancel := context.WithTimeout(ctx, e.config.EtcdS3Timeout)
+		defer cancel()
+
+		go func() {
+			defer close(objectsCh)
+
+			opts := minio.ListObjectsOptions{
+				Recursive: true,
+			}
+
+			for obj := range e.s3.client.ListObjects(ctx, e.config.EtcdS3BucketName, opts) {
+				if obj.Err != nil {
+					logrus.Error(obj.Err)
+					return
+				}
+
+				// iterate through the given snapshots and only
+				// add them to the channel for removal if they're
+				// actually found in the bucket listing.
+				for _, snapshot := range snapshots {
+					if snapshot == obj.Key {
+						objectsCh <- obj
+					}
+				}
+			}
+		}()
+
+		err = func() error {
+			for {
+				select {
+				case <-ctx.Done():
+					logrus.Errorf("Unable to delete snapshot: %v", ctx.Err())
+					return e.ReconcileSnapshotData(ctx)
+				case <-time.After(time.Millisecond * 100):
+					continue
+				case err, ok := <-e.s3.client.RemoveObjects(ctx, e.config.EtcdS3BucketName, objectsCh, minio.RemoveObjectsOptions{}):
+					if err.Err != nil {
+						logrus.Errorf("Unable to delete snapshot: %v", err.Err)
+					}
+					if !ok {
+						return e.ReconcileSnapshotData(ctx)
+					}
+				}
+			}
+		}()
+		if err != nil {
+			return err
+		}
+	}
+
+	logrus.Info("Removing the given locally stored etcd snapshot(s)")
+	logrus.Debugf("Attempting to remove the given locally stored etcd snapshot(s): %v", snapshots)
+
+	for _, s := range snapshots {
+		// check if the given snapshot exists. If it does,
+		// remove it, otherwise continue.
+		sf := filepath.Join(snapshotDir, s)
+		if _, err := os.Stat(sf); os.IsNotExist(err) {
+			logrus.Infof("Snapshot %s does not exist", s)
+			continue
+		}
+		if err := os.Remove(sf); err != nil {
+			return err
+		}
+		logrus.Debug("Removed snapshot ", s)
+	}
+
+	return e.ReconcileSnapshotData(ctx)
+}
+
+// AddSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata
+// available at the time.
+func (e *ETCD) addSnapshotData(sf snapshotFile) error {
+	return retry.OnError(snapshotDataBackoff, func(err error) bool {
+		return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err)
+	}, func() error {
+		// make sure the core.Factory is initialized. There can
+		// be a race between this core code startup.
+ for e.config.Runtime.Core == nil { + runtime.Gosched() + } + snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) + + sfKey := generateSnapshotConfigMapKey(sf) + marshalledSnapshotFile, err := json.Marshal(sf) + if err != nil { + return err + } + if apierrors.IsNotFound(getErr) { + cm := v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: snapshotConfigMapName, + Namespace: metav1.NamespaceSystem, + }, + Data: map[string]string{sfKey: string(marshalledSnapshotFile)}, + } + _, err := e.config.Runtime.Core.Core().V1().ConfigMap().Create(&cm) + return err + } + + if snapshotConfigMap.Data == nil { + snapshotConfigMap.Data = make(map[string]string) + } + + snapshotConfigMap.Data[sfKey] = string(marshalledSnapshotFile) + + _, err = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) + return err + }) +} + +func generateSnapshotConfigMapKey(sf snapshotFile) string { + name := invalidKeyChars.ReplaceAllString(sf.Name, "_") + if sf.NodeName == "s3" { + return "s3-" + name + } + return "local-" + name +} + +// ReconcileSnapshotData reconciles snapshot data in the snapshot ConfigMap. +// It will reconcile snapshot data from disk locally always, and if S3 is enabled, will attempt to list S3 snapshots +// and reconcile snapshots from S3. Notably, +func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error { + logrus.Infof("Reconciling etcd snapshot data in %s ConfigMap", snapshotConfigMapName) + defer logrus.Infof("Reconciliation of snapshot data in %s ConfigMap complete", snapshotConfigMapName) + return retry.OnError(retry.DefaultBackoff, func(err error) bool { + return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) + }, func() error { + // make sure the core.Factory is initialize. There can + // be a race between this core code startup. + for e.config.Runtime.Core == nil { + runtime.Gosched() + } + + logrus.Debug("core.Factory is initialized") + + snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) + if apierrors.IsNotFound(getErr) { + // Can't reconcile what doesn't exist. + return errors.New("No snapshot configmap found") + } + + logrus.Debugf("Attempting to reconcile etcd snapshot data for configmap generation %d", snapshotConfigMap.Generation) + + // if the snapshot config map data is nil, no need to reconcile. + if snapshotConfigMap.Data == nil { + return nil + } + + snapshotFiles, err := e.listLocalSnapshots() + if err != nil { + return err + } + + // s3ListSuccessful is set to true if we are successful at listing snapshots from S3 to eliminate accidental + // clobbering of S3 snapshots in the configmap due to misconfigured S3 credentials/details + s3ListSuccessful := false + + if e.config.EtcdS3 { + if s3Snapshots, err := e.listS3Snapshots(ctx); err != nil { + logrus.Errorf("error retrieving S3 snapshots for reconciliation: %v", err) + } else { + for k, v := range s3Snapshots { + snapshotFiles[k] = v + } + s3ListSuccessful = true + } + } + + nodeName := os.Getenv("NODE_NAME") + + // deletedSnapshots is a map[string]string where key is the configmap key and the value is the marshalled snapshot file + // it will be populated below with snapshots that are either from S3 or on the local node. 
Notably, deletedSnapshots will + // not contain snapshots that are in the "failed" status + deletedSnapshots := make(map[string]string) + // failedSnapshots is a slice of unmarshaled snapshot files sourced from the configmap + // These are stored unmarshaled so we can sort based on name. + var failedSnapshots []snapshotFile + var failedS3Snapshots []snapshotFile + + // remove entries for this node and s3 (if S3 is enabled) only + for k, v := range snapshotConfigMap.Data { + var sf snapshotFile + if err := json.Unmarshal([]byte(v), &sf); err != nil { + return err + } + if (sf.NodeName == nodeName || (sf.NodeName == "s3" && s3ListSuccessful)) && sf.Status != failedSnapshotStatus { + // Only delete the snapshot if the snapshot was not failed + // sf.Status != FailedSnapshotStatus is intentional, as it is possible we are reconciling snapshots stored from older versions that did not set status + deletedSnapshots[generateSnapshotConfigMapKey(sf)] = v // store a copy of the snapshot + delete(snapshotConfigMap.Data, k) + } else if sf.Status == failedSnapshotStatus && sf.NodeName == nodeName && e.config.EtcdSnapshotRetention >= 1 { + // Handle locally failed snapshots. + failedSnapshots = append(failedSnapshots, sf) + delete(snapshotConfigMap.Data, k) + } else if sf.Status == failedSnapshotStatus && e.config.EtcdS3 && sf.NodeName == "s3" && strings.HasPrefix(sf.Name, e.config.EtcdSnapshotName+"-"+nodeName) && e.config.EtcdSnapshotRetention >= 1 { + // If we're operating against S3, we can clean up failed S3 snapshots that failed on this node. + failedS3Snapshots = append(failedS3Snapshots, sf) + delete(snapshotConfigMap.Data, k) + } + } + + // Apply the failed snapshot retention policy to locally failed snapshots + if len(failedSnapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 { + sort.Slice(failedSnapshots, func(i, j int) bool { + return failedSnapshots[i].Name > failedSnapshots[j].Name + }) + + var keepCount int + if e.config.EtcdSnapshotRetention >= len(failedSnapshots) { + keepCount = len(failedSnapshots) + } else { + keepCount = e.config.EtcdSnapshotRetention + } + for _, dfs := range failedSnapshots[:keepCount] { + sfKey := generateSnapshotConfigMapKey(dfs) + marshalledSnapshot, err := json.Marshal(dfs) + if err != nil { + logrus.Errorf("unable to marshal snapshot to store in configmap %v", err) + } else { + snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) + } + } + } + + // Apply the failed snapshot retention policy to the S3 snapshots + if len(failedS3Snapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 { + sort.Slice(failedS3Snapshots, func(i, j int) bool { + return failedS3Snapshots[i].Name > failedS3Snapshots[j].Name + }) + + var keepCount int + if e.config.EtcdSnapshotRetention >= len(failedS3Snapshots) { + keepCount = len(failedS3Snapshots) + } else { + keepCount = e.config.EtcdSnapshotRetention + } + for _, dfs := range failedS3Snapshots[:keepCount] { + sfKey := generateSnapshotConfigMapKey(dfs) + marshalledSnapshot, err := json.Marshal(dfs) + if err != nil { + logrus.Errorf("unable to marshal snapshot to store in configmap %v", err) + } else { + snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) + } + } + } + + // save the local entries to the ConfigMap if they are still on disk or in S3. 
+ for _, snapshot := range snapshotFiles { + var sf snapshotFile + sfKey := generateSnapshotConfigMapKey(snapshot) + if v, ok := deletedSnapshots[sfKey]; ok { + // use the snapshot file we have from the existing configmap, and unmarshal it so we can manipulate it + if err := json.Unmarshal([]byte(v), &sf); err != nil { + logrus.Errorf("error unmarshaling snapshot file: %v", err) + // use the snapshot with info we sourced from disk/S3 (will be missing metadata, but something is better than nothing) + sf = snapshot + } + } else { + sf = snapshot + } + + sf.Status = successfulSnapshotStatus // if the snapshot is on disk or in S3, it was successful. + + marshalledSnapshot, err := json.Marshal(sf) + if err != nil { + logrus.Warnf("unable to marshal snapshot metadata %s to store in configmap, received error: %v", sf.Name, err) + } else { + snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) + } + } + + logrus.Debugf("Updating snapshot ConfigMap (%s) with %d entries", snapshotConfigMapName, len(snapshotConfigMap.Data)) + _, err = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) + return err + }) +} + +// setSnapshotFunction schedules snapshots at the configured interval. +func (e *ETCD) setSnapshotFunction(ctx context.Context) { + skipJob := cron.SkipIfStillRunning(cronLogger) + e.cron.AddJob(e.config.EtcdSnapshotCron, skipJob(cron.FuncJob(func() { + // Add a small amount of jitter to the actual snapshot execution. On clusters with multiple servers, + // having all the nodes take a snapshot at the exact same time can lead to excessive retry thrashing + // when updating the snapshot list configmap. + time.Sleep(time.Duration(rand.Float64() * float64(snapshotJitterMax))) + if err := e.Snapshot(ctx); err != nil { + logrus.Error(err) + } + }))) +} + +// snapshotRetention iterates through the snapshots and removes the oldest +// leaving the desired number of snapshots. 
+func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string) error { + if retention < 1 { + return nil + } + + logrus.Infof("Applying local snapshot retention policy: retention: %d, snapshotPrefix: %s, directory: %s", retention, snapshotPrefix, snapshotDir) + + var snapshotFiles []os.FileInfo + if err := filepath.Walk(snapshotDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if strings.HasPrefix(info.Name(), snapshotPrefix) { + snapshotFiles = append(snapshotFiles, info) + } + return nil + }); err != nil { + return err + } + if len(snapshotFiles) <= retention { + return nil + } + sort.Slice(snapshotFiles, func(firstSnapshot, secondSnapshot int) bool { + // it takes the name from the snapshot file ex: etcd-snapshot-example-{date}, makes the split using "-" to find the date, takes the date and sort by date + firstSnapshotName, secondSnapshotName := strings.Split(snapshotFiles[firstSnapshot].Name(), "-"), strings.Split(snapshotFiles[secondSnapshot].Name(), "-") + firstSnapshotDate, secondSnapshotDate := firstSnapshotName[len(firstSnapshotName)-1], secondSnapshotName[len(secondSnapshotName)-1] + return firstSnapshotDate < secondSnapshotDate + }) + + delCount := len(snapshotFiles) - retention + for _, df := range snapshotFiles[:delCount] { + snapshotPath := filepath.Join(snapshotDir, df.Name()) + logrus.Infof("Removing local snapshot %s", snapshotPath) + if err := os.Remove(snapshotPath); err != nil { + return err + } + } + + return nil +} From 69b549dfc4e9b2810fdbb59433a22c7278e8421a Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Fri, 29 Sep 2023 02:28:11 +0000 Subject: [PATCH 05/14] Elide old snapshot data when apiserver rejects configmap with ErrRequestEntityTooLarge Signed-off-by: Brad Davidson --- pkg/etcd/s3.go | 7 +- pkg/etcd/snapshot.go | 237 +++++++++++++++++++++++++++---------------- 2 files changed, 155 insertions(+), 89 deletions(-) diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go index fe15f8f1f129..0bd09a2c2478 100644 --- a/pkg/etcd/s3.go +++ b/pkg/etcd/s3.go @@ -20,6 +20,7 @@ import ( "github.com/minio/minio-go/v7/pkg/credentials" "github.com/pkg/errors" "github.com/sirupsen/logrus" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -92,7 +93,7 @@ func NewS3(ctx context.Context, config *config.Control) (*S3, error) { // upload uploads the given snapshot to the configured S3 // compatible backend. 
-func (s *S3) upload(ctx context.Context, snapshot, extraMetadata string, now time.Time) (*snapshotFile, error) { +func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.ConfigMap, now time.Time) (*snapshotFile, error) { logrus.Infof("Uploading snapshot %s to S3", snapshot) basename := filepath.Base(snapshot) var snapshotFileName string @@ -115,7 +116,6 @@ func (s *S3) upload(ctx context.Context, snapshot, extraMetadata string, now tim if err != nil { sf = snapshotFile{ Name: filepath.Base(uploadInfo.Key), - Metadata: extraMetadata, NodeName: "s3", CreatedAt: &metav1.Time{ Time: now, @@ -132,6 +132,7 @@ func (s *S3) upload(ctx context.Context, snapshot, extraMetadata string, now tim Folder: s.config.EtcdS3Folder, Insecure: s.config.EtcdS3Insecure, }, + metadataSource: extraMetadata, } logrus.Errorf("Error received during snapshot upload to S3: %s", err) } else { @@ -142,7 +143,6 @@ func (s *S3) upload(ctx context.Context, snapshot, extraMetadata string, now tim sf = snapshotFile{ Name: filepath.Base(uploadInfo.Key), - Metadata: extraMetadata, NodeName: "s3", CreatedAt: &metav1.Time{ Time: ca, @@ -158,6 +158,7 @@ func (s *S3) upload(ctx context.Context, snapshot, extraMetadata string, now tim Folder: s.config.EtcdS3Folder, Insecure: s.config.EtcdS3Insecure, }, + metadataSource: extraMetadata, } } return &sf, nil diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go index e1a12e9e44ea..9c8d210884b0 100644 --- a/pkg/etcd/snapshot.go +++ b/pkg/etcd/snapshot.go @@ -12,6 +12,7 @@ import ( "path/filepath" "runtime" "sort" + "strconv" "strings" "time" @@ -35,6 +36,7 @@ import ( const ( maxConcurrentSnapshots = 1 + pruneStepSize = 5 compressedExtension = ".zip" ) @@ -187,7 +189,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { defer e.snapshotSem.Release(maxConcurrentSnapshots) // make sure the core.Factory is initialized before attempting to add snapshot metadata - var extraMetadata string + var extraMetadata *v1.ConfigMap if e.config.Runtime.Core == nil { logrus.Debugf("Cannot retrieve extra metadata from %s ConfigMap: runtime core not ready", snapshotExtraMetadataConfigMapName) } else { @@ -195,13 +197,8 @@ func (e *ETCD) Snapshot(ctx context.Context) error { if snapshotExtraMetadataConfigMap, err := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotExtraMetadataConfigMapName, metav1.GetOptions{}); err != nil { logrus.Debugf("Error encountered attempting to retrieve extra metadata from %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) } else { - if m, err := json.Marshal(snapshotExtraMetadataConfigMap.Data); err != nil { - logrus.Debugf("Error attempting to marshal extra metadata contained in %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) - } else { - logrus.Debugf("Setting extra metadata from %s ConfigMap", snapshotExtraMetadataConfigMapName) - logrus.Tracef("Marshalled extra metadata in %s ConfigMap was: %s", snapshotExtraMetadataConfigMapName, string(m)) - extraMetadata = base64.StdEncoding.EncodeToString(m) - } + logrus.Debugf("Setting extra metadata from %s ConfigMap", snapshotExtraMetadataConfigMapName) + extraMetadata = snapshotExtraMetadataConfigMap } } @@ -259,15 +256,15 @@ func (e *ETCD) Snapshot(ctx context.Context) error { sf = &snapshotFile{ Name: snapshotName, Location: "", - Metadata: extraMetadata, NodeName: nodeName, CreatedAt: &metav1.Time{ Time: now, }, - Status: failedSnapshotStatus, - Message: base64.StdEncoding.EncodeToString([]byte(err.Error())), - Size: 0, - Compressed: 
e.config.EtcdSnapshotCompress, + Status: failedSnapshotStatus, + Message: base64.StdEncoding.EncodeToString([]byte(err.Error())), + Size: 0, + Compressed: e.config.EtcdSnapshotCompress, + metadataSource: extraMetadata, } logrus.Errorf("Failed to take etcd snapshot: %v", err) if err := e.addSnapshotData(*sf); err != nil { @@ -295,15 +292,15 @@ func (e *ETCD) Snapshot(ctx context.Context) error { } sf = &snapshotFile{ Name: f.Name(), - Metadata: extraMetadata, Location: "file://" + snapshotPath, NodeName: nodeName, CreatedAt: &metav1.Time{ Time: f.ModTime(), }, - Status: successfulSnapshotStatus, - Size: f.Size(), - Compressed: e.config.EtcdSnapshotCompress, + Status: successfulSnapshotStatus, + Size: f.Size(), + Compressed: e.config.EtcdSnapshotCompress, + metadataSource: extraMetadata, } if err := e.addSnapshotData(*sf); err != nil { @@ -321,7 +318,6 @@ func (e *ETCD) Snapshot(ctx context.Context) error { logrus.Warnf("Unable to initialize S3 client: %v", err) sf = &snapshotFile{ Name: filepath.Base(snapshotPath), - Metadata: extraMetadata, NodeName: "s3", CreatedAt: &metav1.Time{ Time: now, @@ -338,6 +334,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { Folder: e.config.EtcdS3Folder, Insecure: e.config.EtcdS3Insecure, }, + metadataSource: extraMetadata, } } // sf should be nil if we were able to successfully initialize the S3 client. @@ -392,6 +389,8 @@ type snapshotFile struct { Status snapshotStatus `json:"status,omitempty"` S3 *s3Config `json:"s3Config,omitempty"` Compressed bool `json:"compressed"` + + metadataSource *v1.ConfigMap `json:"-"` } // listLocalSnapshots provides a list of the currently stored @@ -572,7 +571,7 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error { for obj := range e.s3.client.ListObjects(ctx, e.config.EtcdS3BucketName, opts) { if obj.Err != nil { - logrus.Error(obj.Err) + logrus.Errorf("Failed to list snapshots from S3: %v", obj.Err) return } @@ -630,24 +629,40 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error { return e.ReconcileSnapshotData(ctx) } +func marshalSnapshotFile(sf snapshotFile) ([]byte, error) { + if sf.metadataSource != nil { + if m, err := json.Marshal(sf.metadataSource.Data); err != nil { + logrus.Debugf("Error attempting to marshal extra metadata contained in %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) + } else { + logrus.Tracef("Marshalled extra metadata in %s ConfigMap was: %s", snapshotExtraMetadataConfigMapName, string(m)) + sf.Metadata = base64.StdEncoding.EncodeToString(m) + } + } + return json.Marshal(sf) +} + // AddSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata // available at the time. func (e *ETCD) addSnapshotData(sf snapshotFile) error { + // make sure the core.Factory is initialized. There can + // be a race between this core code startup. + for e.config.Runtime.Core == nil { + runtime.Gosched() + } + + sfKey := generateSnapshotConfigMapKey(sf) + marshalledSnapshotFile, err := marshalSnapshotFile(sf) + if err != nil { + return err + } + + pruneCount := pruneStepSize + var lastErr error return retry.OnError(snapshotDataBackoff, func(err error) bool { - return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) + return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) || isTooLargeError(err) }, func() error { - // make sure the core.Factory is initialized. There can - // be a race between this core code startup. 
- for e.config.Runtime.Core == nil { - runtime.Gosched() - } snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) - sfKey := generateSnapshotConfigMapKey(sf) - marshalledSnapshotFile, err := json.Marshal(sf) - if err != nil { - return err - } if apierrors.IsNotFound(getErr) { cm := v1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -664,10 +679,21 @@ func (e *ETCD) addSnapshotData(sf snapshotFile) error { snapshotConfigMap.Data = make(map[string]string) } + // If the configmap update was rejected due to size, drop the oldest entries from the map. + // We will continue to remove an increasing number of old snapshots from the map until the request succeeds, + // or the number we would attempt to remove exceeds the number stored. + if isTooLargeError(lastErr) { + logrus.Warnf("Snapshot configmap is too large, attempting to elide %d oldest snapshots from list", pruneCount) + if err := pruneConfigMap(snapshotConfigMap, pruneCount); err != nil { + return err + } + pruneCount += pruneStepSize + } + snapshotConfigMap.Data[sfKey] = string(marshalledSnapshotFile) - _, err = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) - return err + _, lastErr = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) + return lastErr }) } @@ -679,34 +705,68 @@ func generateSnapshotConfigMapKey(sf snapshotFile) string { return "local-" + name } +// pruneConfigMap drops the oldest entries from the configMap. +// Note that the actual snapshot files are not removed, just the entries that track them in the configmap. +func pruneConfigMap(snapshotConfigMap *v1.ConfigMap, pruneCount int) error { + if pruneCount > len(snapshotConfigMap.Data) { + return errors.New("unable to reduce snapshot ConfigMap size by eliding old snapshots") + } + + var snapshotFiles []snapshotFile + retention := len(snapshotConfigMap.Data) - pruneCount + for name := range snapshotConfigMap.Data { + basename, compressed := strings.CutSuffix(name, compressedExtension) + ts, _ := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) + snapshotFiles = append(snapshotFiles, snapshotFile{Name: name, CreatedAt: &metav1.Time{Time: time.Unix(ts, 0)}, Compressed: compressed}) + } + + // sort newest-first so we can prune entries past the retention count + sort.Slice(snapshotFiles, func(i, j int) bool { + return snapshotFiles[j].CreatedAt.Before(snapshotFiles[i].CreatedAt) + }) + + for _, snapshotFile := range snapshotFiles[retention:] { + delete(snapshotConfigMap.Data, snapshotFile.Name) + } + return nil +} + // ReconcileSnapshotData reconciles snapshot data in the snapshot ConfigMap. // It will reconcile snapshot data from disk locally always, and if S3 is enabled, will attempt to list S3 snapshots -// and reconcile snapshots from S3. Notably, +// and reconcile snapshots from S3. func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error { + // make sure the core.Factory is initialized. There can + // be a race between this core code startup. 
+	for e.config.Runtime.Core == nil {
+		runtime.Gosched()
+	}
+
 	logrus.Infof("Reconciling etcd snapshot data in %s ConfigMap", snapshotConfigMapName)
 	defer logrus.Infof("Reconciliation of snapshot data in %s ConfigMap complete", snapshotConfigMapName)
+
+	pruneCount := pruneStepSize
+	var lastErr error
 	return retry.OnError(retry.DefaultBackoff, func(err error) bool {
-		return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err)
+		return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) || isTooLargeError(err)
 	}, func() error {
-		// make sure the core.Factory is initialize. There can
-		// be a race between this core code startup.
-		for e.config.Runtime.Core == nil {
-			runtime.Gosched()
-		}
-
-		logrus.Debug("core.Factory is initialized")
-
 		snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{})
 		if apierrors.IsNotFound(getErr) {
-			// Can't reconcile what doesn't exist.
-			return errors.New("No snapshot configmap found")
+			cm := &v1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      snapshotConfigMapName,
+					Namespace: metav1.NamespaceSystem,
+				},
+			}
+			cm, err := e.config.Runtime.Core.Core().V1().ConfigMap().Create(cm)
+			if err != nil {
+				return err
+			}
+			snapshotConfigMap = cm
 		}
 
 		logrus.Debugf("Attempting to reconcile etcd snapshot data for configmap generation %d", snapshotConfigMap.Generation)
-
-		// if the snapshot config map data is nil, no need to reconcile.
 		if snapshotConfigMap.Data == nil {
-			return nil
+			snapshotConfigMap.Data = map[string]string{}
 		}
 
 		snapshotFiles, err := e.listLocalSnapshots()
@@ -716,11 +776,11 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {
 
 		// s3ListSuccessful is set to true if we are successful at listing snapshots from S3 to eliminate accidental
 		// clobbering of S3 snapshots in the configmap due to misconfigured S3 credentials/details
-		s3ListSuccessful := false
+		var s3ListSuccessful bool
 
 		if e.config.EtcdS3 {
 			if s3Snapshots, err := e.listS3Snapshots(ctx); err != nil {
-				logrus.Errorf("error retrieving S3 snapshots for reconciliation: %v", err)
+				logrus.Errorf("Error retrieving S3 snapshots for reconciliation: %v", err)
 			} else {
 				for k, v := range s3Snapshots {
 					snapshotFiles[k] = v
@@ -764,21 +824,16 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {
 
 		// Apply the failed snapshot retention policy to locally failed snapshots
 		if len(failedSnapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 {
+			// sort newest-first so we can record only the retention count
 			sort.Slice(failedSnapshots, func(i, j int) bool {
-				return failedSnapshots[i].Name > failedSnapshots[j].Name
+				return failedSnapshots[j].CreatedAt.Before(failedSnapshots[i].CreatedAt)
 			})
 
-			var keepCount int
-			if e.config.EtcdSnapshotRetention >= len(failedSnapshots) {
-				keepCount = len(failedSnapshots)
-			} else {
-				keepCount = e.config.EtcdSnapshotRetention
-			}
+			keepCount := e.config.EtcdSnapshotRetention
+			if keepCount > len(failedSnapshots) {
+				keepCount = len(failedSnapshots)
+			}
 			for _, dfs := range failedSnapshots[:keepCount] {
 				sfKey := generateSnapshotConfigMapKey(dfs)
-				marshalledSnapshot, err := json.Marshal(dfs)
+				marshalledSnapshot, err := marshalSnapshotFile(dfs)
 				if err != nil {
-					logrus.Errorf("unable to marshal snapshot to store in configmap %v", err)
+					logrus.Errorf("Failed to marshal snapshot to store in configmap: %v", err)
 				} else {
 					snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot)
 				}
@@ -787,21 +842,16 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {
 
 		// Apply the failed snapshot retention policy to the S3 snapshots
 		if len(failedS3Snapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 {
+			// sort newest-first so we can record only the retention count
 			sort.Slice(failedS3Snapshots, func(i, j int) bool {
-				return failedS3Snapshots[i].Name > failedS3Snapshots[j].Name
+				return failedS3Snapshots[j].CreatedAt.Before(failedS3Snapshots[i].CreatedAt)
 			})
 
-			var keepCount int
-			if e.config.EtcdSnapshotRetention >= len(failedS3Snapshots) {
-				keepCount = len(failedS3Snapshots)
-			} else {
-				keepCount = e.config.EtcdSnapshotRetention
-			}
+			keepCount := e.config.EtcdSnapshotRetention
+			if keepCount > len(failedS3Snapshots) {
+				keepCount = len(failedS3Snapshots)
+			}
 			for _, dfs := range failedS3Snapshots[:keepCount] {
 				sfKey := generateSnapshotConfigMapKey(dfs)
-				marshalledSnapshot, err := json.Marshal(dfs)
+				marshalledSnapshot, err := marshalSnapshotFile(dfs)
 				if err != nil {
-					logrus.Errorf("unable to marshal snapshot to store in configmap %v", err)
+					logrus.Errorf("Failed to marshal snapshot to store in configmap: %v", err)
 				} else {
 					snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot)
 				}
@@ -815,7 +865,7 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {
 			if v, ok := deletedSnapshots[sfKey]; ok {
 				// use the snapshot file we have from the existing configmap, and unmarshal it so we can manipulate it
 				if err := json.Unmarshal([]byte(v), &sf); err != nil {
-					logrus.Errorf("error unmarshaling snapshot file: %v", err)
+					logrus.Errorf("Error unmarshaling snapshot file: %v", err)
 					// use the snapshot with info we sourced from disk/S3 (will be missing metadata, but something is better than nothing)
 					sf = snapshot
 				}
@@ -824,18 +874,28 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {
 			}
 
 			sf.Status = successfulSnapshotStatus // if the snapshot is on disk or in S3, it was successful.
-
-			marshalledSnapshot, err := json.Marshal(sf)
+			marshalledSnapshot, err := marshalSnapshotFile(sf)
 			if err != nil {
-				logrus.Warnf("unable to marshal snapshot metadata %s to store in configmap, received error: %v", sf.Name, err)
+				logrus.Warnf("Failed to marshal snapshot metadata %s to store in configmap, received error: %v", sf.Name, err)
 			} else {
 				snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot)
 			}
 		}
 
+		// If the configmap update was rejected due to size, drop the oldest entries from the map.
+		// We will continue to remove an increasing number of old snapshots from the map until the request succeeds,
+		// or the number we would attempt to remove exceeds the number stored.
+		if isTooLargeError(lastErr) {
+			logrus.Warnf("Snapshot configmap is too large, attempting to elide %d oldest snapshots from list", pruneCount)
+			if err := pruneConfigMap(snapshotConfigMap, pruneCount); err != nil {
+				return err
+			}
+			pruneCount += pruneStepSize
+		}
+
 		logrus.Debugf("Updating snapshot ConfigMap (%s) with %d entries", snapshotConfigMapName, len(snapshotConfigMap.Data))
-		_, err = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap)
-		return err
+		_, lastErr = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap)
+		return lastErr
 	})
 }
 
@@ -848,7 +908,7 @@ func (e *ETCD) setSnapshotFunction(ctx context.Context) {
 			// when updating the snapshot list configmap.
 			time.Sleep(time.Duration(rand.Float64() * float64(snapshotJitterMax)))
 			if err := e.Snapshot(ctx); err != nil {
-				logrus.Error(err)
+				logrus.Errorf("Failed to take scheduled snapshot: %v", err)
 			}
 		})))
 }
@@ -862,13 +922,15 @@ func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string)
 
 	logrus.Infof("Applying local snapshot retention policy: retention: %d, snapshotPrefix: %s, directory: %s", retention, snapshotPrefix, snapshotDir)
 
-	var snapshotFiles []os.FileInfo
+	var snapshotFiles []snapshotFile
 	if err := filepath.Walk(snapshotDir, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
 			return err
 		}
 		if strings.HasPrefix(info.Name(), snapshotPrefix) {
-			snapshotFiles = append(snapshotFiles, info)
+			basename, compressed := strings.CutSuffix(info.Name(), compressedExtension)
+			ts, _ := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64)
+			snapshotFiles = append(snapshotFiles, snapshotFile{Name: info.Name(), CreatedAt: &metav1.Time{Time: time.Unix(ts, 0)}, Compressed: compressed})
 		}
 		return nil
 	}); err != nil {
@@ -877,16 +939,14 @@ func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string)
 	if len(snapshotFiles) <= retention {
 		return nil
 	}
-	sort.Slice(snapshotFiles, func(firstSnapshot, secondSnapshot int) bool {
-		// it takes the name from the snapshot file ex: etcd-snapshot-example-{date}, makes the split using "-" to find the date, takes the date and sort by date
-		firstSnapshotName, secondSnapshotName := strings.Split(snapshotFiles[firstSnapshot].Name(), "-"), strings.Split(snapshotFiles[secondSnapshot].Name(), "-")
-		firstSnapshotDate, secondSnapshotDate := firstSnapshotName[len(firstSnapshotName)-1], secondSnapshotName[len(secondSnapshotName)-1]
-		return firstSnapshotDate < secondSnapshotDate
+
+	// sort newest-first so we can prune entries past the retention count
+	sort.Slice(snapshotFiles, func(i, j int) bool {
+		return snapshotFiles[j].CreatedAt.Before(snapshotFiles[i].CreatedAt)
 	})
 
-	delCount := len(snapshotFiles) - retention
-	for _, df := range snapshotFiles[:delCount] {
-		snapshotPath := filepath.Join(snapshotDir, df.Name())
+	for _, df := range snapshotFiles[retention:] {
+		snapshotPath := filepath.Join(snapshotDir, df.Name)
 		logrus.Infof("Removing local snapshot %s", snapshotPath)
 		if err := os.Remove(snapshotPath); err != nil {
 			return err
@@ -895,3 +955,8 @@ func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string)
 
 	return nil
 }
+
+func isTooLargeError(err error) bool {
+	// There are no helpers for unpacking field validation errors, so we just check for "Too long" in the error string.
+	return apierrors.IsRequestEntityTooLargeError(err) || (apierrors.IsInvalid(err) && strings.Contains(err.Error(), "Too long"))
+}

From df1cfc0491197945d28d9adf475e9a92f69bd6fe Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Fri, 29 Sep 2023 08:42:34 +0000
Subject: [PATCH 06/14] Tidy s3 upload functions

Consistently refer to object keys as such, simplify error handling.
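Object keys are always separated by forward slashes regardless of the host OS, which is why the key-building code below moves from filepath.Join to path.Join: filepath.Join uses the OS-specific separator (backslash on Windows), while path.Join always joins with "/". Both functions ignore empty elements, so the old `if s.config.EtcdS3Folder != ""` branches become unnecessary. A minimal sketch of the difference (the folder and snapshot names here are invented for illustration):

```go
package main

import (
	"fmt"
	"path"
	"path/filepath"
)

func main() {
	// path.Join always joins with "/" and drops empty elements, which is
	// what S3 object keys require; an empty folder needs no special case.
	fmt.Println(path.Join("", "etcd-snapshot-node1-1696000000"))        // etcd-snapshot-node1-1696000000
	fmt.Println(path.Join("backups", "etcd-snapshot-node1-1696000000")) // backups/etcd-snapshot-node1-1696000000

	// filepath.Join uses the OS-specific separator, so on Windows it would
	// yield `backups\etcd-snapshot-node1-1696000000`, which is not a valid
	// object key prefix.
	fmt.Println(filepath.Join("backups", "etcd-snapshot-node1-1696000000"))
}
```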
Signed-off-by: Brad Davidson --- pkg/etcd/s3.go | 110 +++++++++++++------------------------------ pkg/etcd/snapshot.go | 29 +++++------- 2 files changed, 45 insertions(+), 94 deletions(-) diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go index 0bd09a2c2478..dcd061d93c8b 100644 --- a/pkg/etcd/s3.go +++ b/pkg/etcd/s3.go @@ -10,6 +10,7 @@ import ( "io" "net/http" "os" + "path" "path/filepath" "sort" "strings" @@ -96,89 +97,56 @@ func NewS3(ctx context.Context, config *config.Control) (*S3, error) { func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.ConfigMap, now time.Time) (*snapshotFile, error) { logrus.Infof("Uploading snapshot %s to S3", snapshot) basename := filepath.Base(snapshot) - var snapshotFileName string - var sf snapshotFile - if s.config.EtcdS3Folder != "" { - snapshotFileName = filepath.Join(s.config.EtcdS3Folder, basename) - } else { - snapshotFileName = basename + sf := &snapshotFile{ + Name: basename, + NodeName: "s3", + CreatedAt: &metav1.Time{}, + S3: &s3Config{ + Endpoint: s.config.EtcdS3Endpoint, + EndpointCA: s.config.EtcdS3EndpointCA, + SkipSSLVerify: s.config.EtcdS3SkipSSLVerify, + Bucket: s.config.EtcdS3BucketName, + Region: s.config.EtcdS3Region, + Folder: s.config.EtcdS3Folder, + Insecure: s.config.EtcdS3Insecure, + }, + metadataSource: extraMetadata, } + snapshotKey := path.Join(s.config.EtcdS3Folder, basename) + toCtx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) defer cancel() opts := minio.PutObjectOptions{NumThreads: 2} if strings.HasSuffix(snapshot, compressedExtension) { opts.ContentType = "application/zip" + sf.Compressed = true } else { opts.ContentType = "application/octet-stream" } - uploadInfo, err := s.client.FPutObject(toCtx, s.config.EtcdS3BucketName, snapshotFileName, snapshot, opts) + uploadInfo, err := s.client.FPutObject(toCtx, s.config.EtcdS3BucketName, snapshotKey, snapshot, opts) if err != nil { - sf = snapshotFile{ - Name: filepath.Base(uploadInfo.Key), - NodeName: "s3", - CreatedAt: &metav1.Time{ - Time: now, - }, - Message: base64.StdEncoding.EncodeToString([]byte(err.Error())), - Size: 0, - Status: failedSnapshotStatus, - S3: &s3Config{ - Endpoint: s.config.EtcdS3Endpoint, - EndpointCA: s.config.EtcdS3EndpointCA, - SkipSSLVerify: s.config.EtcdS3SkipSSLVerify, - Bucket: s.config.EtcdS3BucketName, - Region: s.config.EtcdS3Region, - Folder: s.config.EtcdS3Folder, - Insecure: s.config.EtcdS3Insecure, - }, - metadataSource: extraMetadata, - } - logrus.Errorf("Error received during snapshot upload to S3: %s", err) + sf.CreatedAt.Time = now + sf.Status = failedSnapshotStatus + sf.Message = base64.StdEncoding.EncodeToString([]byte(err.Error())) } else { - ca, err := time.Parse(time.RFC3339, uploadInfo.LastModified.Format(time.RFC3339)) - if err != nil { - return nil, err - } - - sf = snapshotFile{ - Name: filepath.Base(uploadInfo.Key), - NodeName: "s3", - CreatedAt: &metav1.Time{ - Time: ca, - }, - Size: uploadInfo.Size, - Status: successfulSnapshotStatus, - S3: &s3Config{ - Endpoint: s.config.EtcdS3Endpoint, - EndpointCA: s.config.EtcdS3EndpointCA, - SkipSSLVerify: s.config.EtcdS3SkipSSLVerify, - Bucket: s.config.EtcdS3BucketName, - Region: s.config.EtcdS3Region, - Folder: s.config.EtcdS3Folder, - Insecure: s.config.EtcdS3Insecure, - }, - metadataSource: extraMetadata, - } + sf.CreatedAt.Time = uploadInfo.LastModified + sf.Status = successfulSnapshotStatus + sf.Size = uploadInfo.Size } - return &sf, nil + return sf, err } // download downloads the given snapshot from the configured S3 // compatible backend. 
func (s *S3) Download(ctx context.Context) error { - var remotePath string - if s.config.EtcdS3Folder != "" { - remotePath = filepath.Join(s.config.EtcdS3Folder, s.config.ClusterResetRestorePath) - } else { - remotePath = s.config.ClusterResetRestorePath - } + snapshotKey := path.Join(s.config.EtcdS3Folder, s.config.ClusterResetRestorePath) - logrus.Debugf("retrieving snapshot: %s", remotePath) + logrus.Debugf("retrieving snapshot: %s", snapshotKey) toCtx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) defer cancel() - r, err := s.client.GetObject(toCtx, s.config.EtcdS3BucketName, remotePath, minio.GetObjectOptions{}) + r, err := s.client.GetObject(toCtx, s.config.EtcdS3BucketName, snapshotKey, minio.GetObjectOptions{}) if err != nil { return nil } @@ -213,14 +181,7 @@ func (s *S3) Download(ctx context.Context) error { // snapshotPrefix returns the prefix used in the // naming of the snapshots. func (s *S3) snapshotPrefix() string { - fullSnapshotPrefix := s.config.EtcdSnapshotName - var prefix string - if s.config.EtcdS3Folder != "" { - prefix = filepath.Join(s.config.EtcdS3Folder, fullSnapshotPrefix) - } else { - prefix = fullSnapshotPrefix - } - return prefix + return path.Join(s.config.EtcdS3Folder, s.config.EtcdSnapshotName) } // snapshotRetention prunes snapshots in the configured S3 compatible backend for this specific node. @@ -250,15 +211,12 @@ func (s *S3) snapshotRetention(ctx context.Context) error { return nil } - sort.Slice(snapshotFiles, func(firstSnapshot, secondSnapshot int) bool { - // it takes the key from the snapshot file ex: etcd-snapshot-example-{date}, makes the split using "-" to find the date, takes the date and sort by date - firstSnapshotName, secondSnapshotName := strings.Split(snapshotFiles[firstSnapshot].Key, "-"), strings.Split(snapshotFiles[secondSnapshot].Key, "-") - firstSnapshotDate, secondSnapshotDate := firstSnapshotName[len(firstSnapshotName)-1], secondSnapshotName[len(secondSnapshotName)-1] - return firstSnapshotDate < secondSnapshotDate + // sort newest-first so we can prune entries past the retention count + sort.Slice(snapshotFiles, func(i, j int) bool { + return snapshotFiles[j].LastModified.Before(snapshotFiles[i].LastModified) }) - delCount := len(snapshotFiles) - s.config.EtcdSnapshotRetention - for _, df := range snapshotFiles[:delCount] { + for _, df := range snapshotFiles[s.config.EtcdSnapshotRetention:] { logrus.Infof("Removing S3 snapshot: %s", df.Key) if err := s.client.RemoveObject(ctx, s.config.EtcdS3BucketName, df.Key, minio.RemoveObjectOptions{}); err != nil { return err diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go index 9c8d210884b0..ced9f90f5b51 100644 --- a/pkg/etcd/snapshot.go +++ b/pkg/etcd/snapshot.go @@ -312,8 +312,6 @@ func (e *ETCD) Snapshot(ctx context.Context) error { if e.config.EtcdS3 { logrus.Infof("Saving etcd snapshot %s to S3", snapshotName) - // Set sf to nil so that we can attempt to now upload the snapshot to S3 if needed - sf = nil if err := e.initS3IfNil(ctx); err != nil { logrus.Warnf("Unable to initialize S3 client: %v", err) sf = &snapshotFile{ @@ -336,21 +334,23 @@ func (e *ETCD) Snapshot(ctx context.Context) error { }, metadataSource: extraMetadata, } - } - // sf should be nil if we were able to successfully initialize the S3 client. - if sf == nil { + } else { + // upload will return a snapshotFile even on error - if there was an + // error, it will be reflected in the status and message. 
sf, err = e.s3.upload(ctx, snapshotPath, extraMetadata, now) if err != nil { - return err - } - logrus.Infof("S3 upload complete for %s", snapshotName) - if err := e.s3.snapshotRetention(ctx); err != nil { - return errors.Wrap(err, "failed to apply s3 snapshot retention policy") + logrus.Errorf("Error received during snapshot upload to S3: %s", err) + } else { + logrus.Infof("S3 upload complete for %s", snapshotName) } } if err := e.addSnapshotData(*sf); err != nil { return errors.Wrap(err, "failed to save snapshot data to configmap") } + if err := e.s3.snapshotRetention(ctx); err != nil { + logrus.Errorf("Failed to apply s3 snapshot retention policy: %v", err) + } + } } @@ -463,17 +463,11 @@ func (e *ETCD) listS3Snapshots(ctx context.Context) (map[string]snapshotFile, er if obj.Size == 0 { continue } - - ca, err := time.Parse(time.RFC3339, obj.LastModified.Format(time.RFC3339)) - if err != nil { - return nil, err - } - sf := snapshotFile{ Name: filepath.Base(obj.Key), NodeName: "s3", CreatedAt: &metav1.Time{ - Time: ca, + Time: obj.LastModified, }, Size: obj.Size, S3: &s3Config{ @@ -634,7 +628,6 @@ func marshalSnapshotFile(sf snapshotFile) ([]byte, error) { if m, err := json.Marshal(sf.metadataSource.Data); err != nil { logrus.Debugf("Error attempting to marshal extra metadata contained in %s ConfigMap, error: %v", snapshotExtraMetadataConfigMapName, err) } else { - logrus.Tracef("Marshalled extra metadata in %s ConfigMap was: %s", snapshotExtraMetadataConfigMapName, string(m)) sf.Metadata = base64.StdEncoding.EncodeToString(m) } } From af03eafc446cdaca8cb7d2336ab2f34293f60df7 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Fri, 29 Sep 2023 16:59:24 +0000 Subject: [PATCH 07/14] Consistently set snapshotFile timestamp Attempt to use timestamp from creation or filename instead of file/object modification times Signed-off-by: Brad Davidson --- pkg/etcd/s3.go | 10 ++++---- pkg/etcd/snapshot.go | 60 ++++++++++++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go index dcd061d93c8b..0ec774e120e7 100644 --- a/pkg/etcd/s3.go +++ b/pkg/etcd/s3.go @@ -98,9 +98,11 @@ func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.Conf logrus.Infof("Uploading snapshot %s to S3", snapshot) basename := filepath.Base(snapshot) sf := &snapshotFile{ - Name: basename, - NodeName: "s3", - CreatedAt: &metav1.Time{}, + Name: basename, + NodeName: "s3", + CreatedAt: &metav1.Time{ + Time: now, + }, S3: &s3Config{ Endpoint: s.config.EtcdS3Endpoint, EndpointCA: s.config.EtcdS3EndpointCA, @@ -126,11 +128,9 @@ func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.Conf } uploadInfo, err := s.client.FPutObject(toCtx, s.config.EtcdS3BucketName, snapshotKey, snapshot, opts) if err != nil { - sf.CreatedAt.Time = now sf.Status = failedSnapshotStatus sf.Message = base64.StdEncoding.EncodeToString([]byte(err.Error())) } else { - sf.CreatedAt.Time = uploadInfo.LastModified sf.Status = successfulSnapshotStatus sf.Size = uploadInfo.Size } diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go index ced9f90f5b51..e07b5a3740f7 100644 --- a/pkg/etcd/snapshot.go +++ b/pkg/etcd/snapshot.go @@ -9,6 +9,7 @@ import ( "io" "math/rand" "os" + "path" "path/filepath" "runtime" "sort" @@ -93,7 +94,7 @@ func (e *ETCD) preSnapshotSetup(ctx context.Context) error { // compressSnapshot compresses the given snapshot and provides the // caller with the path to the file. 
-func (e *ETCD) compressSnapshot(snapshotDir, snapshotName, snapshotPath string) (string, error) { +func (e *ETCD) compressSnapshot(snapshotDir, snapshotName, snapshotPath string, now time.Time) (string, error) { logrus.Info("Compressing etcd snapshot file: " + snapshotName) zippedSnapshotName := snapshotName + compressedExtension @@ -130,7 +131,7 @@ func (e *ETCD) compressSnapshot(snapshotDir, snapshotName, snapshotPath string) header.Name = snapshotName header.Method = zip.Deflate - header.Modified = time.Now() + header.Modified = now writer, err := zipWriter.CreateHeader(header) if err != nil { @@ -239,7 +240,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { } nodeName := os.Getenv("NODE_NAME") - now := time.Now() + now := time.Now().Round(time.Second) snapshotName := fmt.Sprintf("%s-%s-%d", e.config.EtcdSnapshotName, nodeName, now.Unix()) snapshotPath := filepath.Join(snapshotDir, snapshotName) @@ -273,7 +274,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { } if e.config.EtcdSnapshotCompress { - zipPath, err := e.compressSnapshot(snapshotDir, snapshotName, snapshotPath) + zipPath, err := e.compressSnapshot(snapshotDir, snapshotName, snapshotPath, now) if err != nil { return err } @@ -295,7 +296,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { Location: "file://" + snapshotPath, NodeName: nodeName, CreatedAt: &metav1.Time{ - Time: f.ModTime(), + Time: now, }, Status: successfulSnapshotStatus, Size: f.Size(), @@ -397,36 +398,39 @@ type snapshotFile struct { // snapshots on disk along with their relevant // metadata. func (e *ETCD) listLocalSnapshots() (map[string]snapshotFile, error) { + nodeName := os.Getenv("NODE_NAME") snapshots := make(map[string]snapshotFile) snapshotDir, err := snapshotDir(e.config, true) if err != nil { return snapshots, errors.Wrap(err, "failed to get the snapshot dir") } - dirEntries, err := os.ReadDir(snapshotDir) - if err != nil { - return nil, err - } - - nodeName := os.Getenv("NODE_NAME") + if err := filepath.Walk(snapshotDir, func(path string, file os.FileInfo, err error) error { + if file.IsDir() || err != nil { + return err + } - for _, de := range dirEntries { - file, err := de.Info() + basename, compressed := strings.CutSuffix(file.Name(), compressedExtension) + ts, err := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) if err != nil { - return nil, err + ts = file.ModTime().Unix() } sf := snapshotFile{ Name: file.Name(), Location: "file://" + filepath.Join(snapshotDir, file.Name()), NodeName: nodeName, CreatedAt: &metav1.Time{ - Time: file.ModTime(), + Time: time.Unix(ts, 0), }, - Size: file.Size(), - Status: successfulSnapshotStatus, + Size: file.Size(), + Status: successfulSnapshotStatus, + Compressed: compressed, } sfKey := generateSnapshotConfigMapKey(sf) snapshots[sfKey] = sf + return nil + }); err != nil { + return nil, err } return snapshots, nil @@ -463,11 +467,19 @@ func (e *ETCD) listS3Snapshots(ctx context.Context) (map[string]snapshotFile, er if obj.Size == 0 { continue } + + filename := path.Base(obj.Key) + basename, compressed := strings.CutSuffix(filename, compressedExtension) + ts, err := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) + if err != nil { + ts = obj.LastModified.Unix() + } + sf := snapshotFile{ - Name: filepath.Base(obj.Key), + Name: filename, NodeName: "s3", CreatedAt: &metav1.Time{ - Time: obj.LastModified, + Time: time.Unix(ts, 0), }, Size: obj.Size, S3: &s3Config{ @@ -479,7 +491,8 @@ func (e *ETCD) listS3Snapshots(ctx context.Context) 
(map[string]snapshotFile, er Folder: e.config.EtcdS3Folder, Insecure: e.config.EtcdS3Insecure, }, - Status: successfulSnapshotStatus, + Status: successfulSnapshotStatus, + Compressed: compressed, } sfKey := generateSnapshotConfigMapKey(sf) snapshots[sfKey] = sf @@ -917,12 +930,15 @@ func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string) var snapshotFiles []snapshotFile if err := filepath.Walk(snapshotDir, func(path string, info os.FileInfo, err error) error { - if err != nil { + if info.IsDir() || err != nil { return err } if strings.HasPrefix(info.Name(), snapshotPrefix) { basename, compressed := strings.CutSuffix(info.Name(), compressedExtension) - ts, _ := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) + ts, err := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) + if err != nil { + ts = info.ModTime().Unix() + } snapshotFiles = append(snapshotFiles, snapshotFile{Name: info.Name(), CreatedAt: &metav1.Time{Time: time.Unix(ts, 0)}, Compressed: compressed}) } return nil From 29cfd48733b61ca65f7aa1e6cd22530f30ccd74e Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Sat, 30 Sep 2023 01:58:48 +0000 Subject: [PATCH 08/14] Move s3 snapshot list functionality to s3.go Also, don't list ONLY s3 snapshots if S3 is enabled. Signed-off-by: Brad Davidson --- pkg/etcd/s3.go | 59 +++++++++++++++++++++++++++++ pkg/etcd/snapshot.go | 90 ++++++++++---------------------------------- 2 files changed, 79 insertions(+), 70 deletions(-) diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go index 0ec774e120e7..e38a58ed88c9 100644 --- a/pkg/etcd/s3.go +++ b/pkg/etcd/s3.go @@ -13,6 +13,7 @@ import ( "path" "path/filepath" "sort" + "strconv" "strings" "time" @@ -226,6 +227,64 @@ func (s *S3) snapshotRetention(ctx context.Context) error { return nil } +// listSnapshots provides a list of currently stored +// snapshots in S3 along with their relevant +// metadata. 
+func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) { + snapshots := make(map[string]snapshotFile) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + var loo minio.ListObjectsOptions + if s.config.EtcdS3Folder != "" { + loo = minio.ListObjectsOptions{ + Prefix: s.config.EtcdS3Folder, + Recursive: true, + } + } + + objects := s.client.ListObjects(ctx, s.config.EtcdS3BucketName, loo) + + for obj := range objects { + if obj.Err != nil { + return nil, obj.Err + } + if obj.Size == 0 { + continue + } + + filename := path.Base(obj.Key) + basename, compressed := strings.CutSuffix(filename, compressedExtension) + ts, err := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) + if err != nil { + ts = obj.LastModified.Unix() + } + + sf := snapshotFile{ + Name: filename, + NodeName: "s3", + CreatedAt: &metav1.Time{ + Time: time.Unix(ts, 0), + }, + Size: obj.Size, + S3: &s3Config{ + Endpoint: s.config.EtcdS3Endpoint, + EndpointCA: s.config.EtcdS3EndpointCA, + SkipSSLVerify: s.config.EtcdS3SkipSSLVerify, + Bucket: s.config.EtcdS3BucketName, + Region: s.config.EtcdS3Region, + Folder: s.config.EtcdS3Folder, + Insecure: s.config.EtcdS3Insecure, + }, + Status: successfulSnapshotStatus, + Compressed: compressed, + } + sfKey := generateSnapshotConfigMapKey(sf) + snapshots[sfKey] = sf + } + return snapshots, nil +} + func readS3EndpointCA(endpointCA string) ([]byte, error) { ca, err := base64.StdEncoding.DecodeString(endpointCA) if err != nil { diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go index e07b5a3740f7..d640b69eadb9 100644 --- a/pkg/etcd/snapshot.go +++ b/pkg/etcd/snapshot.go @@ -9,7 +9,6 @@ import ( "io" "math/rand" "os" - "path" "path/filepath" "runtime" "sort" @@ -436,71 +435,6 @@ func (e *ETCD) listLocalSnapshots() (map[string]snapshotFile, error) { return snapshots, nil } -// listS3Snapshots provides a list of currently stored -// snapshots in S3 along with their relevant -// metadata. -func (e *ETCD) listS3Snapshots(ctx context.Context) (map[string]snapshotFile, error) { - snapshots := make(map[string]snapshotFile) - - if e.config.EtcdS3 { - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - if err := e.initS3IfNil(ctx); err != nil { - return nil, err - } - - var loo minio.ListObjectsOptions - if e.config.EtcdS3Folder != "" { - loo = minio.ListObjectsOptions{ - Prefix: e.config.EtcdS3Folder, - Recursive: true, - } - } - - objects := e.s3.client.ListObjects(ctx, e.config.EtcdS3BucketName, loo) - - for obj := range objects { - if obj.Err != nil { - return nil, obj.Err - } - if obj.Size == 0 { - continue - } - - filename := path.Base(obj.Key) - basename, compressed := strings.CutSuffix(filename, compressedExtension) - ts, err := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) - if err != nil { - ts = obj.LastModified.Unix() - } - - sf := snapshotFile{ - Name: filename, - NodeName: "s3", - CreatedAt: &metav1.Time{ - Time: time.Unix(ts, 0), - }, - Size: obj.Size, - S3: &s3Config{ - Endpoint: e.config.EtcdS3Endpoint, - EndpointCA: e.config.EtcdS3EndpointCA, - SkipSSLVerify: e.config.EtcdS3SkipSSLVerify, - Bucket: e.config.EtcdS3BucketName, - Region: e.config.EtcdS3Region, - Folder: e.config.EtcdS3Folder, - Insecure: e.config.EtcdS3Insecure, - }, - Status: successfulSnapshotStatus, - Compressed: compressed, - } - sfKey := generateSnapshotConfigMapKey(sf) - snapshots[sfKey] = sf - } - } - return snapshots, nil -} - // initS3IfNil initializes the S3 client // if it hasn't yet been initialized. 
func (e *ETCD) initS3IfNil(ctx context.Context) error { @@ -535,17 +469,33 @@ func (e *ETCD) PruneSnapshots(ctx context.Context) error { } } } - return e.ReconcileSnapshotData(ctx) } // ListSnapshots is an exported wrapper method that wraps an // unexported method of the same name. func (e *ETCD) ListSnapshots(ctx context.Context) (map[string]snapshotFile, error) { + snapshotFiles := map[string]snapshotFile{} if e.config.EtcdS3 { - return e.listS3Snapshots(ctx) + if err := e.initS3IfNil(ctx); err != nil { + return nil, err + } + sfs, err := e.s3.listSnapshots(ctx) + if err != nil { + return nil, err + } + snapshotFiles = sfs + } + + sfs, err := e.listLocalSnapshots() + if err != nil { + return nil, err } - return e.listLocalSnapshots() + for k, sf := range sfs { + snapshotFiles[k] = sf + } + + return snapshotFiles, err } // deleteSnapshots removes the given snapshots from @@ -785,7 +735,7 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error { var s3ListSuccessful bool if e.config.EtcdS3 { - if s3Snapshots, err := e.listS3Snapshots(ctx); err != nil { + if s3Snapshots, err := e.s3.listSnapshots(ctx); err != nil { logrus.Errorf("Error retrieving S3 snapshots for reconciliation: %v", err) } else { for k, v := range s3Snapshots { From c574a776a39f60abc6e7785bcf351f91d8e98593 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Mon, 2 Oct 2023 23:20:22 +0000 Subject: [PATCH 09/14] Store extra metadata and cluster ID for snapshots Write the extra metadata both locally and to S3. These files are placed such that they will not be used by older versions of K3s that do not make use of them. Signed-off-by: Brad Davidson --- pkg/cli/etcdsnapshot/etcd_snapshot.go | 177 ++++++++------------ pkg/cluster/bootstrap.go | 9 -- pkg/cluster/bootstrap_test.go | 47 ------ pkg/daemons/config/types.go | 2 + pkg/etcd/s3.go | 224 +++++++++++++++++++------- pkg/etcd/snapshot.go | 116 +++++++++---- pkg/server/server.go | 1 + 7 files changed, 324 insertions(+), 252 deletions(-) diff --git a/pkg/cli/etcdsnapshot/etcd_snapshot.go b/pkg/cli/etcdsnapshot/etcd_snapshot.go index 714ccc982a4b..93dd738cd734 100644 --- a/pkg/cli/etcdsnapshot/etcd_snapshot.go +++ b/pkg/cli/etcdsnapshot/etcd_snapshot.go @@ -1,6 +1,7 @@ package etcdsnapshot import ( + "context" "encoding/json" "errors" "fmt" @@ -12,8 +13,7 @@ import ( "github.com/erikdubbelboer/gspt" "github.com/k3s-io/k3s/pkg/cli/cmds" - "github.com/k3s-io/k3s/pkg/cluster" - "github.com/k3s-io/k3s/pkg/daemons/config" + daemonconfig "github.com/k3s-io/k3s/pkg/daemons/config" "github.com/k3s-io/k3s/pkg/etcd" "github.com/k3s-io/k3s/pkg/server" util2 "github.com/k3s-io/k3s/pkg/util" @@ -22,16 +22,22 @@ import ( "gopkg.in/yaml.v2" ) +type etcdCommand struct { + etcd *etcd.ETCD + ctx context.Context +} + // commandSetup setups up common things needed // for each etcd command. 
-func commandSetup(app *cli.Context, cfg *cmds.Server, sc *server.Config) error { +func commandSetup(app *cli.Context, cfg *cmds.Server, config *server.Config) (*etcdCommand, error) { + ctx := signals.SetupSignalContext() gspt.SetProcTitle(os.Args[0]) nodeName := app.String("node-name") if nodeName == "" { h, err := os.Hostname() if err != nil { - return err + return nil, err } nodeName = h } @@ -40,33 +46,53 @@ func commandSetup(app *cli.Context, cfg *cmds.Server, sc *server.Config) error { dataDir, err := server.ResolveDataDir(cfg.DataDir) if err != nil { - return err + return nil, err + } + + config.DisableAgent = true + config.ControlConfig.DataDir = dataDir + config.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName + config.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir + config.ControlConfig.EtcdSnapshotCompress = cfg.EtcdSnapshotCompress + config.ControlConfig.EtcdListFormat = strings.ToLower(cfg.EtcdListFormat) + config.ControlConfig.EtcdS3 = cfg.EtcdS3 + config.ControlConfig.EtcdS3Endpoint = cfg.EtcdS3Endpoint + config.ControlConfig.EtcdS3EndpointCA = cfg.EtcdS3EndpointCA + config.ControlConfig.EtcdS3SkipSSLVerify = cfg.EtcdS3SkipSSLVerify + config.ControlConfig.EtcdS3AccessKey = cfg.EtcdS3AccessKey + config.ControlConfig.EtcdS3SecretKey = cfg.EtcdS3SecretKey + config.ControlConfig.EtcdS3BucketName = cfg.EtcdS3BucketName + config.ControlConfig.EtcdS3Region = cfg.EtcdS3Region + config.ControlConfig.EtcdS3Folder = cfg.EtcdS3Folder + config.ControlConfig.EtcdS3Insecure = cfg.EtcdS3Insecure + config.ControlConfig.EtcdS3Timeout = cfg.EtcdS3Timeout + config.ControlConfig.Runtime = daemonconfig.NewRuntime(nil) + config.ControlConfig.Runtime.ETCDServerCA = filepath.Join(dataDir, "tls", "etcd", "server-ca.crt") + config.ControlConfig.Runtime.ClientETCDCert = filepath.Join(dataDir, "tls", "etcd", "client.crt") + config.ControlConfig.Runtime.ClientETCDKey = filepath.Join(dataDir, "tls", "etcd", "client.key") + config.ControlConfig.Runtime.KubeConfigAdmin = filepath.Join(dataDir, "cred", "admin.kubeconfig") + + e := etcd.NewETCD() + if err := e.SetControlConfig(&config.ControlConfig); err != nil { + return nil, err + } + + initialized, err := e.IsInitialized() + if err != nil { + return nil, err + } + if !initialized { + return nil, fmt.Errorf("etcd database not found in %s", config.ControlConfig.DataDir) } - sc.DisableAgent = true - sc.ControlConfig.DataDir = dataDir - sc.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName - sc.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir - sc.ControlConfig.EtcdSnapshotCompress = cfg.EtcdSnapshotCompress - sc.ControlConfig.EtcdListFormat = strings.ToLower(cfg.EtcdListFormat) - sc.ControlConfig.EtcdS3 = cfg.EtcdS3 - sc.ControlConfig.EtcdS3Endpoint = cfg.EtcdS3Endpoint - sc.ControlConfig.EtcdS3EndpointCA = cfg.EtcdS3EndpointCA - sc.ControlConfig.EtcdS3SkipSSLVerify = cfg.EtcdS3SkipSSLVerify - sc.ControlConfig.EtcdS3AccessKey = cfg.EtcdS3AccessKey - sc.ControlConfig.EtcdS3SecretKey = cfg.EtcdS3SecretKey - sc.ControlConfig.EtcdS3BucketName = cfg.EtcdS3BucketName - sc.ControlConfig.EtcdS3Region = cfg.EtcdS3Region - sc.ControlConfig.EtcdS3Folder = cfg.EtcdS3Folder - sc.ControlConfig.EtcdS3Insecure = cfg.EtcdS3Insecure - sc.ControlConfig.EtcdS3Timeout = cfg.EtcdS3Timeout - sc.ControlConfig.Runtime = config.NewRuntime(nil) - sc.ControlConfig.Runtime.ETCDServerCA = filepath.Join(dataDir, "tls", "etcd", "server-ca.crt") - sc.ControlConfig.Runtime.ClientETCDCert = filepath.Join(dataDir, "tls", "etcd", "client.crt") - 
sc.ControlConfig.Runtime.ClientETCDKey = filepath.Join(dataDir, "tls", "etcd", "client.key") - sc.ControlConfig.Runtime.KubeConfigAdmin = filepath.Join(dataDir, "cred", "admin.kubeconfig") + sc, err := server.NewContext(ctx, config.ControlConfig.Runtime.KubeConfigAdmin, false) + if err != nil { + return nil, err + } + config.ControlConfig.Runtime.K3s = sc.K3s + config.ControlConfig.Runtime.Core = sc.Core - return nil + return &etcdCommand{etcd: e, ctx: ctx}, nil } // Save triggers an on-demand etcd snapshot operation @@ -80,43 +106,18 @@ func Save(app *cli.Context) error { func save(app *cli.Context, cfg *cmds.Server) error { var serverConfig server.Config - if err := commandSetup(app, cfg, &serverConfig); err != nil { - return err - } - if len(app.Args()) > 0 { return util2.ErrCommandNoArgs } - serverConfig.ControlConfig.EtcdSnapshotRetention = 0 // disable retention check - - ctx := signals.SetupSignalContext() - e := etcd.NewETCD() - if err := e.SetControlConfig(&serverConfig.ControlConfig); err != nil { - return err - } - - initialized, err := e.IsInitialized() + ec, err := commandSetup(app, cfg, &serverConfig) if err != nil { return err } - if !initialized { - return fmt.Errorf("etcd database not found in %s", serverConfig.ControlConfig.DataDir) - } - - cluster := cluster.New(&serverConfig.ControlConfig) - if err := cluster.Bootstrap(ctx, true); err != nil { - return err - } - - sc, err := server.NewContext(ctx, serverConfig.ControlConfig.Runtime.KubeConfigAdmin, false) - if err != nil { - return err - } - serverConfig.ControlConfig.Runtime.Core = sc.Core + serverConfig.ControlConfig.EtcdSnapshotRetention = 0 // disable retention check - return cluster.Snapshot(ctx, &serverConfig.ControlConfig) + return ec.etcd.Snapshot(ec.ctx) } func Delete(app *cli.Context) error { @@ -129,7 +130,8 @@ func Delete(app *cli.Context) error { func delete(app *cli.Context, cfg *cmds.Server) error { var serverConfig server.Config - if err := commandSetup(app, cfg, &serverConfig); err != nil { + ec, err := commandSetup(app, cfg, &serverConfig) + if err != nil { return err } @@ -138,19 +140,7 @@ func delete(app *cli.Context, cfg *cmds.Server) error { return errors.New("no snapshots given for removal") } - ctx := signals.SetupSignalContext() - e := etcd.NewETCD() - if err := e.SetControlConfig(&serverConfig.ControlConfig); err != nil { - return err - } - - sc, err := server.NewContext(ctx, serverConfig.ControlConfig.Runtime.KubeConfigAdmin, false) - if err != nil { - return err - } - serverConfig.ControlConfig.Runtime.Core = sc.Core - - return e.DeleteSnapshots(ctx, app.Args()) + return ec.etcd.DeleteSnapshots(ec.ctx, app.Args()) } func List(app *cli.Context) error { @@ -160,7 +150,7 @@ func List(app *cli.Context) error { return list(app, &cmds.ServerConfig) } -var etcdListFormats = []string{"json", "yaml"} +var etcdListFormats = []string{"json", "yaml", "table"} func validEtcdListFormat(format string) bool { for _, supportedFormat := range etcdListFormats { @@ -174,17 +164,12 @@ func validEtcdListFormat(format string) bool { func list(app *cli.Context, cfg *cmds.Server) error { var serverConfig server.Config - if err := commandSetup(app, cfg, &serverConfig); err != nil { - return err - } - - ctx := signals.SetupSignalContext() - e := etcd.NewETCD() - if err := e.SetControlConfig(&serverConfig.ControlConfig); err != nil { + ec, err := commandSetup(app, cfg, &serverConfig) + if err != nil { return err } - sf, err := e.ListSnapshots(ctx) + sf, err := ec.etcd.ListSnapshots(ec.ctx) if err != nil { return err } 
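For readers unfamiliar with text/tabwriter, which the table output reworked in the next hunk relies on, here is a minimal standalone sketch of how the list command renders its columns (the snapshot name, locations, size, and bucket are invented for illustration):

```go
package main

import (
	"fmt"
	"os"
	"text/tabwriter"
	"time"
)

func main() {
	// tabwriter buffers tab-separated cells and pads them into aligned
	// columns when Flush is called.
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 1, ' ', 0)
	defer w.Flush()

	created := time.Unix(1696000000, 0).UTC().Format(time.RFC3339)
	fmt.Fprint(w, "Name\tLocation\tSize\tCreated\n")
	fmt.Fprintf(w, "%s\t%s\t%d\t%s\n", "etcd-snapshot-node1-1696000000",
		"file:///var/lib/rancher/k3s/server/db/snapshots/etcd-snapshot-node1-1696000000", 16805920, created)
	fmt.Fprintf(w, "%s\t%s\t%d\t%s\n", "etcd-snapshot-node1-1696000000",
		"s3://my-bucket/etcd-snapshot-node1-1696000000", 16805920, created)
}
```

With the change in the next hunk, local and S3 snapshots share this one table instead of being printed from two separate branches.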
@@ -208,20 +193,9 @@ func list(app *cli.Context, cfg *cmds.Server) error { w := tabwriter.NewWriter(os.Stdout, 0, 0, 1, ' ', 0) defer w.Flush() - if cfg.EtcdS3 { - fmt.Fprint(w, "Name\tSize\tCreated\n") - for _, s := range sf { - if s.NodeName == "s3" { - fmt.Fprintf(w, "%s\t%d\t%s\n", s.Name, s.Size, s.CreatedAt.Format(time.RFC3339)) - } - } - } else { - fmt.Fprint(w, "Name\tLocation\tSize\tCreated\n") - for _, s := range sf { - if s.NodeName != "s3" { - fmt.Fprintf(w, "%s\t%s\t%d\t%s\n", s.Name, s.Location, s.Size, s.CreatedAt.Format(time.RFC3339)) - } - } + fmt.Fprint(w, "Name\tLocation\tSize\tCreated\n") + for _, s := range sf { + fmt.Fprintf(w, "%s\t%s\t%d\t%s\n", s.Name, s.Location, s.Size, s.CreatedAt.Format(time.RFC3339)) } } @@ -238,23 +212,12 @@ func Prune(app *cli.Context) error { func prune(app *cli.Context, cfg *cmds.Server) error { var serverConfig server.Config - if err := commandSetup(app, cfg, &serverConfig); err != nil { + ec, err := commandSetup(app, cfg, &serverConfig) + if err != nil { return err } serverConfig.ControlConfig.EtcdSnapshotRetention = cfg.EtcdSnapshotRetention - ctx := signals.SetupSignalContext() - e := etcd.NewETCD() - if err := e.SetControlConfig(&serverConfig.ControlConfig); err != nil { - return err - } - - sc, err := server.NewContext(ctx, serverConfig.ControlConfig.Runtime.KubeConfigAdmin, false) - if err != nil { - return err - } - serverConfig.ControlConfig.Runtime.Core = sc.Core - - return e.PruneSnapshots(ctx) + return ec.etcd.PruneSnapshots(ec.ctx) } diff --git a/pkg/cluster/bootstrap.go b/pkg/cluster/bootstrap.go index a2c63a974e49..4a5e636a21c8 100644 --- a/pkg/cluster/bootstrap.go +++ b/pkg/cluster/bootstrap.go @@ -424,15 +424,6 @@ func (c *Cluster) bootstrap(ctx context.Context) error { return c.storageBootstrap(ctx) } -// Snapshot is a proxy method to call the snapshot method on the managedb -// interface for etcd clusters. 
-func (c *Cluster) Snapshot(ctx context.Context, config *config.Control) error { - if c.managedDB == nil { - return errors.New("unable to perform etcd snapshot on non-etcd system") - } - return c.managedDB.Snapshot(ctx) -} - // compareConfig verifies that the config of the joining control plane node coincides with the cluster's config func (c *Cluster) compareConfig() error { token := c.config.AgentToken diff --git a/pkg/cluster/bootstrap_test.go b/pkg/cluster/bootstrap_test.go index b20a36fd6841..3531fcab25f2 100644 --- a/pkg/cluster/bootstrap_test.go +++ b/pkg/cluster/bootstrap_test.go @@ -197,50 +197,3 @@ func TestCluster_migrateBootstrapData(t *testing.T) { }) } } - -func TestCluster_Snapshot(t *testing.T) { - type fields struct { - clientAccessInfo *clientaccess.Info - config *config.Control - managedDB managed.Driver - joining bool - storageStarted bool - saveBootstrap bool - shouldBootstrap bool - } - type args struct { - ctx context.Context - config *config.Control - } - tests := []struct { - name string - fields fields - args args - wantErr bool - }{ - { - name: "Fail on non etcd cluster", - fields: fields{}, - args: args{ - ctx: context.Background(), - }, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - c := &Cluster{ - clientAccessInfo: tt.fields.clientAccessInfo, - config: tt.fields.config, - managedDB: tt.fields.managedDB, - joining: tt.fields.joining, - storageStarted: tt.fields.storageStarted, - saveBootstrap: tt.fields.saveBootstrap, - shouldBootstrap: tt.fields.shouldBootstrap, - } - if err := c.Snapshot(tt.args.ctx, tt.args.config); (err != nil) != tt.wantErr { - t.Errorf("Cluster.Snapshot() error = %v, wantErr %v", err, tt.wantErr) - } - }) - } -} diff --git a/pkg/daemons/config/types.go b/pkg/daemons/config/types.go index 6bd022735e23..7744ce26db49 100644 --- a/pkg/daemons/config/types.go +++ b/pkg/daemons/config/types.go @@ -10,6 +10,7 @@ import ( "sync" "time" + "github.com/k3s-io/k3s/pkg/generated/controllers/k3s.cattle.io" "github.com/k3s-io/kine/pkg/endpoint" "github.com/rancher/wrangler/pkg/generated/controllers/core" "github.com/rancher/wrangler/pkg/leader" @@ -342,6 +343,7 @@ type ControlRuntime struct { ClientETCDCert string ClientETCDKey string + K3s *k3s.Factory Core *core.Factory Event record.EventRecorder EtcdConfig endpoint.ETCDConfig diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go index e38a58ed88c9..952e98849a50 100644 --- a/pkg/etcd/s3.go +++ b/pkg/etcd/s3.go @@ -7,17 +7,20 @@ import ( "encoding/base64" "encoding/pem" "fmt" - "io" + "io/ioutil" "net/http" + "net/textproto" "os" "path" "path/filepath" + "runtime" "sort" "strconv" "strings" "time" "github.com/k3s-io/k3s/pkg/daemons/config" + "github.com/k3s-io/k3s/pkg/version" "github.com/minio/minio-go/v7" "github.com/minio/minio-go/v7/pkg/credentials" "github.com/pkg/errors" @@ -26,10 +29,17 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +var ( + clusterIDKey = textproto.CanonicalMIMEHeaderKey(version.Program + "-cluster-id") + nodeNameKey = textproto.CanonicalMIMEHeaderKey(version.Program + "-node-name") +) + // S3 maintains state for S3 functionality. 
type S3 struct { - config *config.Control - client *minio.Client + config *config.Control + client *minio.Client + clusterID string + nodeName string } // newS3 creates a new value of type s3 pointer with a @@ -83,23 +93,42 @@ func NewS3(ctx context.Context, config *config.Control) (*S3, error) { return nil, err } if !exists { - return nil, fmt.Errorf("bucket: %s does not exist", config.EtcdS3BucketName) + return nil, fmt.Errorf("bucket %s does not exist", config.EtcdS3BucketName) } logrus.Infof("S3 bucket %s exists", config.EtcdS3BucketName) + for config.Runtime.Core == nil { + runtime.Gosched() + } + + // cluster id hack: see https://groups.google.com/forum/#!msg/kubernetes-sig-architecture/mVGobfD4TpY/nkdbkX1iBwAJ + var clusterID string + if ns, err := config.Runtime.Core.Core().V1().Namespace().Get(metav1.NamespaceSystem, metav1.GetOptions{}); err != nil { + logrus.Warnf("Failed to set cluster ID: %v", err) + } else { + clusterID = string(ns.UID) + } + return &S3{ - config: config, - client: c, + config: config, + client: c, + clusterID: clusterID, + nodeName: os.Getenv("NODE_NAME"), }, nil } // upload uploads the given snapshot to the configured S3 // compatible backend. func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.ConfigMap, now time.Time) (*snapshotFile, error) { - logrus.Infof("Uploading snapshot %s to S3", snapshot) + logrus.Infof("Uploading snapshot to s3://%s/%s", s.config.EtcdS3BucketName, snapshot) basename := filepath.Base(snapshot) + metadata := filepath.Join(filepath.Dir(snapshot), "..", metadataDir, basename) + snapshotKey := path.Join(s.config.EtcdS3Folder, basename) + metadataKey := path.Join(s.config.EtcdS3Folder, metadataDir, basename) + sf := &snapshotFile{ Name: basename, + Location: fmt.Sprintf("s3://%s/%s", s.config.EtcdS3BucketName, snapshotKey), NodeName: "s3", CreatedAt: &metav1.Time{ Time: now, @@ -113,21 +142,11 @@ func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.Conf Folder: s.config.EtcdS3Folder, Insecure: s.config.EtcdS3Insecure, }, + Compressed: strings.HasSuffix(snapshot, compressedExtension), metadataSource: extraMetadata, } - snapshotKey := path.Join(s.config.EtcdS3Folder, basename) - - toCtx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) - defer cancel() - opts := minio.PutObjectOptions{NumThreads: 2} - if strings.HasSuffix(snapshot, compressedExtension) { - opts.ContentType = "application/zip" - sf.Compressed = true - } else { - opts.ContentType = "application/octet-stream" - } - uploadInfo, err := s.client.FPutObject(toCtx, s.config.EtcdS3BucketName, snapshotKey, snapshot, opts) + uploadInfo, err := s.uploadSnapshot(ctx, snapshotKey, snapshot) if err != nil { sf.Status = failedSnapshotStatus sf.Message = base64.StdEncoding.EncodeToString([]byte(err.Error())) @@ -135,48 +154,101 @@ func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.Conf sf.Status = successfulSnapshotStatus sf.Size = uploadInfo.Size } + if _, err := s.uploadSnapshotMetadata(ctx, metadataKey, metadata); err != nil { + logrus.Warnf("Failed to upload snapshot metadata to S3: %v", err) + } else { + logrus.Infof("Uploaded snapshot metadata s3://%s/%s", s.config.EtcdS3BucketName, metadata) + } return sf, err } -// download downloads the given snapshot from the configured S3 -// compatible backend. 
-func (s *S3) Download(ctx context.Context) error { - snapshotKey := path.Join(s.config.EtcdS3Folder, s.config.ClusterResetRestorePath) - - logrus.Debugf("retrieving snapshot: %s", snapshotKey) - toCtx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) +// uploadSnapshot uploads the snapshot file to S3 using the minio API. +func (s *S3) uploadSnapshot(ctx context.Context, key, path string) (info minio.UploadInfo, err error) { + opts := minio.PutObjectOptions{ + NumThreads: 2, + UserMetadata: map[string]string{ + clusterIDKey: s.clusterID, + nodeNameKey: s.nodeName, + }, + } + if strings.HasSuffix(key, compressedExtension) { + opts.ContentType = "application/zip" + } else { + opts.ContentType = "application/octet-stream" + } + ctx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) defer cancel() - r, err := s.client.GetObject(toCtx, s.config.EtcdS3BucketName, snapshotKey, minio.GetObjectOptions{}) - if err != nil { - return nil + return s.client.FPutObject(ctx, s.config.EtcdS3BucketName, key, path, opts) +} + +// uploadSnapshotMetadata marshals and uploads the snapshot metadata to S3 using the minio API. +// The upload is silently skipped if no extra metadata is provided. +func (s *S3) uploadSnapshotMetadata(ctx context.Context, key, path string) (info minio.UploadInfo, err error) { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + return minio.UploadInfo{}, nil + } + return minio.UploadInfo{}, err + } + + opts := minio.PutObjectOptions{ + NumThreads: 2, + ContentType: "application/json", + UserMetadata: map[string]string{ + clusterIDKey: s.clusterID, + nodeNameKey: s.nodeName, + }, } - defer r.Close() + ctx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) + defer cancel() + return s.client.FPutObject(ctx, s.config.EtcdS3BucketName, key, path, opts) +} +// download downloads the given snapshot from the configured S3 +// compatible backend. +func (s *S3) Download(ctx context.Context) error { + snapshotKey := path.Join(s.config.EtcdS3Folder, s.config.ClusterResetRestorePath) + metadataKey := path.Join(s.config.EtcdS3Folder, metadataDir, s.config.ClusterResetRestorePath) snapshotDir, err := snapshotDir(s.config, true) if err != nil { return errors.Wrap(err, "failed to get the snapshot dir") } + snapshotFile := filepath.Join(snapshotDir, s.config.ClusterResetRestorePath) + metadataFile := filepath.Join(snapshotDir, "..", metadataDir, s.config.ClusterResetRestorePath) - fullSnapshotPath := filepath.Join(snapshotDir, s.config.ClusterResetRestorePath) - sf, err := os.Create(fullSnapshotPath) - if err != nil { + logrus.Debugf("Downloading snapshot from s3://%s/%s", s.config.EtcdS3BucketName, snapshotKey) + if err := s.downloadSnapshot(ctx, snapshotKey, snapshotFile); err != nil { return err } - defer sf.Close() - - stat, err := r.Stat() - if err != nil { + if err := s.downloadSnapshotMetadata(ctx, metadataKey, metadataFile); err != nil { return err } - if _, err := io.CopyN(sf, r, stat.Size); err != nil { - return err - } + s.config.ClusterResetRestorePath = snapshotFile + return nil +} - s.config.ClusterResetRestorePath = fullSnapshotPath +// downloadSnapshot downloads the snapshot file from S3 using the minio API. 
+func (s *S3) downloadSnapshot(ctx context.Context, key, file string) error { + ctx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) + defer cancel() + defer os.Chmod(file, 0600) + return s.client.FGetObject(ctx, s.config.EtcdS3BucketName, key, file, minio.GetObjectOptions{}) +} - return os.Chmod(fullSnapshotPath, 0600) +// downloadSnapshotMetadata downloads the snapshot metadata file from S3 using the minio API. +// No error is returned if the metadata file does not exist, as it is optional. +func (s *S3) downloadSnapshotMetadata(ctx context.Context, key, file string) error { + logrus.Debugf("Downloading snapshot metadata from s3://%s/%s", s.config.EtcdS3BucketName, key) + ctx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) + defer cancel() + defer os.Chmod(file, 0600) + err := s.client.FGetObject(ctx, s.config.EtcdS3BucketName, key, file, minio.GetObjectOptions{}) + if resp := minio.ToErrorResponse(err); resp.StatusCode == http.StatusNotFound { + return nil + } + return err } // snapshotPrefix returns the prefix used in the @@ -190,21 +262,27 @@ func (s *S3) snapshotRetention(ctx context.Context) error { if s.config.EtcdSnapshotRetention < 1 { return nil } - logrus.Infof("Applying snapshot retention policy to snapshots stored in S3: retention: %d, snapshotPrefix: %s", s.config.EtcdSnapshotRetention, s.snapshotPrefix()) + logrus.Infof("Applying snapshot retention=%d to snapshots stored in s3://%s/%s", s.config.EtcdSnapshotRetention, s.config.EtcdS3BucketName, s.snapshotPrefix()) var snapshotFiles []minio.ObjectInfo toCtx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) defer cancel() - loo := minio.ListObjectsOptions{ - Recursive: true, + opts := minio.ListObjectsOptions{ Prefix: s.snapshotPrefix(), + Recursive: true, } - for info := range s.client.ListObjects(toCtx, s.config.EtcdS3BucketName, loo) { + for info := range s.client.ListObjects(toCtx, s.config.EtcdS3BucketName, opts) { if info.Err != nil { return info.Err } + + // skip metadata + if path.Base(path.Dir(info.Key)) == metadataDir { + continue + } + snapshotFiles = append(snapshotFiles, info) } @@ -218,10 +296,17 @@ func (s *S3) snapshotRetention(ctx context.Context) error { }) for _, df := range snapshotFiles[s.config.EtcdSnapshotRetention:] { - logrus.Infof("Removing S3 snapshot: %s", df.Key) + logrus.Infof("Removing S3 snapshot: s3://%s/%s", s.config.EtcdS3BucketName, df.Key) if err := s.client.RemoveObject(ctx, s.config.EtcdS3BucketName, df.Key, minio.RemoveObjectOptions{}); err != nil { return err } + metadataKey := path.Join(path.Dir(df.Key), metadataDir, path.Base(df.Key)) + if err := s.client.RemoveObject(ctx, s.config.EtcdS3BucketName, metadataKey, minio.RemoveObjectOptions{}); err != nil { + if resp := minio.ToErrorResponse(err); resp.StatusCode == http.StatusNotFound { + return nil + } + return err + } } return nil @@ -231,19 +316,17 @@ func (s *S3) snapshotRetention(ctx context.Context) error { // snapshots in S3 along with their relevant // metadata. 
func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) { - snapshots := make(map[string]snapshotFile) + snapshots := map[string]snapshotFile{} + metadatas := []string{} ctx, cancel := context.WithCancel(ctx) defer cancel() - var loo minio.ListObjectsOptions - if s.config.EtcdS3Folder != "" { - loo = minio.ListObjectsOptions{ - Prefix: s.config.EtcdS3Folder, - Recursive: true, - } + opts := minio.ListObjectsOptions{ + Prefix: s.config.EtcdS3Folder, + Recursive: true, } - objects := s.client.ListObjects(ctx, s.config.EtcdS3BucketName, loo) + objects := s.client.ListObjects(ctx, s.config.EtcdS3BucketName, opts) for obj := range objects { if obj.Err != nil { @@ -253,7 +336,18 @@ func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) continue } + if o, err := s.client.StatObject(ctx, s.config.EtcdS3BucketName, obj.Key, minio.StatObjectOptions{}); err != nil { + logrus.Warnf("Failed to get object metadata: %v", err) + } else { + obj = o + } + filename := path.Base(obj.Key) + if path.Base(path.Dir(obj.Key)) == metadataDir { + metadatas = append(metadatas, obj.Key) + continue + } + basename, compressed := strings.CutSuffix(filename, compressedExtension) ts, err := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64) if err != nil { @@ -262,6 +356,7 @@ func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) sf := snapshotFile{ Name: filename, + Location: fmt.Sprintf("s3://%s/%s", s.config.EtcdS3BucketName, obj.Key), NodeName: "s3", CreatedAt: &metav1.Time{ Time: time.Unix(ts, 0), @@ -282,6 +377,25 @@ func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) sfKey := generateSnapshotConfigMapKey(sf) snapshots[sfKey] = sf } + + for _, metadataKey := range metadatas { + filename := path.Base(metadataKey) + sfKey := generateSnapshotConfigMapKey(snapshotFile{Name: filename, NodeName: "s3"}) + if sf, ok := snapshots[sfKey]; ok { + logrus.Debugf("Loading snapshot metadata from s3://%s/%s", s.config.EtcdS3BucketName, metadataKey) + if obj, err := s.client.GetObject(ctx, s.config.EtcdS3BucketName, metadataKey, minio.GetObjectOptions{}); err != nil { + logrus.Warnf("Failed to get snapshot metadata: %v", err) + } else { + if m, err := ioutil.ReadAll(obj); err != nil { + logrus.Warnf("Failed to read snapshot metadata: %v", err) + } else { + sf.Metadata = base64.StdEncoding.EncodeToString(m) + snapshots[sfKey] = sf + } + } + } + } + return snapshots, nil } diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go index d640b69eadb9..82fc553ad05d 100644 --- a/pkg/etcd/snapshot.go +++ b/pkg/etcd/snapshot.go @@ -38,6 +38,7 @@ const ( maxConcurrentSnapshots = 1 pruneStepSize = 5 compressedExtension = ".zip" + metadataDir = ".metadata" ) var ( @@ -272,20 +273,20 @@ func (e *ETCD) Snapshot(ctx context.Context) error { } } - if e.config.EtcdSnapshotCompress { - zipPath, err := e.compressSnapshot(snapshotDir, snapshotName, snapshotPath, now) - if err != nil { - return err - } - if err := os.Remove(snapshotPath); err != nil { - return err + // If the snapshot attempt was successful, sf will be nil as we did not set it to store the error message. 
+ if sf == nil { + if e.config.EtcdSnapshotCompress { + zipPath, err := e.compressSnapshot(snapshotDir, snapshotName, snapshotPath, now) + if err != nil { + return errors.Wrap(err, "failed to compress snapshot") + } + if err := os.Remove(snapshotPath); err != nil { + return errors.Wrap(err, "failed to remove uncompressed snapshot") + } + snapshotPath = zipPath + logrus.Info("Compressed snapshot: " + snapshotPath) } - snapshotPath = zipPath - logrus.Info("Compressed snapshot: " + snapshotPath) - } - // If the snapshot attempt was successful, sf will be nil as we did not set it. - if sf == nil { f, err := os.Stat(snapshotPath) if err != nil { return errors.Wrap(err, "unable to retrieve snapshot information from local snapshot") @@ -303,15 +304,19 @@ func (e *ETCD) Snapshot(ctx context.Context) error { metadataSource: extraMetadata, } + if err := saveSnapshotMetadata(snapshotPath, extraMetadata); err != nil { + return errors.Wrap(err, "failed to save local snapshot metadata") + } + if err := e.addSnapshotData(*sf); err != nil { return errors.Wrap(err, "failed to save local snapshot data to configmap") } + if err := snapshotRetention(e.config.EtcdSnapshotRetention, e.config.EtcdSnapshotName, snapshotDir); err != nil { return errors.Wrap(err, "failed to apply local snapshot retention policy") } if e.config.EtcdS3 { - logrus.Infof("Saving etcd snapshot %s to S3", snapshotName) if err := e.initS3IfNil(ctx); err != nil { logrus.Warnf("Unable to initialize S3 client: %v", err) sf = &snapshotFile{ @@ -335,6 +340,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { metadataSource: extraMetadata, } } else { + logrus.Infof("Saving etcd snapshot %s to S3", snapshotName) // upload will return a snapshotFile even on error - if there was an // error, it will be reflected in the status and message. sf, err = e.s3.upload(ctx, snapshotPath, extraMetadata, now) @@ -414,10 +420,21 @@ func (e *ETCD) listLocalSnapshots() (map[string]snapshotFile, error) { if err != nil { ts = file.ModTime().Unix() } + + // try to read metadata from disk; don't warn if it is missing as it will not exist + // for snapshot files from old releases or if there was no metadata provided. 
+ var metadata string + metadataFile := filepath.Join(filepath.Dir(path), "..", metadataDir, file.Name()) + if m, err := os.ReadFile(metadataFile); err == nil { + logrus.Debugf("Loading snapshot metadata from %s", metadataFile) + metadata = base64.StdEncoding.EncodeToString(m) + } + sf := snapshotFile{ Name: file.Name(), Location: "file://" + filepath.Join(snapshotDir, file.Name()), NodeName: nodeName, + Metadata: metadata, CreatedAt: &metav1.Time{ Time: time.Unix(ts, 0), }, @@ -462,7 +479,7 @@ func (e *ETCD) PruneSnapshots(ctx context.Context) error { if e.config.EtcdS3 { if err := e.initS3IfNil(ctx); err != nil { - logrus.Warnf("Unable to initialize S3 client during prune: %v", err) + logrus.Warnf("Unable to initialize S3 client: %v", err) } else { if err := e.s3.snapshotRetention(ctx); err != nil { logrus.Errorf("Error applying S3 snapshot retention policy: %v", err) @@ -478,6 +495,7 @@ func (e *ETCD) ListSnapshots(ctx context.Context) (map[string]snapshotFile, erro snapshotFiles := map[string]snapshotFile{} if e.config.EtcdS3 { if err := e.initS3IfNil(ctx); err != nil { + logrus.Warnf("Unable to initialize S3 client: %v", err) return nil, err } sfs, err := e.s3.listSnapshots(ctx) @@ -506,13 +524,30 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error { return errors.Wrap(err, "failed to get the snapshot dir") } - if e.config.EtcdS3 { - logrus.Info("Removing the given etcd snapshot(s) from S3") - logrus.Debugf("Removing the given etcd snapshot(s) from S3: %v", snapshots) + logrus.Info("Removing the given locally stored etcd snapshot(s)") + logrus.Debugf("Attempting to remove the given locally stored etcd snapshot(s): %v", snapshots) + + for _, s := range snapshots { + // check if the given snapshot exists. If it does, + // remove it, otherwise continue. + sf := filepath.Join(snapshotDir, s) + if _, err := os.Stat(sf); os.IsNotExist(err) { + logrus.Infof("Snapshot %s, does not exist", s) + continue + } + if err := os.Remove(sf); err != nil { + return err + } + logrus.Debug("Removed snapshot ", s) + } + if e.config.EtcdS3 { if e.initS3IfNil(ctx); err != nil { + logrus.Warnf("Unable to initialize S3 client: %v", err) return err } + logrus.Info("Removing the given etcd snapshot(s) from S3") + logrus.Debugf("Removing the given etcd snapshot(s) from S3: %v", snapshots) objectsCh := make(chan minio.ObjectInfo) @@ -566,23 +601,6 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error { } } - logrus.Info("Removing the given locally stored etcd snapshot(s)") - logrus.Debugf("Attempting to remove the given locally stored etcd snapshot(s): %v", snapshots) - - for _, s := range snapshots { - // check if the given snapshot exists. If it does, - // remove it, otherwise continue. 
- sf := filepath.Join(snapshotDir, s) - if _, err := os.Stat(sf); os.IsNotExist(err) { - logrus.Infof("Snapshot %s, does not exist", s) - continue - } - if err := os.Remove(sf); err != nil { - return err - } - logrus.Debug("Removed snapshot ", s) - } - return e.ReconcileSnapshotData(ctx) } @@ -735,6 +753,11 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error { var s3ListSuccessful bool if e.config.EtcdS3 { + if err := e.initS3IfNil(ctx); err != nil { + logrus.Warnf("Unable to initialize S3 client: %v", err) + return err + } + if s3Snapshots, err := e.s3.listSnapshots(ctx); err != nil { logrus.Errorf("Error retrieving S3 snapshots for reconciliation: %v", err) } else { @@ -906,10 +929,14 @@ func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string) for _, df := range snapshotFiles[retention:] { snapshotPath := filepath.Join(snapshotDir, df.Name) + metadataPath := filepath.Join(snapshotDir, "..", metadataDir, df.Name) logrus.Infof("Removing local snapshot %s", snapshotPath) if err := os.Remove(snapshotPath); err != nil { return err } + if err := os.Remove(metadataPath); err != nil && !os.IsNotExist(err) { + return err + } } return nil @@ -919,3 +946,24 @@ func isTooLargeError(err error) bool { // There are no helpers for unpacking field validation errors, so we just check for "Too long" in the error string. return apierrors.IsRequestEntityTooLargeError(err) || (apierrors.IsInvalid(err) && strings.Contains(err.Error(), "Too long")) } + +// saveSnapshotMetadata writes extra metadata to disk. +// The upload is silently skipped if no extra metadata is provided. +func saveSnapshotMetadata(snapshotPath string, extraMetadata *v1.ConfigMap) error { + if extraMetadata == nil || len(extraMetadata.Data) == 0 { + return nil + } + + dir := filepath.Join(filepath.Dir(snapshotPath), "..", metadataDir) + filename := filepath.Base(snapshotPath) + metadataPath := filepath.Join(dir, filename) + logrus.Infof("Saving snapshot metadata to %s", metadataPath) + m, err := json.Marshal(extraMetadata.Data) + if err != nil { + return err + } + if err := os.MkdirAll(dir, 0700); err != nil { + return err + } + return os.WriteFile(metadataPath, m, 0700) +} diff --git a/pkg/server/server.go b/pkg/server/server.go index 2734f6ed5196..7ddc7c23fa18 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -114,6 +114,7 @@ func runControllers(ctx context.Context, config *Config) error { controlConfig.Runtime.NodePasswdFile); err != nil { logrus.Warn(errors.Wrap(err, "error migrating node-password file")) } + controlConfig.Runtime.K3s = sc.K3s controlConfig.Runtime.Event = sc.Event controlConfig.Runtime.Core = sc.Core From 7c346a4d63ba4d7931f94fb89f04b507d669ac04 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Tue, 10 Oct 2023 19:51:42 +0000 Subject: [PATCH 10/14] Sort snapshots by time and key in tabwriter output Fixes snapshot list coming out in non-deterministic order Signed-off-by: Brad Davidson --- pkg/cli/etcdsnapshot/etcd_snapshot.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pkg/cli/etcdsnapshot/etcd_snapshot.go b/pkg/cli/etcdsnapshot/etcd_snapshot.go index 93dd738cd734..97e8c696fa65 100644 --- a/pkg/cli/etcdsnapshot/etcd_snapshot.go +++ b/pkg/cli/etcdsnapshot/etcd_snapshot.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "path/filepath" + "sort" "strings" "text/tabwriter" "time" @@ -193,9 +194,23 @@ func list(app *cli.Context, cfg *cmds.Server) error { w := tabwriter.NewWriter(os.Stdout, 0, 0, 1, ' ', 0) defer w.Flush() + // Sort snapshots 
by creation time and key + sfKeys := make([]string, 0, len(sf)) + for k := range sf { + sfKeys = append(sfKeys, k) + } + sort.Slice(sfKeys, func(i, j int) bool { + iKey := sfKeys[i] + jKey := sfKeys[j] + if sf[iKey].CreatedAt.Equal(sf[jKey].CreatedAt) { + return iKey < jKey + } + return sf[iKey].CreatedAt.Before(sf[jKey].CreatedAt) + }) + fmt.Fprint(w, "Name\tLocation\tSize\tCreated\n") - for _, s := range sf { - fmt.Fprintf(w, "%s\t%s\t%d\t%s\n", s.Name, s.Location, s.Size, s.CreatedAt.Format(time.RFC3339)) + for _, k := range sfKeys { + fmt.Fprintf(w, "%s\t%s\t%d\t%s\n", sf[k].Name, sf[k].Location, sf[k].Size, sf[k].CreatedAt.Format(time.RFC3339)) } } From 4d87b2c73ff00cfd5dd27bcbf184b8179a47dc45 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Thu, 5 Oct 2023 16:49:22 +0000 Subject: [PATCH 11/14] Move snapshot delete into local/s3 functions Signed-off-by: Brad Davidson --- pkg/etcd/s3.go | 26 +++- pkg/etcd/snapshot.go | 112 +++++++----------- .../etcdsnapshot/etcdsnapshot_int_test.go | 6 +- 3 files changed, 67 insertions(+), 77 deletions(-) diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go index 952e98849a50..ebe5abaf1dff 100644 --- a/pkg/etcd/s3.go +++ b/pkg/etcd/s3.go @@ -205,7 +205,7 @@ func (s *S3) uploadSnapshotMetadata(ctx context.Context, key, path string) (info return s.client.FPutObject(ctx, s.config.EtcdS3BucketName, key, path, opts) } -// download downloads the given snapshot from the configured S3 +// Download downloads the given snapshot from the configured S3 // compatible backend. func (s *S3) Download(ctx context.Context) error { snapshotKey := path.Join(s.config.EtcdS3Folder, s.config.ClusterResetRestorePath) @@ -297,12 +297,12 @@ func (s *S3) snapshotRetention(ctx context.Context) error { for _, df := range snapshotFiles[s.config.EtcdSnapshotRetention:] { logrus.Infof("Removing S3 snapshot: s3://%s/%s", s.config.EtcdS3BucketName, df.Key) - if err := s.client.RemoveObject(ctx, s.config.EtcdS3BucketName, df.Key, minio.RemoveObjectOptions{}); err != nil { + if err := s.client.RemoveObject(toCtx, s.config.EtcdS3BucketName, df.Key, minio.RemoveObjectOptions{}); err != nil { return err } metadataKey := path.Join(path.Dir(df.Key), metadataDir, path.Base(df.Key)) - if err := s.client.RemoveObject(ctx, s.config.EtcdS3BucketName, metadataKey, minio.RemoveObjectOptions{}); err != nil { - if resp := minio.ToErrorResponse(err); resp.StatusCode == http.StatusNotFound { + if err := s.client.RemoveObject(toCtx, s.config.EtcdS3BucketName, metadataKey, minio.RemoveObjectOptions{}); err != nil { + if isNotExist(err) { return nil } return err @@ -312,13 +312,29 @@ func (s *S3) snapshotRetention(ctx context.Context) error { return nil } +func (s *S3) deleteSnapshot(ctx context.Context, key string) error { + ctx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) + defer cancel() + + key = path.Join(s.config.EtcdS3Folder, key) + err := s.client.RemoveObject(ctx, s.config.EtcdS3BucketName, key, minio.RemoveObjectOptions{}) + if err == nil || isNotExist(err) { + metadataKey := path.Join(path.Dir(key), metadataDir, path.Base(key)) + if merr := s.client.RemoveObject(ctx, s.config.EtcdS3BucketName, metadataKey, minio.RemoveObjectOptions{}); merr != nil && !isNotExist(merr) { + err = merr + } + } + + return err +} + // listSnapshots provides a list of currently stored // snapshots in S3 along with their relevant // metadata. 
func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) { snapshots := map[string]snapshotFile{} metadatas := []string{} - ctx, cancel := context.WithCancel(ctx) + ctx, cancel := context.WithTimeout(ctx, s.config.EtcdS3Timeout) defer cancel() opts := minio.ListObjectsOptions{ diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go index 82fc553ad05d..3b436e26ac3e 100644 --- a/pkg/etcd/snapshot.go +++ b/pkg/etcd/snapshot.go @@ -8,6 +8,7 @@ import ( "fmt" "io" "math/rand" + "net/http" "os" "path/filepath" "runtime" @@ -516,94 +517,60 @@ func (e *ETCD) ListSnapshots(ctx context.Context) (map[string]snapshotFile, erro return snapshotFiles, err } -// deleteSnapshots removes the given snapshots from -// either local storage or S3. +// DeleteSnapshots removes the given snapshots from local storage and S3. func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error { snapshotDir, err := snapshotDir(e.config, false) if err != nil { return errors.Wrap(err, "failed to get the snapshot dir") } - - logrus.Info("Removing the given locally stored etcd snapshot(s)") - logrus.Debugf("Attempting to remove the given locally stored etcd snapshot(s): %v", snapshots) - - for _, s := range snapshots { - // check if the given snapshot exists. If it does, - // remove it, otherwise continue. - sf := filepath.Join(snapshotDir, s) - if _, err := os.Stat(sf); os.IsNotExist(err) { - logrus.Infof("Snapshot %s, does not exist", s) - continue - } - if err := os.Remove(sf); err != nil { - return err - } - logrus.Debug("Removed snapshot ", s) - } - if e.config.EtcdS3 { - if e.initS3IfNil(ctx); err != nil { - logrus.Warnf("Unable to initialize S3 client: %v", err) + if err := e.initS3IfNil(ctx); err != nil { return err } - logrus.Info("Removing the given etcd snapshot(s) from S3") - logrus.Debugf("Removing the given etcd snapshot(s) from S3: %v", snapshots) - - objectsCh := make(chan minio.ObjectInfo) - - ctx, cancel := context.WithTimeout(ctx, e.config.EtcdS3Timeout) - defer cancel() - - go func() { - defer close(objectsCh) + } - opts := minio.ListObjectsOptions{ - Recursive: true, + for _, s := range snapshots { + if err := e.deleteSnapshot(filepath.Join(snapshotDir, s)); err != nil { + if isNotExist(err) { + logrus.Infof("Snapshot %s not found locally", s) + } else { + logrus.Errorf("Failed to delete local snapshot %s: %v", s, err) } + } else { + logrus.Infof("Snapshot %s deleted locally", s) + } - for obj := range e.s3.client.ListObjects(ctx, e.config.EtcdS3BucketName, opts) { - if obj.Err != nil { - logrus.Errorf("Failed to list snapshots from S3: %v", obj.Err) - return - } - - // iterate through the given snapshots and only - // add them to the channel for remove if they're - // actually found from the bucket listing. 
-			for _, snapshot := range snapshots {
-				if snapshot == obj.Key {
-					objectsCh <- obj
-				}
-			}
-		}
-	}()
-
-	err = func() error {
-		for {
-			select {
-			case <-ctx.Done():
-				logrus.Errorf("Unable to delete snapshot: %v", ctx.Err())
-				return e.ReconcileSnapshotData(ctx)
-			case <-time.After(time.Millisecond * 100):
-				continue
-			case err, ok := <-e.s3.client.RemoveObjects(ctx, e.config.EtcdS3BucketName, objectsCh, minio.RemoveObjectsOptions{}):
-				if err.Err != nil {
-					logrus.Errorf("Unable to delete snapshot: %v", err.Err)
-				}
-				if !ok {
-					return e.ReconcileSnapshotData(ctx)
-				}
+		if e.config.EtcdS3 {
+			if err := e.s3.deleteSnapshot(s); err != nil {
+				if isNotExist(err) {
+					logrus.Infof("Snapshot %s not found in S3", s)
+				} else {
+					logrus.Errorf("Failed to delete S3 snapshot %s: %v", s, err)
 				}
+			} else {
+				logrus.Infof("Snapshot %s deleted from S3", s)
 			}
-		}()
-		if err != nil {
-			return err
 		}
 	}
 
 	return e.ReconcileSnapshotData(ctx)
 }
 
+func (e *ETCD) deleteSnapshot(snapshotPath string) error {
+	dir := filepath.Join(filepath.Dir(snapshotPath), "..", metadataDir)
+	filename := filepath.Base(snapshotPath)
+	metadataPath := filepath.Join(dir, filename)
+
+	err := os.Remove(snapshotPath)
+	if err == nil || os.IsNotExist(err) {
+		if merr := os.Remove(metadataPath); merr != nil && !isNotExist(merr) {
+			err = merr
+		}
+	}
+
+	return err
+}
+
 func marshalSnapshotFile(sf snapshotFile) ([]byte, error) {
 	if sf.metadataSource != nil {
 		if m, err := json.Marshal(sf.metadataSource.Data); err != nil {
@@ -947,6 +914,13 @@ func isTooLargeError(err error) bool {
 	return apierrors.IsRequestEntityTooLargeError(err) || (apierrors.IsInvalid(err) && strings.Contains(err.Error(), "Too long"))
 }
 
+func isNotExist(err error) bool {
+	if resp := minio.ToErrorResponse(err); resp.StatusCode == http.StatusNotFound || os.IsNotExist(err) {
+		return true
+	}
+	return false
+}
+
 // saveSnapshotMetadata writes extra metadata to disk.
 // The upload is silently skipped if no extra metadata is provided.
 func saveSnapshotMetadata(snapshotPath string, extraMetadata *v1.ConfigMap) error {
diff --git a/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go b/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go
index 3fe9f4152b84..ee5ec6b049a7 100644
--- a/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go
+++ b/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go
@@ -58,7 +58,7 @@ var _ = Describe("etcd snapshots", Ordered, func() {
 			Expect(err).ToNot(HaveOccurred())
 			snapshotName := reg.FindString(lsResult)
 			Expect(testutil.K3sCmd("etcd-snapshot", "delete", snapshotName)).
- To(ContainSubstring("Removing the given locally stored etcd snapshot")) + To(ContainSubstring("Snapshot " + snapshotName + " deleted locally")) } }) }) From 58fb71e3556713476219a63612212fb56803d2ce Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Tue, 3 Oct 2023 17:13:26 +0000 Subject: [PATCH 12/14] Switch to managing ETCDSnapshotFile resources Reconcile snapshot CRs instead of ConfigMap; manage ConfigMap downstream from CR list Signed-off-by: Brad Davidson --- pkg/etcd/etcd.go | 1 + pkg/etcd/s3.go | 7 +- pkg/etcd/snapshot.go | 572 ++++++++++++++++++++------------ pkg/etcd/snapshot_controller.go | 312 +++++++++++++++++ 4 files changed, 681 insertions(+), 211 deletions(-) create mode 100644 pkg/etcd/snapshot_controller.go diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index eb3e5cdac7dd..fc3894b37fd3 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -576,6 +576,7 @@ func (e *ETCD) Register(handler http.Handler) (http.Handler, error) { e.config.Runtime.LeaderElectedClusterControllerStarts[version.Program+"-etcd"] = func(ctx context.Context) { registerEndpointsHandlers(ctx, e) registerMemberHandlers(ctx, e) + registerSnapshotHandlers(ctx, e) } } diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go index ebe5abaf1dff..d96b536d29fb 100644 --- a/pkg/etcd/s3.go +++ b/pkg/etcd/s3.go @@ -144,6 +144,7 @@ func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.Conf }, Compressed: strings.HasSuffix(snapshot, compressedExtension), metadataSource: extraMetadata, + nodeSource: s.nodeName, } uploadInfo, err := s.uploadSnapshot(ctx, snapshotKey, snapshot) @@ -338,8 +339,9 @@ func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) defer cancel() opts := minio.ListObjectsOptions{ - Prefix: s.config.EtcdS3Folder, - Recursive: true, + Prefix: s.config.EtcdS3Folder, + Recursive: true, + WithMetadata: true, } objects := s.client.ListObjects(ctx, s.config.EtcdS3BucketName, opts) @@ -389,6 +391,7 @@ func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error) }, Status: successfulSnapshotStatus, Compressed: compressed, + nodeSource: obj.UserMetadata[nodeNameKey], } sfKey := generateSnapshotConfigMapKey(sf) snapshots[sfKey] = sf diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go index 3b436e26ac3e..4c710d7b5153 100644 --- a/pkg/etcd/snapshot.go +++ b/pkg/etcd/snapshot.go @@ -3,7 +3,9 @@ package etcd import ( "archive/zip" "context" + "crypto/sha256" "encoding/base64" + "encoding/hex" "encoding/json" "fmt" "io" @@ -17,7 +19,9 @@ import ( "strings" "time" + apisv1 "github.com/k3s-io/k3s/pkg/apis/k3s.cattle.io/v1" "github.com/k3s-io/k3s/pkg/daemons/config" + "github.com/k3s-io/k3s/pkg/util" "github.com/k3s-io/k3s/pkg/version" "github.com/minio/minio-go/v7" "github.com/pkg/errors" @@ -29,22 +33,30 @@ import ( "go.uber.org/zap" "golang.org/x/sync/semaphore" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/validation" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/retry" + "k8s.io/utils/pointer" ) const ( maxConcurrentSnapshots = 1 - pruneStepSize = 5 compressedExtension = ".zip" metadataDir = ".metadata" + errorTTL = 24 * time.Hour ) var ( snapshotExtraMetadataConfigMapName = version.Program + "-etcd-snapshot-extra-metadata" - snapshotConfigMapName = version.Program + "-etcd-snapshots" + 
labelStorageNode = "etcd." + version.Program + ".cattle.io/snapshot-storage-node" + annotationLocalReconciled = "etcd." + version.Program + ".cattle.io/local-snapshots-timestamp" + annotationS3Reconciled = "etcd." + version.Program + ".cattle.io/s3-snapshots-timestamp" // snapshotDataBackoff will retry at increasing steps for up to ~30 seconds. // If the ConfigMap update fails, the list won't be reconciled again until next time @@ -170,7 +182,7 @@ func (e *ETCD) decompressSnapshot(snapshotDir, snapshotFile string) (string, err defer ss.Close() if _, err := io.Copy(decompressed, ss); err != nil { - os.Remove("") + os.Remove(decompressed.Name()) return "", err } } @@ -265,12 +277,11 @@ func (e *ETCD) Snapshot(ctx context.Context) error { Status: failedSnapshotStatus, Message: base64.StdEncoding.EncodeToString([]byte(err.Error())), Size: 0, - Compressed: e.config.EtcdSnapshotCompress, metadataSource: extraMetadata, } logrus.Errorf("Failed to take etcd snapshot: %v", err) if err := e.addSnapshotData(*sf); err != nil { - return errors.Wrap(err, "failed to save local snapshot failure data to configmap") + return errors.Wrap(err, "failed to sync ETCDSnapshotFile") } } @@ -310,7 +321,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { } if err := e.addSnapshotData(*sf); err != nil { - return errors.Wrap(err, "failed to save local snapshot data to configmap") + return errors.Wrap(err, "failed to sync ETCDSnapshotFile") } if err := snapshotRetention(e.config.EtcdSnapshotRetention, e.config.EtcdSnapshotName, snapshotDir); err != nil { @@ -352,7 +363,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error { } } if err := e.addSnapshotData(*sf); err != nil { - return errors.Wrap(err, "failed to save snapshot data to configmap") + return errors.Wrap(err, "failed to sync ETCDSnapshotFile") } if err := e.s3.snapshotRetention(ctx); err != nil { logrus.Errorf("Failed to apply s3 snapshot retention policy: %v", err) @@ -397,7 +408,10 @@ type snapshotFile struct { S3 *s3Config `json:"s3Config,omitempty"` Compressed bool `json:"compressed"` + // these fields are used for the internal representation of the snapshot + // to populate other fields before serialization to the legacy configmap. metadataSource *v1.ConfigMap `json:"-"` + nodeSource string `json:"-"` } // listLocalSnapshots provides a list of the currently stored @@ -541,7 +555,7 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error { } if e.config.EtcdS3 { - if err := e.s3.deleteSnapshot(s); err != nil { + if err := e.s3.deleteSnapshot(ctx, s); err != nil { if isNotExist(err) { logrus.Infof("Snapshot %s not found in S3", s) } else { @@ -582,62 +596,55 @@ func marshalSnapshotFile(sf snapshotFile) ([]byte, error) { return json.Marshal(sf) } -// AddSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata -// available at the time. +// addSnapshotData syncs an internal snapshotFile representation to an ETCDSnapshotFile resource +// of the same name. Resources will be created or updated as necessary. func (e *ETCD) addSnapshotData(sf snapshotFile) error { - // make sure the core.Factory is initialized. There can - // be a race between this core code startup. - for e.config.Runtime.Core == nil { + // make sure the K3s factory is initialized. 
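+	// There can be a race between this code running and server startup wiring
+	// the controller factory into the runtime, so wait for it to be set.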
+ for e.config.Runtime.K3s == nil { runtime.Gosched() } - sfKey := generateSnapshotConfigMapKey(sf) - marshalledSnapshotFile, err := marshalSnapshotFile(sf) - if err != nil { - return err - } + snapshots := e.config.Runtime.K3s.K3s().V1().ETCDSnapshotFile() + esfName := generateSnapshotName(sf) - pruneCount := pruneStepSize - var lastErr error + var esf *apisv1.ETCDSnapshotFile return retry.OnError(snapshotDataBackoff, func(err error) bool { - return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) || isTooLargeError(err) - }, func() error { - snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) - - if apierrors.IsNotFound(getErr) { - cm := v1.ConfigMap{ + return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) + }, func() (err error) { + // Get current object or create new one + esf, err = snapshots.Get(esfName, metav1.GetOptions{}) + if err != nil { + if !apierrors.IsNotFound(err) { + return err + } + esf = &apisv1.ETCDSnapshotFile{ ObjectMeta: metav1.ObjectMeta{ - Name: snapshotConfigMapName, - Namespace: metav1.NamespaceSystem, + Name: esfName, }, - Data: map[string]string{sfKey: string(marshalledSnapshotFile)}, } - _, err := e.config.Runtime.Core.Core().V1().ConfigMap().Create(&cm) - return err - } - - if snapshotConfigMap.Data == nil { - snapshotConfigMap.Data = make(map[string]string) } - // If the configmap update was rejected due to size, drop the oldest entries from the map. - // We will continue to remove an increasing number of old snapshots from the map until the request succeeds, - // or the number we would attempt to remove exceeds the number stored. - if isTooLargeError(lastErr) { - logrus.Warnf("Snapshot configmap is too large, attempting to elide %d oldest snapshots from list", pruneCount) - if err := pruneConfigMap(snapshotConfigMap, pruneCount); err != nil { - return err + // mutate object + existing := esf.DeepCopyObject() + sf.toETCDSnapshotFile(esf) + + // create or update as necessary + if esf.CreationTimestamp.IsZero() { + var created *apisv1.ETCDSnapshotFile + created, err = snapshots.Create(esf) + if err == nil { + // Only emit an event for the snapshot when creating the resource + e.emitEvent(created) } - pruneCount += pruneStepSize + } else if !equality.Semantic.DeepEqual(existing, esf) { + _, err = snapshots.Update(esf) } - - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshotFile) - - _, lastErr = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) - return lastErr + return err }) } +// generateSnapshotConfigMapKey generates a derived name for the snapshot that is safe for use +// as a configmap key. func generateSnapshotConfigMapKey(sf snapshotFile) string { name := invalidKeyChars.ReplaceAllString(sf.Name, "_") if sf.NodeName == "s3" { @@ -646,33 +653,61 @@ func generateSnapshotConfigMapKey(sf snapshotFile) string { return "local-" + name } -// pruneConfigMap drops the oldest entries from the configMap. -// Note that the actual snapshot files are not removed, just the entries that track them in the configmap. -func pruneConfigMap(snapshotConfigMap *v1.ConfigMap, pruneCount int) error { - if pruneCount > len(snapshotConfigMap.Data) { - return errors.New("unable to reduce snapshot ConfigMap size by eliding old snapshots") +// generateSnapshotName generates a derived name for the snapshot that is safe for use +// as a resource name. 
+func generateSnapshotName(sf snapshotFile) string {
+	name := strings.ToLower(sf.Name)
+	nodename := sf.nodeSource
+	if nodename == "" {
+		nodename = sf.NodeName
 	}
-
-	var snapshotFiles []snapshotFile
-	retention := len(snapshotConfigMap.Data) - pruneCount
-	for name := range snapshotConfigMap.Data {
-		basename, compressed := strings.CutSuffix(name, compressedExtension)
-		ts, _ := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64)
-		snapshotFiles = append(snapshotFiles, snapshotFile{Name: name, CreatedAt: &metav1.Time{Time: time.Unix(ts, 0)}, Compressed: compressed})
+	// Include a digest of the hostname and location to ensure unique resource
+	// names. Snapshots should already include the hostname, but this ensures we
+	// don't accidentally hide records if a snapshot with the same name somehow
+	// exists on multiple nodes.
+	digest := sha256.Sum256([]byte(nodename + sf.Location))
+	// If the lowercase filename isn't usable as a resource name, or isn't short enough to leave room for a prefix and suffix,
+	// generate a safe name derived from the hostname and timestamp.
+	if errs := validation.IsDNS1123Subdomain(name); len(errs) != 0 || len(name)+13 > validation.DNS1123SubdomainMaxLength {
+		nodename, _, _ := strings.Cut(nodename, ".")
+		name = fmt.Sprintf("etcd-snapshot-%s-%d", nodename, sf.CreatedAt.Unix())
+		if sf.Compressed {
+			name += compressedExtension
+		}
 	}
+	if sf.NodeName == "s3" {
+		return "s3-" + name + "-" + hex.EncodeToString(digest[0:])[0:6]
+	}
+	return "local-" + name + "-" + hex.EncodeToString(digest[0:])[0:6]
+}
 
-	// sort newest-first so we can prune entries past the retention count
-	sort.Slice(snapshotFiles, func(i, j int) bool {
-		return snapshotFiles[j].CreatedAt.Before(snapshotFiles[i].CreatedAt)
-	})
+// generateETCDSnapshotFileConfigMapKey generates a key that the corresponding
+// snapshotFile would be stored under in the legacy configmap
+func generateETCDSnapshotFileConfigMapKey(esf apisv1.ETCDSnapshotFile) string {
+	name := invalidKeyChars.ReplaceAllString(esf.Spec.SnapshotName, "_")
+	if esf.Spec.S3 != nil {
+		return "s3-" + name
+	}
+	return "local-" + name
+}
 
-	for _, snapshotFile := range snapshotFiles[retention:] {
-		delete(snapshotConfigMap.Data, snapshotFile.Name)
+func (e *ETCD) emitEvent(esf *apisv1.ETCDSnapshotFile) {
+	switch {
+	case e.config.Runtime.Event == nil:
+	case !esf.DeletionTimestamp.IsZero():
+		e.config.Runtime.Event.Eventf(esf, v1.EventTypeNormal, "ETCDSnapshotDeleted", "Snapshot %s deleted", esf.Spec.SnapshotName)
+	case esf.Status.Error != nil:
+		message := fmt.Sprintf("Failed to save snapshot %s on %s", esf.Spec.SnapshotName, esf.Spec.NodeName)
+		if esf.Status.Error.Message != nil {
+			message += ": " + *esf.Status.Error.Message
+		}
+		e.config.Runtime.Event.Event(esf, v1.EventTypeWarning, "ETCDSnapshotFailed", message)
+	default:
+		e.config.Runtime.Event.Eventf(esf, v1.EventTypeNormal, "ETCDSnapshotCreated", "Snapshot %s saved on %s", esf.Spec.SnapshotName, esf.Spec.NodeName)
 	}
-	return nil
 }
 
-// ReconcileSnapshotData reconciles snapshot data in the snapshot ConfigMap.
+// ReconcileSnapshotData reconciles snapshot data in the ETCDSnapshotFile resources.
 // It will reconcile snapshot data from disk locally always, and if S3 is enabled, will attempt to list S3 snapshots
 // and reconcile snapshots from S3.
func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error { @@ -682,167 +717,171 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error { runtime.Gosched() } - logrus.Infof("Reconciling etcd snapshot data in %s ConfigMap", snapshotConfigMapName) - defer logrus.Infof("Reconciliation of snapshot data in %s ConfigMap complete", snapshotConfigMapName) + logrus.Infof("Reconciling ETCDSnapshotFile resources") + defer logrus.Infof("Reconciliation of ETCDSnapshotFile resources complete") - pruneCount := pruneStepSize - var lastErr error - return retry.OnError(retry.DefaultBackoff, func(err error) bool { - return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err) || isTooLargeError(err) - }, func() error { - snapshotConfigMap, getErr := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) - if apierrors.IsNotFound(getErr) { - cm := &v1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: snapshotConfigMapName, - Namespace: metav1.NamespaceSystem, - }, - } - cm, err := e.config.Runtime.Core.Core().V1().ConfigMap().Create(cm) - if err != nil { - return err - } - snapshotConfigMap = cm - } + // Get snapshots from local filesystem + snapshotFiles, err := e.listLocalSnapshots() + if err != nil { + return err + } - logrus.Debugf("Attempting to reconcile etcd snapshot data for configmap generation %d", snapshotConfigMap.Generation) - if snapshotConfigMap.Data == nil { - snapshotConfigMap.Data = map[string]string{} - } + nodeNames := []string{os.Getenv("NODE_NAME")} - snapshotFiles, err := e.listLocalSnapshots() - if err != nil { + // Get snapshots from S3 + if e.config.EtcdS3 { + if err := e.initS3IfNil(ctx); err != nil { return err } - // s3ListSuccessful is set to true if we are successful at listing snapshots from S3 to eliminate accidental - // clobbering of S3 snapshots in the configmap due to misconfigured S3 credentials/details - var s3ListSuccessful bool - - if e.config.EtcdS3 { - if err := e.initS3IfNil(ctx); err != nil { - logrus.Warnf("Unable to initialize S3 client: %v", err) - return err + if s3Snapshots, err := e.s3.listSnapshots(ctx); err != nil { + logrus.Errorf("Error retrieving S3 snapshots for reconciliation: %v", err) + } else { + for k, v := range s3Snapshots { + snapshotFiles[k] = v } + nodeNames = append(nodeNames, "s3") + } + } - if s3Snapshots, err := e.s3.listSnapshots(ctx); err != nil { - logrus.Errorf("Error retrieving S3 snapshots for reconciliation: %v", err) - } else { - for k, v := range s3Snapshots { - snapshotFiles[k] = v + // Try to load metadata from the legacy configmap, in case any local or s3 snapshots + // were created by an old release that does not write the metadata alongside the snapshot file. + snapshotConfigMap, err := e.config.Runtime.Core.Core().V1().ConfigMap().Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + + if snapshotConfigMap != nil { + for sfKey, sf := range snapshotFiles { + logrus.Debugf("Found snapshotFile for %s with key %s", sf.Name, sfKey) + // if the configmap has data for this snapshot, and local metadata is empty, + // deserialize the value from the configmap and attempt to load it. 
+			if cmSnapshotValue := snapshotConfigMap.Data[sfKey]; cmSnapshotValue != "" && sf.Metadata == "" && sf.metadataSource == nil {
+				sfTemp := &snapshotFile{}
+				if err := json.Unmarshal([]byte(cmSnapshotValue), sfTemp); err != nil {
+					logrus.Warnf("Failed to unmarshal configmap data for snapshot %s: %v", sfKey, err)
+					continue
+				}
+				sf.Metadata = sfTemp.Metadata
+				snapshotFiles[sfKey] = sf
+			}
+		}
+	}
 
-	nodeName := os.Getenv("NODE_NAME")
+	labelSelector := &metav1.LabelSelector{
+		MatchExpressions: []metav1.LabelSelectorRequirement{{
+			Key:      labelStorageNode,
+			Operator: metav1.LabelSelectorOpIn,
+			Values:   nodeNames,
+		}},
+	}
 
-	// deletedSnapshots is a map[string]string where key is the configmap key and the value is the marshalled snapshot file
-	// it will be populated below with snapshots that are either from S3 or on the local node. Notably, deletedSnapshots will
-	// not contain snapshots that are in the "failed" status
-	deletedSnapshots := make(map[string]string)
-	// failedSnapshots is a slice of unmarshaled snapshot files sourced from the configmap
-	// These are stored unmarshaled so we can sort based on name.
-	var failedSnapshots []snapshotFile
-	var failedS3Snapshots []snapshotFile
+	selector, err := metav1.LabelSelectorAsSelector(labelSelector)
+	if err != nil {
+		return err
+	}
 
-	// remove entries for this node and s3 (if S3 is enabled) only
-	for k, v := range snapshotConfigMap.Data {
-		var sf snapshotFile
-		if err := json.Unmarshal([]byte(v), &sf); err != nil {
-			return err
+	// List all snapshots matching the selector
+	snapshots := e.config.Runtime.K3s.K3s().V1().ETCDSnapshotFile()
+	esfList, err := snapshots.List(metav1.ListOptions{LabelSelector: selector.String()})
+	if err != nil {
+		return err
+	}
+
+	// If a snapshot from Kubernetes was found on disk/s3, it is in sync and we can remove it from the map to sync.
+	// If a snapshot from Kubernetes was not found on disk/s3, it is gone and can be removed from Kubernetes.
+	// The one exception to the last rule is failed snapshots - these must be retained for a period of time.
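+	// Failed snapshots are only removed once errorTTL has elapsed since the
+	// recorded failure time.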
+ for _, esf := range esfList.Items { + sfKey := generateETCDSnapshotFileConfigMapKey(esf) + logrus.Debugf("Found ETCDSnapshotFile for %s with key %s", esf.Spec.SnapshotName, sfKey) + if sf, ok := snapshotFiles[sfKey]; ok && generateSnapshotName(sf) == esf.Name { + // exists in both and names match, don't need to sync + delete(snapshotFiles, sfKey) + } else { + // doesn't exist on disk - if it's an error that hasn't expired yet, leave it, otherwise remove it + if esf.Status.Error != nil && esf.Status.Error.Time != nil { + expires := esf.Status.Error.Time.Add(errorTTL) + if time.Now().Before(expires) { + continue + } + } + if ok { + logrus.Debugf("Name of ETCDSnapshotFile for snapshotFile with key %s does not match: %s vs %s", sfKey, generateSnapshotName(sf), esf.Name) + } else { + logrus.Debugf("Key %s not found in snapshotFile list", sfKey) } - if (sf.NodeName == nodeName || (sf.NodeName == "s3" && s3ListSuccessful)) && sf.Status != failedSnapshotStatus { - // Only delete the snapshot if the snapshot was not failed - // sf.Status != FailedSnapshotStatus is intentional, as it is possible we are reconciling snapshots stored from older versions that did not set status - deletedSnapshots[generateSnapshotConfigMapKey(sf)] = v // store a copy of the snapshot - delete(snapshotConfigMap.Data, k) - } else if sf.Status == failedSnapshotStatus && sf.NodeName == nodeName && e.config.EtcdSnapshotRetention >= 1 { - // Handle locally failed snapshots. - failedSnapshots = append(failedSnapshots, sf) - delete(snapshotConfigMap.Data, k) - } else if sf.Status == failedSnapshotStatus && e.config.EtcdS3 && sf.NodeName == "s3" && strings.HasPrefix(sf.Name, e.config.EtcdSnapshotName+"-"+nodeName) && e.config.EtcdSnapshotRetention >= 1 { - // If we're operating against S3, we can clean up failed S3 snapshots that failed on this node. 
- failedS3Snapshots = append(failedS3Snapshots, sf) - delete(snapshotConfigMap.Data, k) + logrus.Infof("Deleting ETCDSnapshotFile for %s", esf.Spec.SnapshotName) + if err := snapshots.Delete(esf.Name, &metav1.DeleteOptions{}); err != nil { + logrus.Errorf("Failed to delete ETCDSnapshotFile: %v", err) } } + } - // Apply the failed snapshot retention policy to locally failed snapshots - if len(failedSnapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 { - // sort newest-first so we can record only the retention count - sort.Slice(failedSnapshots, func(i, j int) bool { - return failedSnapshots[j].CreatedAt.Before(failedSnapshots[i].CreatedAt) - }) - - for _, dfs := range failedSnapshots[:e.config.EtcdSnapshotRetention] { - sfKey := generateSnapshotConfigMapKey(dfs) - marshalledSnapshot, err := marshalSnapshotFile(dfs) - if err != nil { - logrus.Errorf("Failed to marshal snapshot to store in configmap %v", err) - } else { - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) - } - } + // Any snapshots remaining in the map from disk/s3 were not found in Kubernetes and need to be created + for _, sf := range snapshotFiles { + logrus.Infof("Creating ETCDSnapshotFile for %s", sf.Name) + if err := e.addSnapshotData(sf); err != nil { + logrus.Errorf("Failed to create ETCDSnapshotFile: %v", err) } + } - // Apply the failed snapshot retention policy to the S3 snapshots - if len(failedS3Snapshots) > 0 && e.config.EtcdSnapshotRetention >= 1 { - // sort newest-first so we can record only the retention count - sort.Slice(failedS3Snapshots, func(i, j int) bool { - return failedS3Snapshots[j].CreatedAt.Before(failedS3Snapshots[i].CreatedAt) - }) + // List all snapshots in Kubernetes not stored on S3 or a current etcd node. + // These snapshots are local to a node that no longer runs etcd and cannot be restored. + // If the node rejoins later and has local snapshots, it will reconcile them itself. + labelSelector.MatchExpressions[0].Operator = metav1.LabelSelectorOpNotIn + labelSelector.MatchExpressions[0].Values = []string{"s3"} - for _, dfs := range failedS3Snapshots[:e.config.EtcdSnapshotRetention] { - sfKey := generateSnapshotConfigMapKey(dfs) - marshalledSnapshot, err := marshalSnapshotFile(dfs) - if err != nil { - logrus.Errorf("Failed to marshal snapshot to store in configmap %v", err) - } else { - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) - } - } - } + // Get a list of all etcd nodes currently in the cluster and add them to the selector + nodes := e.config.Runtime.Core.Core().V1().Node() + etcdSelector := labels.Set{util.ETCDRoleLabelKey: "true"} + nodeList, err := nodes.List(metav1.ListOptions{LabelSelector: etcdSelector.String()}) + if err != nil { + return err + } - // save the local entries to the ConfigMap if they are still on disk or in S3. 
- for _, snapshot := range snapshotFiles { - var sf snapshotFile - sfKey := generateSnapshotConfigMapKey(snapshot) - if v, ok := deletedSnapshots[sfKey]; ok { - // use the snapshot file we have from the existing configmap, and unmarshal it so we can manipulate it - if err := json.Unmarshal([]byte(v), &sf); err != nil { - logrus.Errorf("Error unmarshaling snapshot file: %v", err) - // use the snapshot with info we sourced from disk/S3 (will be missing metadata, but something is better than nothing) - sf = snapshot - } - } else { - sf = snapshot - } + for _, node := range nodeList.Items { + labelSelector.MatchExpressions[0].Values = append(labelSelector.MatchExpressions[0].Values, node.Name) + } - sf.Status = successfulSnapshotStatus // if the snapshot is on disk or in S3, it was successful. - marshalledSnapshot, err := marshalSnapshotFile(sf) - if err != nil { - logrus.Warnf("Failed to marshal snapshot metadata %s to store in configmap, received error: %v", sf.Name, err) - } else { - snapshotConfigMap.Data[sfKey] = string(marshalledSnapshot) - } - } + selector, err = metav1.LabelSelectorAsSelector(labelSelector) + if err != nil { + return err + } - // If the configmap update was rejected due to size, drop the oldest entries from the map. - // We will continue to remove an increasing number of old snapshots from the map until the request succeeds, - // or the number we would attempt to remove exceeds the number stored. - if isTooLargeError(lastErr) { - logrus.Warnf("Snapshot configmap is too large, attempting to elide %d oldest snapshots from list", pruneCount) - if err := pruneConfigMap(snapshotConfigMap, pruneCount); err != nil { - return err - } - pruneCount += pruneStepSize + // List and remove all snapshots stored on nodes that do not match the selector + esfList, err = snapshots.List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + + for _, esf := range esfList.Items { + if err := snapshots.Delete(esf.Name, &metav1.DeleteOptions{}); err != nil { + logrus.Errorf("Failed to delete ETCDSnapshotFile for non-etcd node %s: %v", esf.Spec.NodeName, err) } + } - logrus.Debugf("Updating snapshot ConfigMap (%s) with %d entries", snapshotConfigMapName, len(snapshotConfigMap.Data)) - _, lastErr = e.config.Runtime.Core.Core().V1().ConfigMap().Update(snapshotConfigMap) - return lastErr - }) + // Update our Node object to note the timestamp of the snapshot storages that have been reconciled + now := time.Now().Round(time.Second).Format(time.RFC3339) + patch := []map[string]string{ + { + "op": "add", + "value": now, + "path": "/metadata/annotations/" + strings.ReplaceAll(annotationLocalReconciled, "/", "~1"), + }, + } + if e.config.EtcdS3 { + patch = append(patch, map[string]string{ + "op": "add", + "value": now, + "path": "/metadata/annotations/" + strings.ReplaceAll(annotationS3Reconciled, "/", "~1"), + }) + } + b, err := json.Marshal(patch) + if err != nil { + return err + } + _, err = nodes.Patch(nodeNames[0], types.JSONPatchType, b) + return err } // setSnapshotFunction schedules snapshots at the configured interval. 
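The `~1` in the patch paths above is JSON Pointer escaping: per RFC 6901, a `/` inside a key must be encoded as `~1` (and `~` as `~0`) so that it is not read as a path segment separator. A minimal standalone sketch of the same patch construction follows; `escapePointer` is a hypothetical helper, not part of this change, and the expanded annotation key assumes `version.Program` is `k3s`:

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
	"time"
)

// escapePointer encodes a map key for use as a JSON Pointer segment
// (RFC 6901): "~" must be escaped first as "~0", then "/" as "~1".
func escapePointer(key string) string {
	key = strings.ReplaceAll(key, "~", "~0")
	return strings.ReplaceAll(key, "/", "~1")
}

func main() {
	// Annotation key in the same form as the ones used above.
	anno := "etcd.k3s.cattle.io/local-snapshots-timestamp"
	patch := []map[string]string{{
		"op":    "add",
		"path":  "/metadata/annotations/" + escapePointer(anno),
		"value": time.Now().Round(time.Second).Format(time.RFC3339),
	}}
	b, _ := json.Marshal(patch)
	// The marshalled document is suitable for a types.JSONPatchType patch.
	fmt.Println(string(b))
}
```

Note that per RFC 6902 an `add` operation fails if the parent of the target path does not exist, so a patch of this form assumes the Node object already carries at least one annotation.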
@@ -866,7 +905,7 @@ func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string) return nil } - logrus.Infof("Applying local snapshot retention policy: retention: %d, snapshotPrefix: %s, directory: %s", retention, snapshotPrefix, snapshotDir) + logrus.Infof("Applying snapshot retention=%d to local snapshots with prefix %s in %s", retention, snapshotPrefix, snapshotDir) var snapshotFiles []snapshotFile if err := filepath.Walk(snapshotDir, func(path string, info os.FileInfo, err error) error { @@ -909,11 +948,6 @@ func snapshotRetention(retention int, snapshotPrefix string, snapshotDir string) return nil } -func isTooLargeError(err error) bool { - // There are no helpers for unpacking field validation errors, so we just check for "Too long" in the error string. - return apierrors.IsRequestEntityTooLargeError(err) || (apierrors.IsInvalid(err) && strings.Contains(err.Error(), "Too long")) -} - func isNotExist(err error) bool { if resp := minio.ToErrorResponse(err); resp.StatusCode == http.StatusNotFound || os.IsNotExist(err) { return true @@ -941,3 +975,123 @@ func saveSnapshotMetadata(snapshotPath string, extraMetadata *v1.ConfigMap) erro } return os.WriteFile(metadataPath, m, 0700) } + +func (sf *snapshotFile) fromETCDSnapshotFile(esf *apisv1.ETCDSnapshotFile) { + if esf == nil { + panic("cannot convert from nil ETCDSnapshotFile") + } + + sf.Name = esf.Spec.SnapshotName + sf.Location = esf.Spec.Location + sf.CreatedAt = esf.Status.CreationTime + sf.nodeSource = esf.Spec.NodeName + sf.Compressed = strings.HasSuffix(esf.Spec.SnapshotName, compressedExtension) + + if esf.Status.ReadyToUse != nil && *esf.Status.ReadyToUse { + sf.Status = successfulSnapshotStatus + } else { + sf.Status = failedSnapshotStatus + } + + if esf.Status.Size != nil { + sf.Size = esf.Status.Size.Value() + } + + if esf.Status.Error != nil { + if esf.Status.Error.Time != nil { + sf.CreatedAt = esf.Status.Error.Time + } + message := "etcd snapshot failed" + if esf.Status.Error.Message != nil { + message = *esf.Status.Error.Message + } + sf.Message = base64.StdEncoding.EncodeToString([]byte(message)) + } + + if len(esf.Spec.Metadata) > 0 { + if b, err := json.Marshal(esf.Spec.Metadata); err != nil { + logrus.Warnf("Failed to marshal metadata for %s: %v", esf.Name, err) + } else { + sf.Metadata = base64.StdEncoding.EncodeToString(b) + } + } + + if esf.Spec.S3 == nil { + sf.NodeName = esf.Spec.NodeName + } else { + sf.NodeName = "s3" + sf.S3 = &s3Config{ + Endpoint: esf.Spec.S3.Endpoint, + EndpointCA: esf.Spec.S3.EndpointCA, + SkipSSLVerify: esf.Spec.S3.SkipSSLVerify, + Bucket: esf.Spec.S3.Bucket, + Region: esf.Spec.S3.Region, + Folder: esf.Spec.S3.Prefix, + Insecure: esf.Spec.S3.Insecure, + } + } +} + +func (sf *snapshotFile) toETCDSnapshotFile(esf *apisv1.ETCDSnapshotFile) { + if esf == nil { + panic("cannot convert to nil ETCDSnapshotFile") + } + esf.Spec.SnapshotName = sf.Name + esf.Spec.Location = sf.Location + esf.Status.CreationTime = sf.CreatedAt + esf.Status.ReadyToUse = pointer.Bool(sf.Status == successfulSnapshotStatus) + esf.Status.Size = resource.NewQuantity(sf.Size, resource.DecimalSI) + + if sf.nodeSource != "" { + esf.Spec.NodeName = sf.nodeSource + } else { + esf.Spec.NodeName = sf.NodeName + } + + if sf.Message != "" { + var message string + b, err := base64.StdEncoding.DecodeString(sf.Message) + if err != nil { + logrus.Warnf("Failed to decode error message for %s: %v", sf.Name, err) + message = "etcd snapshot failed" + } else { + message = string(b) + } + esf.Status.Error = 
&apisv1.ETCDSnapshotError{ + Time: sf.CreatedAt, + Message: &message, + } + } + + if sf.metadataSource != nil { + esf.Spec.Metadata = sf.metadataSource.Data + } else if sf.Metadata != "" { + metadata, err := base64.StdEncoding.DecodeString(sf.Metadata) + if err != nil { + logrus.Warnf("Failed to decode metadata for %s: %v", sf.Name, err) + } else { + if err := json.Unmarshal(metadata, &esf.Spec.Metadata); err != nil { + logrus.Warnf("Failed to unmarshal metadata for %s: %v", sf.Name, err) + } + } + } + + if esf.ObjectMeta.Labels == nil { + esf.ObjectMeta.Labels = map[string]string{} + } + + if sf.S3 == nil { + esf.ObjectMeta.Labels[labelStorageNode] = esf.Spec.NodeName + } else { + esf.ObjectMeta.Labels[labelStorageNode] = "s3" + esf.Spec.S3 = &apisv1.ETCDSnapshotS3{ + Endpoint: sf.S3.Endpoint, + EndpointCA: sf.S3.EndpointCA, + SkipSSLVerify: sf.S3.SkipSSLVerify, + Bucket: sf.S3.Bucket, + Region: sf.S3.Region, + Prefix: sf.S3.Folder, + Insecure: sf.S3.Insecure, + } + } +} diff --git a/pkg/etcd/snapshot_controller.go b/pkg/etcd/snapshot_controller.go new file mode 100644 index 000000000000..7da376741b40 --- /dev/null +++ b/pkg/etcd/snapshot_controller.go @@ -0,0 +1,312 @@ +package etcd + +import ( + "context" + "sort" + "strconv" + "strings" + "time" + + apisv1 "github.com/k3s-io/k3s/pkg/apis/k3s.cattle.io/v1" + controllersv1 "github.com/k3s-io/k3s/pkg/generated/controllers/k3s.cattle.io/v1" + "github.com/k3s-io/k3s/pkg/util" + "github.com/k3s-io/k3s/pkg/version" + "github.com/pkg/errors" + controllerv1 "github.com/rancher/wrangler/pkg/generated/controllers/core/v1" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" + + "github.com/sirupsen/logrus" +) + +const ( + pruneStepSize = 4 + reconcileKey = "_reconcile_" + reconcileInterval = 600 * time.Minute +) + +var ( + snapshotConfigMapName = version.Program + "-etcd-snapshots" +) + +type etcdSnapshotHandler struct { + ctx context.Context + etcd *ETCD + snapshots controllersv1.ETCDSnapshotFileController + configmaps controllerv1.ConfigMapController +} + +func registerSnapshotHandlers(ctx context.Context, etcd *ETCD) { + snapshots := etcd.config.Runtime.K3s.K3s().V1().ETCDSnapshotFile() + e := &etcdSnapshotHandler{ + ctx: ctx, + etcd: etcd, + snapshots: snapshots, + configmaps: etcd.config.Runtime.Core.Core().V1().ConfigMap(), + } + + logrus.Infof("Starting managed etcd snapshot ConfigMap controller") + snapshots.OnChange(ctx, "managed-etcd-snapshots-controller", e.sync) + snapshots.OnRemove(ctx, "managed-etcd-snapshots-controller", e.onRemove) + go wait.JitterUntil(func() { snapshots.Enqueue(reconcileKey) }, reconcileInterval, 0.04, false, ctx.Done()) +} + +func (e *etcdSnapshotHandler) sync(key string, esf *apisv1.ETCDSnapshotFile) (*apisv1.ETCDSnapshotFile, error) { + if key == reconcileKey { + return nil, e.reconcile() + } + if esf == nil || !esf.DeletionTimestamp.IsZero() { + return nil, nil + } + + sf := snapshotFile{} + sf.fromETCDSnapshotFile(esf) + sfKey := generateSnapshotConfigMapKey(sf) + m, err := marshalSnapshotFile(sf) + if err != nil { + return nil, errors.Wrap(err, "failed to marshal snapshot ConfigMap data") + } + marshalledSnapshot := string(m) + + snapshotConfigMap, err := e.configmaps.Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) + if err != nil { + if !apierrors.IsNotFound(err) { + 
return nil, errors.Wrap(err, "failed to get snapshot ConfigMap") + } + snapshotConfigMap = &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: snapshotConfigMapName, + Namespace: metav1.NamespaceSystem, + }, + } + } + + if snapshotConfigMap.Data[sfKey] != marshalledSnapshot { + if snapshotConfigMap.Data == nil { + snapshotConfigMap.Data = map[string]string{} + } + snapshotConfigMap.Data[sfKey] = marshalledSnapshot + + // Try to create or update the ConfigMap. If it is too large, prune old entries + // until it fits, or until it cannot be pruned any further. + pruneCount := pruneStepSize + err = retry.OnError(snapshotDataBackoff, isTooLargeError, func() (err error) { + if snapshotConfigMap.CreationTimestamp.IsZero() { + _, err = e.configmaps.Create(snapshotConfigMap) + } else { + _, err = e.configmaps.Update(snapshotConfigMap) + } + + if isTooLargeError(err) { + logrus.Warnf("Snapshot ConfigMap is too large, attempting to elide %d of %d entries to reduce size", pruneCount, len(snapshotConfigMap.Data)) + if perr := pruneConfigMap(snapshotConfigMap, pruneCount); perr != nil { + err = perr + } + // if the entry we're trying to add just got pruned, give up on adding it, + // as it is always going to get pushed off due to being too old to keep. + if _, ok := snapshotConfigMap.Data[sfKey]; !ok { + logrus.Warnf("Snapshot %s has been elided from ConfigMap to reduce size; not requeuing", key) + return nil + } + + pruneCount += pruneStepSize + } + return err + }) + } + + if err != nil { + err = errors.Wrap(err, "failed to sync snapshot to ConfigMap") + } + + return nil, err +} + +func (e *etcdSnapshotHandler) onRemove(key string, esf *apisv1.ETCDSnapshotFile) (*apisv1.ETCDSnapshotFile, error) { + if esf == nil { + return nil, nil + } + snapshotConfigMap, err := e.configmaps.Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + return nil, nil + } + return nil, errors.Wrap(err, "failed to get snapshot ConfigMap") + } + + sfKey := generateETCDSnapshotFileConfigMapKey(*esf) + if _, ok := snapshotConfigMap.Data[sfKey]; ok { + delete(snapshotConfigMap.Data, sfKey) + if _, err := e.configmaps.Update(snapshotConfigMap); err != nil { + return nil, errors.Wrap(err, "failed to remove snapshot from ConfigMap") + } + } + e.etcd.emitEvent(esf) + return nil, nil +} + +func (e *etcdSnapshotHandler) reconcile() error { + logrus.Infof("Reconciling snapshot ConfigMap data") + + snapshotConfigMap, err := e.configmaps.Get(metav1.NamespaceSystem, snapshotConfigMapName, metav1.GetOptions{}) + if err != nil { + if !apierrors.IsNotFound(err) { + return errors.Wrap(err, "failed to get snapshot ConfigMap") + } + snapshotConfigMap = &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: snapshotConfigMapName, + Namespace: metav1.NamespaceSystem, + }, + } + } + + // Get a list of all etcd nodes currently in the cluster. + // We will use this list to prune local entries for any node that does not exist. + nodes := e.etcd.config.Runtime.Core.Core().V1().Node() + etcdSelector := labels.Set{util.ETCDRoleLabelKey: "true"} + nodeList, err := nodes.List(metav1.ListOptions{LabelSelector: etcdSelector.String()}) + if err != nil { + return err + } + + // Once a node has set the reconcile annotation, it is considered to have + // migrated to using ETCDSnapshotFile resources, and any old configmap + // entries for it can be pruned. Until the annotation is set, we will leave + // its entries alone. 
+	syncedNodes := map[string]bool{}
+	for _, node := range nodeList.Items {
+		if _, ok := node.Annotations[annotationLocalReconciled]; ok {
+			syncedNodes[node.Name] = true
+		}
+		if _, ok := node.Annotations[annotationS3Reconciled]; ok {
+			syncedNodes["s3"] = true
+		}
+	}
+
+	if len(syncedNodes) == 0 {
+		return errors.New("no nodes have reconciled ETCDSnapshotFile resources")
+	}
+
+	// Get a list of existing snapshots
+	snapshotList, err := e.snapshots.List(metav1.ListOptions{})
+	if err != nil {
+		return err
+	}
+
+	snapshots := map[string]*apisv1.ETCDSnapshotFile{}
+	for i := range snapshotList.Items {
+		esf := &snapshotList.Items[i]
+		if esf.DeletionTimestamp.IsZero() {
+			sfKey := generateETCDSnapshotFileConfigMapKey(*esf)
+			snapshots[sfKey] = esf
+		}
+	}
+
+	// Make a copy of the configmap for change detection
+	existing := snapshotConfigMap.DeepCopyObject()
+
+	// Delete any keys missing from synced storages, or associated with missing nodes
+	for key := range snapshotConfigMap.Data {
+		if strings.HasPrefix(key, "s3-") {
+			// If a node has synced s3 and the key is missing then delete it
+			if syncedNodes["s3"] && snapshots[key] == nil {
+				delete(snapshotConfigMap.Data, key)
+			}
+		} else if s, ok := strings.CutPrefix(key, "local-"); ok {
+			// If a matching node has synced and the key is missing then delete it
+			// If a matching node does not exist, delete the key
+			// A node is considered to match the snapshot if the snapshot name matches the node name
+			// after trimming the leading local- prefix and trailing timestamp and extension.
+			s, _ = strings.CutSuffix(s, ".zip")
+			s = strings.TrimRight(s, "-0123456789")
+			var matchingNode bool
+			for _, node := range nodeList.Items {
+				if strings.HasSuffix(s, node.Name) {
+					if syncedNodes[node.Name] && snapshots[key] == nil {
+						delete(snapshotConfigMap.Data, key)
+					}
+					matchingNode = true
+					break
+				}
+			}
+			if !matchingNode {
+				delete(snapshotConfigMap.Data, key)
+			}
+		}
+	}
+
+	// Ensure keys for existing snapshots. The Data map may be nil if the
+	// ConfigMap did not exist yet or was empty, so initialize it before use.
+	if snapshotConfigMap.Data == nil {
+		snapshotConfigMap.Data = map[string]string{}
+	}
+	for sfKey, esf := range snapshots {
+		sf := snapshotFile{}
+		sf.fromETCDSnapshotFile(esf)
+		m, err := marshalSnapshotFile(sf)
+		if err != nil {
+			logrus.Warnf("Failed to marshal snapshot ConfigMap data for %s", sfKey)
+			continue
+		}
+		marshalledSnapshot := string(m)
+		snapshotConfigMap.Data[sfKey] = marshalledSnapshot
+	}
+
+	// If the configmap didn't change, don't bother updating it
+	if equality.Semantic.DeepEqual(existing, snapshotConfigMap) {
+		return nil
+	}
+
+	// Try to create or update the ConfigMap. If it is too large, prune old entries
+	// until it fits, or until it cannot be pruned any further.
+	pruneCount := pruneStepSize
+	return retry.OnError(snapshotDataBackoff, isTooLargeError, func() (err error) {
+		if snapshotConfigMap.CreationTimestamp.IsZero() {
+			_, err = e.configmaps.Create(snapshotConfigMap)
+		} else {
+			_, err = e.configmaps.Update(snapshotConfigMap)
+		}
+
+		if isTooLargeError(err) {
+			logrus.Warnf("Snapshot ConfigMap is too large, attempting to elide %d of %d entries to reduce size", pruneCount, len(snapshotConfigMap.Data))
+			if perr := pruneConfigMap(snapshotConfigMap, pruneCount); perr != nil {
+				err = perr
+			}
+			pruneCount += pruneStepSize
+		}
+		return err
+	})
+}
+
+// pruneConfigMap drops the oldest entries from the configMap.
+// Note that the actual snapshot files are not removed, just the entries that track them in the configmap.
+func pruneConfigMap(snapshotConfigMap *v1.ConfigMap, pruneCount int) error {
+	if pruneCount >= len(snapshotConfigMap.Data) {
+		return errors.New("unable to reduce snapshot ConfigMap size by eliding old snapshots")
+	}
+
+	var snapshotFiles []snapshotFile
+	retention := len(snapshotConfigMap.Data) - pruneCount
+	for name := range snapshotConfigMap.Data {
+		basename, compressed := strings.CutSuffix(name, compressedExtension)
+		ts, _ := strconv.ParseInt(basename[strings.LastIndexByte(basename, '-')+1:], 10, 64)
+		snapshotFiles = append(snapshotFiles, snapshotFile{Name: name, CreatedAt: &metav1.Time{Time: time.Unix(ts, 0)}, Compressed: compressed})
+	}
+
+	// sort newest-first so we can prune entries past the retention count
+	sort.Slice(snapshotFiles, func(i, j int) bool {
+		return snapshotFiles[j].CreatedAt.Before(snapshotFiles[i].CreatedAt)
+	})
+
+	for _, snapshotFile := range snapshotFiles[retention:] {
+		delete(snapshotConfigMap.Data, snapshotFile.Name)
+	}
+	return nil
+}
+
+func isTooLargeError(err error) bool {
+	// There are no helpers for unpacking field validation errors, so we just check for "Too long" in the error string.
+	return apierrors.IsRequestEntityTooLargeError(err) || (apierrors.IsInvalid(err) && strings.Contains(err.Error(), "Too long"))
+}

From 87cc6a299ded94d730fda972f70d5c42ed8eb5db Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Tue, 10 Oct 2023 01:06:32 +0000
Subject: [PATCH 13/14] Add server token hash to CR and S3

This required pulling the token hash helpers out of the cluster package,
into util.

Signed-off-by: Brad Davidson
---
 docs/adrs/etcd-snapshot-cr.md |  9 +++--
 pkg/cluster/bootstrap.go      |  5 +--
 pkg/cluster/encrypt.go        | 11 +------
 pkg/cluster/storage.go        | 51 +++++-----------------------
 pkg/etcd/s3.go                | 12 +++++++
 pkg/etcd/snapshot.go          | 20 +++++++++++
 pkg/util/token.go             | 62 +++++++++++++++++++++++++++++++++++
 7 files changed, 112 insertions(+), 58 deletions(-)

diff --git a/docs/adrs/etcd-snapshot-cr.md b/docs/adrs/etcd-snapshot-cr.md
index 369cbdba64d5..d4454df7f2aa 100644
--- a/docs/adrs/etcd-snapshot-cr.md
+++ b/docs/adrs/etcd-snapshot-cr.md
@@ -45,10 +45,13 @@ it into a neutral project for use by both projects.
 3. The new Custom Resource will be cluster-scoped, as etcd and its snapshots are a cluster-level resource.
 4. Snapshot metadata will also be written alongside snapshot files created on disk and/or uploaded to S3. The metadata
    files will have the same basename as their corresponding snapshot file.
-5. Downstream consumers of etcd snapshot lists will migrate to watching Custom Resource types, instead of the ConfigMap.
-6. K3s will observe a three minor version transition period, where both the new Custom Resources, and the existing
+5. A hash of the server token will be stored as an annotation on the Custom Resource, and stored as metadata on snapshots uploaded to S3.
+   This hash should be compared to a hash of the current server token, to determine whether the token must be rolled back as part of the
+   snapshot restore process.
+6. Downstream consumers of etcd snapshot lists will migrate to watching Custom Resource types, instead of the ConfigMap.
+7. K3s will observe a three minor version transition period, where both the new Custom Resources, and the existing
   ConfigMap, will both be used.
-7. During the transition period, older snapshot metadata may be removed from the ConfigMap while those snapshots still
+8. 
    exist and are referenced by new Custom Resources, if the ConfigMap exceeds a preset size or key count limit.
 
 ## Consequences
 
diff --git a/pkg/cluster/bootstrap.go b/pkg/cluster/bootstrap.go
index 4a5e636a21c8..a0f804564931 100644
--- a/pkg/cluster/bootstrap.go
+++ b/pkg/cluster/bootstrap.go
@@ -19,6 +19,7 @@ import (
 	"github.com/k3s-io/k3s/pkg/clientaccess"
 	"github.com/k3s-io/k3s/pkg/daemons/config"
 	"github.com/k3s-io/k3s/pkg/etcd"
+	"github.com/k3s-io/k3s/pkg/util"
 	"github.com/k3s-io/k3s/pkg/version"
 	"github.com/k3s-io/kine/pkg/client"
 	"github.com/k3s-io/kine/pkg/endpoint"
@@ -248,7 +249,7 @@ func (c *Cluster) ReconcileBootstrapData(ctx context.Context, buf io.ReadSeeker,
 	if c.managedDB != nil && !isHTTP {
 		token := c.config.Token
 		if token == "" {
-			tokenFromFile, err := readTokenFromFile(c.config.Runtime.ServerToken, c.config.Runtime.ServerCA, c.config.DataDir)
+			tokenFromFile, err := util.ReadTokenFromFile(c.config.Runtime.ServerToken, c.config.Runtime.ServerCA, c.config.DataDir)
 			if err != nil {
 				return err
 			}
@@ -260,7 +261,7 @@ func (c *Cluster) ReconcileBootstrapData(ctx context.Context, buf io.ReadSeeker,
 			token = tokenFromFile
 		}
 
-		normalizedToken, err := normalizeToken(token)
+		normalizedToken, err := util.NormalizeToken(token)
 		if err != nil {
 			return err
 		}
diff --git a/pkg/cluster/encrypt.go b/pkg/cluster/encrypt.go
index 1046d61e1a8b..b39fdc151370 100644
--- a/pkg/cluster/encrypt.go
+++ b/pkg/cluster/encrypt.go
@@ -5,9 +5,7 @@ import (
 	"crypto/cipher"
 	"crypto/rand"
 	"crypto/sha1"
-	"crypto/sha256"
 	"encoding/base64"
-	"encoding/hex"
 	"fmt"
 	"io"
 	"strings"
 
 // storageKey returns the etcd key for storing bootstrap data for a given passphrase.
 // The key is derived from the sha256 hash of the passphrase.
 func storageKey(passphrase string) string {
-	return "/bootstrap/" + keyHash(passphrase)
-}
-
-// keyHash returns the first 12 characters of the sha256 sum of the passphrase.
-func keyHash(passphrase string) string {
-	d := sha256.New()
-	d.Write([]byte(passphrase))
-	return hex.EncodeToString(d.Sum(nil)[:])[:12]
+	return "/bootstrap/" + util.ShortHash(passphrase, 12)
 }
 
 // encrypt encrypts a byte slice using aes+gcm with a pbkdf2 key derived from the passphrase and a random salt.
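The storageKey change above is behavior-preserving only if `util.ShortHash` (added later in this patch, in pkg/util/token.go) agrees with the removed `keyHash`. A quick self-contained check of that equivalence; `example-token` is an arbitrary passphrase chosen for illustration, not a real token.

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
)

// oldKeyHash mirrors the removed keyHash: the first 12 hex characters of the
// sha256 sum of the passphrase.
func oldKeyHash(passphrase string) string {
	d := sha256.New()
	d.Write([]byte(passphrase))
	return hex.EncodeToString(d.Sum(nil)[:])[:12]
}

// newShortHash mirrors util.ShortHash as introduced later in this patch.
func newShortHash(s string, i int) string {
	digest := sha256.Sum256([]byte(s))
	return hex.EncodeToString(digest[:])[:i]
}

func main() {
	p := "example-token"
	fmt.Println(oldKeyHash(p) == newShortHash(p, 12)) // true: bootstrap keys are unchanged
	fmt.Println("/bootstrap/" + newShortHash(p, 12))  // the derived etcd storage key
}
```

If the two disagreed, existing clusters would be unable to locate their encrypted bootstrap data after upgrading, so this equivalence is load-bearing.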
diff --git a/pkg/cluster/storage.go b/pkg/cluster/storage.go
index 70e3961fdd23..549291961253 100644
--- a/pkg/cluster/storage.go
+++ b/pkg/cluster/storage.go
@@ -4,13 +4,11 @@ import (
 	"bytes"
 	"context"
 	"errors"
-	"os"
-	"path/filepath"
 	"time"
 
 	"github.com/k3s-io/k3s/pkg/bootstrap"
-	"github.com/k3s-io/k3s/pkg/clientaccess"
 	"github.com/k3s-io/k3s/pkg/daemons/config"
+	"github.com/k3s-io/k3s/pkg/util"
 	"github.com/k3s-io/kine/pkg/client"
 	"github.com/sirupsen/logrus"
 	"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
@@ -23,12 +21,12 @@ const maxBootstrapWaitAttempts = 5
 
 func RotateBootstrapToken(ctx context.Context, config *config.Control, oldToken string) error {
 
-	token, err := readTokenFromFile(config.Runtime.ServerToken, config.Runtime.ServerCA, config.DataDir)
+	token, err := util.ReadTokenFromFile(config.Runtime.ServerToken, config.Runtime.ServerCA, config.DataDir)
 	if err != nil {
 		return err
 	}
 
-	normalizedToken, err := normalizeToken(token)
+	normalizedToken, err := util.NormalizeToken(token)
 	if err != nil {
 		return err
 	}
@@ -52,7 +50,7 @@ func RotateBootstrapToken(ctx context.Context, config *config.Control, oldToken
 		return err
 	}
 
-	normalizedOldToken, err := normalizeToken(oldToken)
+	normalizedOldToken, err := util.NormalizeToken(oldToken)
 	if err != nil {
 		return err
 	}
@@ -76,13 +74,13 @@ func Save(ctx context.Context, config *config.Control, override bool) error {
 	}
 	token := config.Token
 	if token == "" {
-		tokenFromFile, err := readTokenFromFile(config.Runtime.ServerToken, config.Runtime.ServerCA, config.DataDir)
+		tokenFromFile, err := util.ReadTokenFromFile(config.Runtime.ServerToken, config.Runtime.ServerCA, config.DataDir)
 		if err != nil {
 			return err
 		}
 		token = tokenFromFile
 	}
-	normalizedToken, err := normalizeToken(token)
+	normalizedToken, err := util.NormalizeToken(token)
 	if err != nil {
 		return err
 	}
@@ -165,7 +163,7 @@ func (c *Cluster) storageBootstrap(ctx context.Context) error {
 
 	token := c.config.Token
 	if token == "" {
-		tokenFromFile, err := readTokenFromFile(c.config.Runtime.ServerToken, c.config.Runtime.ServerCA, c.config.DataDir)
+		tokenFromFile, err := util.ReadTokenFromFile(c.config.Runtime.ServerToken, c.config.Runtime.ServerCA, c.config.DataDir)
 		if err != nil {
 			return err
 		}
@@ -181,7 +179,7 @@ func (c *Cluster) storageBootstrap(ctx context.Context) error {
 		}
 		token = tokenFromFile
 	}
-	normalizedToken, err := normalizeToken(token)
+	normalizedToken, err := util.NormalizeToken(token)
 	if err != nil {
 		return err
 	}
@@ -288,39 +286,6 @@ func getBootstrapKeyFromStorage(ctx context.Context, storageClient client.Client
 	return nil, false, errors.New("bootstrap data already found and encrypted with different token")
 }
 
-// readTokenFromFile will attempt to get the token from <data-dir>/token if it the file not found
-// in case of fresh installation it will try to use the runtime serverToken saved in memory
-// after stripping it from any additional information like the username or cahash, if the file
-// found then it will still strip the token from any additional info
-func readTokenFromFile(serverToken, certs, dataDir string) (string, error) {
-	tokenFile := filepath.Join(dataDir, "token")
-
-	b, err := os.ReadFile(tokenFile)
-	if err != nil {
-		if os.IsNotExist(err) {
-			token, err := clientaccess.FormatToken(serverToken, certs)
-			if err != nil {
-				return token, err
-			}
-			return token, nil
-		}
-		return "", err
-	}
-
-	// strip the token from any new line if its read from file
-	return string(bytes.TrimRight(b, "\n")), nil
-}
-
-// normalizeToken will normalize the token read from file or passed as a cli flag
-func normalizeToken(token string) (string, error) {
-	_, password, ok := clientaccess.ParseUsernamePassword(token)
-	if !ok {
-		return password, errors.New("failed to normalize server token; must be in format K10<CA-HASH>::<USERNAME>:<PASSWORD> or <PASSWORD>")
-	}
-
-	return password, nil
-}
-
 // migrateTokens will list all keys that has prefix /bootstrap and will check for key that is
 // hashed with empty string and keys that is hashed with old token format before normalizing
 // then migrate those and resave only with the normalized token
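The relocated helpers preserve a contract that is easy to miss in the diff: a full token in `K10<CA-HASH>::<USERNAME>:<PASSWORD>` form normalizes down to just the password, while a bare secret passes through unchanged, so both spellings hash identically. The sketch below is a simplified stand-in for the `clientaccess.ParseUsernamePassword` call that the real `NormalizeToken` delegates to; the token values are invented.

```go
package main

import (
	"fmt"
	"strings"
)

// normalize approximates util.NormalizeToken: reduce a
// K10<CA-HASH>::<USERNAME>:<PASSWORD> token to the bare password,
// and pass a bare secret through unchanged.
func normalize(token string) (string, error) {
	if !strings.HasPrefix(token, "K10") {
		return token, nil // already a bare password
	}
	_, after, ok := strings.Cut(token, "::")
	if !ok {
		return "", fmt.Errorf("malformed token: missing '::' separator")
	}
	_, password, ok := strings.Cut(after, ":")
	if !ok {
		return "", fmt.Errorf("malformed token: missing username:password")
	}
	return password, nil
}

func main() {
	for _, tok := range []string{"K10deadbeef::server:supersecret", "supersecret"} {
		normalized, _ := normalize(tok)
		fmt.Println(normalized) // "supersecret" in both cases
	}
}
```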
diff --git a/pkg/etcd/s3.go b/pkg/etcd/s3.go
index d96b536d29fb..3409337d0bb2 100644
--- a/pkg/etcd/s3.go
+++ b/pkg/etcd/s3.go
@@ -20,6 +20,7 @@ import (
 	"time"
 
 	"github.com/k3s-io/k3s/pkg/daemons/config"
+	"github.com/k3s-io/k3s/pkg/util"
 	"github.com/k3s-io/k3s/pkg/version"
 	"github.com/minio/minio-go/v7"
 	"github.com/minio/minio-go/v7/pkg/credentials"
@@ -31,6 +32,7 @@ import (
 
 var (
 	clusterIDKey = textproto.CanonicalMIMEHeaderKey(version.Program + "-cluster-id")
+	tokenHashKey = textproto.CanonicalMIMEHeaderKey(version.Program + "-token-hash")
 	nodeNameKey  = textproto.CanonicalMIMEHeaderKey(version.Program + "-node-name")
 )
 
@@ -39,6 +41,7 @@ type S3 struct {
 	config    *config.Control
 	client    *minio.Client
 	clusterID string
+	tokenHash string
 	nodeName  string
 }
 
@@ -109,10 +112,16 @@ func NewS3(ctx context.Context, config *config.Control) (*S3, error) {
 		clusterID = string(ns.UID)
 	}
 
+	tokenHash, err := util.GetTokenHash(config)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to get server token hash for etcd snapshot")
+	}
+
 	return &S3{
 		config:    config,
 		client:    c,
 		clusterID: clusterID,
+		tokenHash: tokenHash,
 		nodeName:  os.Getenv("NODE_NAME"),
 	}, nil
 }
@@ -154,6 +163,7 @@ func (s *S3) upload(ctx context.Context, snapshot string, extraMetadata *v1.Conf
 	} else {
 		sf.Status = successfulSnapshotStatus
 		sf.Size = uploadInfo.Size
+		sf.tokenHash = s.tokenHash
 	}
 	if _, err := s.uploadSnapshotMetadata(ctx, metadataKey, metadata); err != nil {
 		logrus.Warnf("Failed to upload snapshot metadata to S3: %v", err)
@@ -170,6 +180,7 @@ func (s *S3) uploadSnapshot(ctx context.Context, key, path string) (info minio.U
 		UserMetadata: map[string]string{
 			clusterIDKey: s.clusterID,
 			nodeNameKey:  s.nodeName,
+			tokenHashKey: s.tokenHash,
 		},
 	}
 	if strings.HasSuffix(key, compressedExtension) {
@@ -392,6 +403,7 @@ func (s *S3) listSnapshots(ctx context.Context) (map[string]snapshotFile, error)
 			Status:     successfulSnapshotStatus,
 			Compressed: compressed,
 			nodeSource: obj.UserMetadata[nodeNameKey],
+			tokenHash:  obj.UserMetadata[tokenHashKey],
 		}
 		sfKey := generateSnapshotConfigMapKey(sf)
 		snapshots[sfKey] = sf
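One subtlety in the S3 side of this change: user metadata keys round-trip through S3 as MIME headers, which is why `tokenHashKey` and its siblings are built with `textproto.CanonicalMIMEHeaderKey`; the key used for the lookup in `listSnapshots` must match the canonical form the client returns. A one-line demonstration, assuming `version.Program` is `k3s`:

```go
package main

import (
	"fmt"
	"net/textproto"
)

func main() {
	// The same canonicalization must be applied at upload and list time,
	// or the UserMetadata lookup would silently return "".
	fmt.Println(textproto.CanonicalMIMEHeaderKey("k3s-token-hash")) // K3s-Token-Hash
}
```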
diff --git a/pkg/etcd/snapshot.go b/pkg/etcd/snapshot.go
index 4c710d7b5153..d11a7fb5b0b9 100644
--- a/pkg/etcd/snapshot.go
+++ b/pkg/etcd/snapshot.go
@@ -57,6 +57,7 @@ var (
 	labelStorageNode          = "etcd." + version.Program + ".cattle.io/snapshot-storage-node"
 	annotationLocalReconciled = "etcd." + version.Program + ".cattle.io/local-snapshots-timestamp"
 	annotationS3Reconciled    = "etcd." + version.Program + ".cattle.io/s3-snapshots-timestamp"
+	annotationTokenHash       = "etcd." + version.Program + ".cattle.io/snapshot-token-hash"
 
 	// snapshotDataBackoff will retry at increasing steps for up to ~30 seconds.
 	// If the ConfigMap update fails, the list won't be reconciled again until next time
@@ -252,6 +253,11 @@ func (e *ETCD) Snapshot(ctx context.Context) error {
 		return errors.Wrap(err, "failed to get config for etcd snapshot")
 	}
 
+	tokenHash, err := util.GetTokenHash(e.config)
+	if err != nil {
+		return errors.Wrap(err, "failed to get server token hash for etcd snapshot")
+	}
+
 	nodeName := os.Getenv("NODE_NAME")
 	now := time.Now().Round(time.Second)
 	snapshotName := fmt.Sprintf("%s-%s-%d", e.config.EtcdSnapshotName, nodeName, now.Unix())
@@ -314,6 +320,7 @@ func (e *ETCD) Snapshot(ctx context.Context) error {
 			Size:           f.Size(),
 			Compressed:     e.config.EtcdSnapshotCompress,
 			metadataSource: extraMetadata,
+			tokenHash:      tokenHash,
 		}
 
 		if err := saveSnapshotMetadata(snapshotPath, extraMetadata); err != nil {
@@ -412,6 +419,7 @@ type snapshotFile struct {
 	// to populate other fields before serialization to the legacy configmap.
 	metadataSource *v1.ConfigMap `json:"-"`
 	nodeSource     string        `json:"-"`
+	tokenHash      string        `json:"-"`
 }
 
 // listLocalSnapshots provides a list of the currently stored
@@ -1016,6 +1024,10 @@ func (sf *snapshotFile) fromETCDSnapshotFile(esf *apisv1.ETCDSnapshotFile) {
 		}
 	}
 
+	if tokenHash := esf.Annotations[annotationTokenHash]; tokenHash != "" {
+		sf.tokenHash = tokenHash
+	}
+
 	if esf.Spec.S3 == nil {
 		sf.NodeName = esf.Spec.NodeName
 	} else {
@@ -1080,6 +1092,14 @@ func (sf *snapshotFile) toETCDSnapshotFile(esf *apisv1.ETCDSnapshotFile) {
 		esf.ObjectMeta.Labels = map[string]string{}
 	}
 
+	if esf.ObjectMeta.Annotations == nil {
+		esf.ObjectMeta.Annotations = map[string]string{}
+	}
+
+	if sf.tokenHash != "" {
+		esf.ObjectMeta.Annotations[annotationTokenHash] = sf.tokenHash
+	}
+
 	if sf.S3 == nil {
 		esf.ObjectMeta.Labels[labelStorageNode] = esf.Spec.NodeName
 	} else {
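Because the new `tokenHash` field carries a `json:"-"` tag, it is never serialized with the rest of the snapshotFile; it survives only as the annotation written by `toETCDSnapshotFile` and read back by `fromETCDSnapshotFile`. A trimmed-down model of that round trip follows; the struct and metadata types are simplified stand-ins for the real snapshotFile and ObjectMeta.

```go
package main

import "fmt"

const annotationTokenHash = "etcd.k3s.cattle.io/snapshot-token-hash"

// snapshotFile and objectMeta are minimal stand-ins for the real types.
type snapshotFile struct{ tokenHash string }
type objectMeta struct{ Annotations map[string]string }

// toMeta mirrors toETCDSnapshotFile: lazily create the annotation map and
// record the token hash only when one is present.
func (sf *snapshotFile) toMeta(m *objectMeta) {
	if m.Annotations == nil {
		m.Annotations = map[string]string{}
	}
	if sf.tokenHash != "" {
		m.Annotations[annotationTokenHash] = sf.tokenHash
	}
}

// fromMeta mirrors fromETCDSnapshotFile: restore the hash from the annotation.
func (sf *snapshotFile) fromMeta(m *objectMeta) {
	if th := m.Annotations[annotationTokenHash]; th != "" {
		sf.tokenHash = th
	}
}

func main() {
	in := snapshotFile{tokenHash: "abc123def456"}
	var meta objectMeta
	in.toMeta(&meta)

	var out snapshotFile
	out.fromMeta(&meta)
	fmt.Println(out.tokenHash == in.tokenHash) // true: the hash survives the round trip
}
```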
diff --git a/pkg/util/token.go b/pkg/util/token.go
index a47a4eefd99d..c4d3495af2bd 100644
--- a/pkg/util/token.go
+++ b/pkg/util/token.go
@@ -1,8 +1,16 @@
 package util
 
 import (
+	"bytes"
 	cryptorand "crypto/rand"
+	"crypto/sha256"
 	"encoding/hex"
+	"os"
+	"path/filepath"
+
+	"github.com/k3s-io/k3s/pkg/clientaccess"
+	"github.com/k3s-io/k3s/pkg/daemons/config"
+	"github.com/pkg/errors"
 )
 
 func Random(size int) (string, error) {
@@ -13,3 +21,59 @@ func Random(size int) (string, error) {
 	}
 	return hex.EncodeToString(token), err
 }
+
+// ReadTokenFromFile returns the contents of <data-dir>/token if the file exists.
+// If the file is not found, as on a fresh installation, it instead formats and
+// returns the runtime serverToken saved in memory. Tokens read from the file
+// are stripped of any trailing newline.
+func ReadTokenFromFile(serverToken, certs, dataDir string) (string, error) {
+	tokenFile := filepath.Join(dataDir, "token")
+
+	b, err := os.ReadFile(tokenFile)
+	if err != nil {
+		if os.IsNotExist(err) {
+			token, err := clientaccess.FormatToken(serverToken, certs)
+			if err != nil {
+				return token, err
+			}
+			return token, nil
+		}
+		return "", err
+	}
+
+	// strip any trailing newline from the token read from file
+	return string(bytes.TrimRight(b, "\n")), nil
+}
+
+// NormalizeToken will normalize the token read from file or passed as a CLI flag
+func NormalizeToken(token string) (string, error) {
+	_, password, ok := clientaccess.ParseUsernamePassword(token)
+	if !ok {
+		return password, errors.New("failed to normalize server token; must be in format K10<CA-HASH>::<USERNAME>:<PASSWORD> or <PASSWORD>")
+	}
+
+	return password, nil
+}
+
+// GetTokenHash returns a short hash of the normalized server token.
+func GetTokenHash(config *config.Control) (string, error) {
+	token := config.Token
+	if token == "" {
+		tokenFromFile, err := ReadTokenFromFile(config.Runtime.ServerToken, config.Runtime.ServerCA, config.DataDir)
+		if err != nil {
+			return "", err
+		}
+		token = tokenFromFile
+	}
+	normalizedToken, err := NormalizeToken(token)
+	if err != nil {
+		return "", err
+	}
+	return ShortHash(normalizedToken, 12), nil
+}
+
+// ShortHash returns the first i characters of the hex-encoded sha256 digest of s.
+func ShortHash(s string, i int) string {
+	digest := sha256.Sum256([]byte(s))
+	return hex.EncodeToString(digest[:])[:i]
+}
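With `GetTokenHash` in place, the comparison called for by the ADR reduces to a string equality check on two short hashes. The following is a hypothetical sketch of that restore-time check; the token values and the surrounding control flow are invented for illustration and are not the actual restore code.

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
)

// shortHash mirrors util.ShortHash above.
func shortHash(s string, n int) string {
	digest := sha256.Sum256([]byte(s))
	return hex.EncodeToString(digest[:])[:n]
}

func main() {
	// The hash stored with the snapshot (CR annotation or S3 metadata)...
	snapshotTokenHash := shortHash("token-at-snapshot-time", 12)
	// ...versus the hash of the server's current token at restore time.
	currentTokenHash := shortHash("token-at-restore-time", 12)

	if snapshotTokenHash != currentTokenHash {
		fmt.Println("server token changed since the snapshot was taken; roll the token back before restore")
	}
}
```

Storing only a 12-character hash keeps the token itself out of the CR and out of S3 metadata while still allowing this mismatch detection.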
From 174ff79b5202e9e89f8de3a078e9443ef5d8af18 Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Tue, 10 Oct 2023 17:03:11 +0000
Subject: [PATCH 14/14] Fix etcd snapshot integration tests

Snapshot delete/prune tests were only working because the delete command
would report success even when deleting a snapshot that didn't exist, and
the test regex was finding the snapshot name multiple times in the list
output and deleting it twice.

Snapshot restore tests seem to have expected the deployment to be rolled
out immediately, which is not a reasonable expectation.

Signed-off-by: Brad Davidson
---
 .../etcdrestore/etcd_restore_int_test.go      | 17 +++++++++++------
 .../etcdrestore/testdata/temp_depl.yaml       |  5 ++++-
 .../etcdrestore/testdata/temp_depl2.yaml      |  5 ++++-
 .../etcdsnapshot/etcdsnapshot_int_test.go     | 10 +++++-----
 tests/integration/integration.go              |  3 +++
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/tests/integration/etcdrestore/etcd_restore_int_test.go b/tests/integration/etcdrestore/etcd_restore_int_test.go
index 22bb0f2b6ee5..5ea168d53237 100644
--- a/tests/integration/etcdrestore/etcd_restore_int_test.go
+++ b/tests/integration/etcdrestore/etcd_restore_int_test.go
@@ -41,6 +41,11 @@ var _ = Describe("etcd snapshot restore", Ordered, func() {
 			Expect(result).To(ContainSubstring("deployment.apps/nginx-deployment created"))
 			Expect(err).NotTo(HaveOccurred())
 		})
+		It("make sure workload exists", func() {
+			res, err := testutil.K3sCmd("kubectl", "rollout", "status", "deployment", "nginx-deployment", "--watch=true", "--timeout=360s")
+			Expect(res).To(ContainSubstring("successfully rolled out"))
+			Expect(err).ToNot(HaveOccurred())
+		})
 		It("saves an etcd snapshot", func() {
 			Expect(testutil.K3sCmd("etcd-snapshot", "save", "-d", tmpdDataDir, "--name", "snapshot-to-restore")).
 				To(ContainSubstring("saved"))
@@ -83,15 +88,15 @@ var _ = Describe("etcd snapshot restore", Ordered, func() {
 				return testutil.K3sDefaultDeployments()
 			}, "360s", "5s").Should(Succeed())
 		})
-		It("Make sure Workload 1 exists", func() {
-			Eventually(func() (string, error) {
-				return testutil.K3sCmd("kubectl", "get", "deployment", "nginx-deployment")
-			}, "360s", "5s").Should(ContainSubstring("3/3"))
+		It("make sure workload 1 exists", func() {
+			res, err := testutil.K3sCmd("kubectl", "rollout", "status", "deployment", "nginx-deployment", "--watch=true", "--timeout=360s")
+			Expect(res).To(ContainSubstring("successfully rolled out"))
+			Expect(err).ToNot(HaveOccurred())
 		})
-		It("Make sure Workload 2 does not exists", func() {
+		It("make sure workload 2 does not exist", func() {
 			res, err := testutil.K3sCmd("kubectl", "get", "deployment", "nginx-deployment-post-snapshot")
-			Expect(err).To(HaveOccurred())
 			Expect(res).To(ContainSubstring("not found"))
+			Expect(err).To(HaveOccurred())
 		})
 		It("check if CA cert hash matches", func() {
 			// get md5sum of the CA certs
diff --git a/tests/integration/etcdrestore/testdata/temp_depl.yaml b/tests/integration/etcdrestore/testdata/temp_depl.yaml
index 3649247c1bb1..8e8c564fec83 100644
--- a/tests/integration/etcdrestore/testdata/temp_depl.yaml
+++ b/tests/integration/etcdrestore/testdata/temp_depl.yaml
@@ -6,6 +6,9 @@ metadata:
     app: nginx
 spec:
   replicas: 3
+  revisionHistoryLimit: 0
+  strategy:
+    type: Recreate
   selector:
     matchLabels:
       app: nginx
@@ -18,4 +21,4 @@ spec:
         - name: nginx
           image: nginx:1.14.2
           ports:
-            - containerPort: 80
\ No newline at end of file
+            - containerPort: 80
diff --git a/tests/integration/etcdrestore/testdata/temp_depl2.yaml b/tests/integration/etcdrestore/testdata/temp_depl2.yaml
index 8cea5e6f2d95..c5247a77e75d 100644
--- a/tests/integration/etcdrestore/testdata/temp_depl2.yaml
+++ b/tests/integration/etcdrestore/testdata/temp_depl2.yaml
@@ -6,6 +6,9 @@ metadata:
     app: nginx
 spec:
   replicas: 3
+  revisionHistoryLimit: 0
+  strategy:
+    type: Recreate
   selector:
     matchLabels:
       app: nginx
@@ -18,4 +21,4 @@ spec:
         - name: nginx
           image: nginx:1.14.2
           ports:
-            - containerPort: 80
\ No newline at end of file
+            - containerPort: 80
diff --git a/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go b/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go
index ee5ec6b049a7..1d7c9b5ea21b 100644
--- a/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go
+++ b/tests/integration/etcdsnapshot/etcdsnapshot_int_test.go
@@ -54,7 +54,7 @@ var _ = Describe("etcd snapshots", Ordered, func() {
 		It("deletes a snapshot", func() {
 			lsResult, err := testutil.K3sCmd("etcd-snapshot", "ls")
 			Expect(err).ToNot(HaveOccurred())
-			reg, err := regexp.Compile(`on-demand[^\s]+`)
+			reg, err := regexp.Compile(`(?m)^on-demand[^\s]+`)
 			Expect(err).ToNot(HaveOccurred())
 			snapshotName := reg.FindString(lsResult)
 			Expect(testutil.K3sCmd("etcd-snapshot", "delete", snapshotName)).
@@ -69,7 +69,7 @@ var _ = Describe("etcd snapshots", Ordered, func() {
 		It("deletes that snapshot", func() {
 			lsResult, err := testutil.K3sCmd("etcd-snapshot", "ls")
 			Expect(err).ToNot(HaveOccurred())
-			reg, err := regexp.Compile(`ALIVEBEEF[^\s]+`)
+			reg, err := regexp.Compile(`(?m)^ALIVEBEEF[^\s]+`)
 			Expect(err).ToNot(HaveOccurred())
 			snapshotName := reg.FindString(lsResult)
 			Expect(testutil.K3sCmd("etcd-snapshot", "delete", snapshotName)).
@@ -91,7 +91,7 @@ var _ = Describe("etcd snapshots", Ordered, func() { It("lists all 3 snapshots", func() { lsResult, err := testutil.K3sCmd("etcd-snapshot", "ls") Expect(err).ToNot(HaveOccurred()) - reg, err := regexp.Compile(`:///var/lib/rancher/k3s/server/db/snapshots/PRUNE_TEST`) + reg, err := regexp.Compile(`(?m):///var/lib/rancher/k3s/server/db/snapshots/PRUNE_TEST`) Expect(err).ToNot(HaveOccurred()) sepLines := reg.FindAllString(lsResult, -1) Expect(sepLines).To(HaveLen(3)) @@ -101,7 +101,7 @@ var _ = Describe("etcd snapshots", Ordered, func() { To(ContainSubstring("Removing local snapshot")) lsResult, err := testutil.K3sCmd("etcd-snapshot", "ls") Expect(err).ToNot(HaveOccurred()) - reg, err := regexp.Compile(`:///var/lib/rancher/k3s/server/db/snapshots/PRUNE_TEST`) + reg, err := regexp.Compile(`(?m):///var/lib/rancher/k3s/server/db/snapshots/PRUNE_TEST`) Expect(err).ToNot(HaveOccurred()) sepLines := reg.FindAllString(lsResult, -1) Expect(sepLines).To(HaveLen(2)) @@ -109,7 +109,7 @@ var _ = Describe("etcd snapshots", Ordered, func() { It("cleans up remaining snapshots", func() { lsResult, err := testutil.K3sCmd("etcd-snapshot", "ls") Expect(err).ToNot(HaveOccurred()) - reg, err := regexp.Compile(`PRUNE_TEST[^\s]+`) + reg, err := regexp.Compile(`(?m)^PRUNE_TEST[^\s]+`) Expect(err).ToNot(HaveOccurred()) for _, snapshotName := range reg.FindAllString(lsResult, -1) { Expect(testutil.K3sCmd("etcd-snapshot", "delete", snapshotName)). diff --git a/tests/integration/integration.go b/tests/integration/integration.go index 5f7f9ee74a80..2aea49de0ab1 100644 --- a/tests/integration/integration.go +++ b/tests/integration/integration.go @@ -280,6 +280,9 @@ func K3sStopServer(server *K3sServer) error { // K3sKillServer terminates the running K3s server and its children. // Equivalent to k3s-killall.sh func K3sKillServer(server *K3sServer) error { + if server == nil { + return nil + } if server.log != nil { server.log.Close() os.Remove(server.log.Name())