Skip to content

Commit

Permalink
fix: audit and fix cgroup reservations
Browse files Browse the repository at this point in the history
Fixes: #7081

Review all reservations and limits set, test under stress load (using
both memory and CPU).

The goal: system components (Talos itself) and runtime (kubelet, CRI)
should survive under extreme resource starvation (workloads consuming
all CPU/memory).

Uses #9337 to visualize changes, but doesn't depend on it.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
  • Loading branch information
smira committed Sep 20, 2024
1 parent 32b5d01 commit 6b15ca1
Show file tree
Hide file tree
Showing 15 changed files with 337 additions and 42 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ require (
github.com/gizak/termui/v3 v3.1.0
github.com/godbus/dbus/v5 v5.1.0
github.com/golang/mock v1.6.0
github.com/google/cadvisor v0.50.0
github.com/google/go-containerregistry v0.20.2
github.com/google/go-tpm v0.9.1
github.com/google/nftables v0.2.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,8 @@ github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6
github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU=
github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/cadvisor v0.50.0 h1:7w/hKIbJKBWqQsRTy+Hpj2vj+fnxrLXcEXFy+LW0Bsg=
github.com/google/cadvisor v0.50.0/go.mod h1:VxCDwZalpFyENvmfabFqaIGsqNKLtDzE62a19rfVTB8=
github.com/google/cel-go v0.21.0 h1:cl6uW/gxN+Hy50tNYvI691+sXxioCnstFzLp2WO4GCI=
github.com/google/cel-go v0.21.0/go.mod h1:rHUlWCcBKgyEk+eV03RPdZUekPp6YcJwV0FxuUksYxc=
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
Expand Down
28 changes: 25 additions & 3 deletions internal/app/machined/pkg/controllers/k8s/kubelet_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ import (
v1alpha1runtime "github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/pkg/cgroup"
"github.com/siderolabs/talos/pkg/argsbuilder"
"github.com/siderolabs/talos/pkg/machinery/config/machine"
"github.com/siderolabs/talos/pkg/machinery/constants"
"github.com/siderolabs/talos/pkg/machinery/kubelet"
"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
)

Expand Down Expand Up @@ -63,6 +65,12 @@ func (ctrl *KubeletSpecController) Inputs() []controller.Input {
ID: optional.Some(k8s.KubeletID),
Kind: controller.InputWeak,
},
{
Namespace: config.NamespaceName,
Type: config.MachineTypeType,
ID: optional.Some(config.MachineTypeID),
Kind: controller.InputWeak,
},
}
}

Expand Down Expand Up @@ -100,6 +108,15 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime

kubeletVersion := compatibility.VersionFromImageRef(cfgSpec.Image)

machineType, err := safe.ReaderGetByID[*config.MachineType](ctx, r, config.MachineTypeID)
if err != nil {
if state.IsNotFoundError(err) {
continue
}

return fmt.Errorf("error getting machine type: %w", err)
}

nodename, err := safe.ReaderGetByID[*k8s.Nodename](ctx, r, k8s.NodenameID)
if err != nil {
if state.IsNotFoundError(err) {
Expand Down Expand Up @@ -173,7 +190,7 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime
args["image-credential-provider-config"] = constants.KubeletCredentialProviderConfig
}

kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion)
kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion, machineType.MachineType())
if err != nil {
return fmt.Errorf("error creating kubelet configuration: %w", err)
}
Expand Down Expand Up @@ -242,7 +259,7 @@ func prepareExtraConfig(extraConfig map[string]any) (*kubeletconfig.KubeletConfi
// NewKubeletConfiguration builds kubelet configuration with defaults and overrides from extraConfig.
//
//nolint:gocyclo,cyclop
func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version) (*kubeletconfig.KubeletConfiguration, error) {
func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version, machineType machine.Type) (*kubeletconfig.KubeletConfiguration, error) {
config, err := prepareExtraConfig(cfgSpec.ExtraConfig)
if err != nil {
return nil, err
Expand Down Expand Up @@ -333,10 +350,15 @@ func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion comp
if len(config.SystemReserved) == 0 {
config.SystemReserved = map[string]string{
"cpu": constants.KubeletSystemReservedCPU,
"memory": constants.KubeletSystemReservedMemory,
"pid": constants.KubeletSystemReservedPid,
"ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage,
}

if machineType.IsControlPlane() {
config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane
} else {
config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryWorker
}
}

if config.Logging.Format == "" {
Expand Down
55 changes: 48 additions & 7 deletions internal/app/machined/pkg/controllers/k8s/kubelet_spec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ import (

"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/ctest"
k8sctrl "github.com/siderolabs/talos/internal/app/machined/pkg/controllers/k8s"
"github.com/siderolabs/talos/pkg/machinery/config/machine"
"github.com/siderolabs/talos/pkg/machinery/constants"
"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
)

Expand Down Expand Up @@ -60,6 +62,10 @@ func (suite *KubeletSpecSuite) TestReconcileDefault() {

suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))

machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))

rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()

Expand Down Expand Up @@ -97,6 +103,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() {

suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))

machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))

rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()

Expand All @@ -114,7 +124,7 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() {
})
}

func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() {
func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEndpointFlag() {
cfg := k8s.NewKubeletConfig(k8s.NamespaceName, k8s.KubeletID)
cfg.TypedSpec().Image = "kubelet:v1.25.0"
cfg.TypedSpec().ClusterDNS = []string{"10.96.0.10"}
Expand All @@ -128,6 +138,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() {

suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))

machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))

rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()

Expand Down Expand Up @@ -180,6 +194,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExtraConfig() {

suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP))

machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))

rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()

Expand Down Expand Up @@ -219,6 +237,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithSkipNodeRegistration() {

suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP))

machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))

rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()

Expand Down Expand Up @@ -307,7 +329,7 @@ func TestNewKubeletConfigurationFail(t *testing.T) {
tt.name, func(t *testing.T) {
t.Parallel()

_, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef(""))
_, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef(""), machine.TypeWorker)
require.Error(t, err)

assert.EqualError(t, err, tt.expectedErr)
Expand Down Expand Up @@ -352,7 +374,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
FailSwapOn: pointer.To(false),
SystemReserved: map[string]string{
"cpu": constants.KubeletSystemReservedCPU,
"memory": constants.KubeletSystemReservedMemory,
"memory": constants.KubeletSystemReservedMemoryWorker,
"pid": constants.KubeletSystemReservedPid,
"ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage,
},
Expand All @@ -373,6 +395,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
cfgSpec *k8s.KubeletConfigSpec
kubeletVersion compatibility.Version
expectedOverrides func(*kubeletconfig.KubeletConfiguration)
machineType machine.Type
}{
{
name: "override some",
Expand All @@ -389,6 +412,19 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
kc.OOMScoreAdj = pointer.To[int32](-300)
kc.EnableDebuggingHandlers = pointer.To(true)
},
machineType: machine.TypeWorker,
},
{
name: "controlplane",
cfgSpec: &k8s.KubeletConfigSpec{
ClusterDNS: []string{"10.0.0.5"},
ClusterDomain: "cluster.local",
},
kubeletVersion: compatibility.VersionFromImageRef("ghcr.io/siderolabs/kubelet:v1.29.0"),
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
kc.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane
},
machineType: machine.TypeControlPlane,
},
{
name: "disable graceful shutdown",
Expand All @@ -405,6 +441,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
kc.ShutdownGracePeriod = metav1.Duration{}
kc.ShutdownGracePeriodCriticalPods = metav1.Duration{}
},
machineType: machine.TypeWorker,
},
{
name: "enable seccomp default",
Expand All @@ -417,6 +454,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
kc.SeccompDefault = pointer.To(true)
},
machineType: machine.TypeWorker,
},
{
name: "enable skipNodeRegistration",
Expand All @@ -430,6 +468,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
kc.Authentication.Webhook.Enabled = pointer.To(false)
kc.Authorization.Mode = kubeletconfig.KubeletAuthorizationModeAlwaysAllow
},
machineType: machine.TypeWorker,
},
{
name: "disable manifests directory",
Expand All @@ -442,6 +481,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
kc.StaticPodPath = ""
},
machineType: machine.TypeWorker,
},
{
name: "enable local FS quota monitoring",
Expand All @@ -456,19 +496,20 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
"LocalStorageCapacityIsolationFSQuotaMonitoring": true,
}
},
machineType: machine.TypeWorker,
},
} {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

expected := defaultKubeletConfig
tt.expectedOverrides(&expected)
expected := defaultKubeletConfig.DeepCopy()
tt.expectedOverrides(expected)

config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion)
config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion, tt.machineType)

require.NoError(t, err)

assert.Equal(t, &expected, config)
assert.Equal(t, expected, config)
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupInitReservedMemory),
Low: pointer.To[int64](constants.CgroupInitReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupInitMillicores))),
},
},
},
{
Expand All @@ -191,15 +194,42 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupSystemReservedMemory),
Low: pointer.To[int64](constants.CgroupSystemReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemMillicores))),
},
},
},
{
name: constants.CgroupSystemRuntime,
resources: &cgroup2.Resources{},
name: constants.CgroupSystemRuntime,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory),
Low: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemRuntimeMillicores))),
},
},
},
{
name: constants.CgroupUdevd,
resources: &cgroup2.Resources{},
name: constants.CgroupUdevd,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupUdevdReservedMemory),
Low: pointer.To[int64](constants.CgroupUdevdReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupUdevdMillicores))),
},
},
},
{
name: constants.CgroupPodRuntimeRoot,
resources: &cgroup2.Resources{
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeRootMillicores))),
},
},
},
{
name: constants.CgroupPodRuntime,
Expand All @@ -208,6 +238,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory),
Low: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeMillicores))),
},
},
},
{
Expand All @@ -217,14 +250,45 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupKubeletReservedMemory),
Low: pointer.To[int64](constants.CgroupKubeletReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupKubeletMillicores))),
},
},
},
{
name: constants.CgroupDashboard,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupDashboardReservedMemory),
Low: pointer.To[int64](constants.CgroupDashboardLowMemory),
Max: pointer.To[int64](constants.CgroupDashboardMaxMemory),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupDashboardMillicores))),
},
},
},
{
name: constants.CgroupApid,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupApidReservedMemory),
Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupApidMaxMemory),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupApidMillicores))),
},
},
},
{
name: constants.CgroupTrustd,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupTrustdReservedMemory),
Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupTrustdMaxMemory),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupTrustdMillicores))),
},
},
},
Expand Down
Loading

0 comments on commit 6b15ca1

Please sign in to comment.