diff --git a/hack/release.toml b/hack/release.toml index aecd2fc27f..d3c8580d58 100644 --- a/hack/release.toml +++ b/hack/release.toml @@ -172,6 +172,12 @@ Talos Linux now bundles by default the following standard CNI plugins: * `portmap` The Talos bundled Flannel manifest was simplified to remove the `install-cni` step. +""" + + [notes.udevd] + title = "Device Extra Settle Timeout" + description = """\ +Talos Linux now supports a kernel command line argument `talos.device.settle_time=3m` to set the device extra settle timeout to work around issues with broken drivers. """ [make_deps] diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go index 869812c7f7..ab4fe1b6c0 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go @@ -662,11 +662,25 @@ func StartUdevd(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) { return err } - svc := &services.Udevd{} + var extraSettleTime time.Duration + + settleTimeStr := procfs.ProcCmdline().Get(constants.KernelParamDeviceSettleTime).First() + if settleTimeStr != nil { + extraSettleTime, err = time.ParseDuration(*settleTimeStr) + if err != nil { + return fmt.Errorf("failed to parse %s: %w", constants.KernelParamDeviceSettleTime, err) + } + + logger.Printf("extra settle time: %s", extraSettleTime) + } + + svc := &services.Udevd{ + ExtraSettleTime: extraSettleTime, + } system.Services(r).LoadAndStart(svc) - ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) defer cancel() return system.WaitForService(system.StateEventUp, svc.ID(r)).Wait(ctx) diff --git a/internal/app/machined/pkg/system/services/udevd.go b/internal/app/machined/pkg/system/services/udevd.go index 95fee14e1a..d9e9fb684f 100644 --- a/internal/app/machined/pkg/system/services/udevd.go +++ 
b/internal/app/machined/pkg/system/services/udevd.go @@ -6,6 +6,7 @@ package services import ( "context" + "fmt" "time" "github.com/siderolabs/go-cmd/pkg/cmd" @@ -26,7 +27,10 @@ var _ system.HealthcheckedService = (*Udevd)(nil) // Udevd implements the Service interface. It serves as the concrete type with // the required methods. type Udevd struct { - triggered bool + ExtraSettleTime time.Duration + + triggered bool + extraSettleStart time.Time } // ID implements the Service interface. @@ -96,6 +100,8 @@ func (c *Udevd) Runner(r runtime.Runtime) (runner.Runner, error) { } // HealthFunc implements the HealthcheckedService interface. +// +//nolint:gocyclo func (c *Udevd) HealthFunc(runtime.Runtime) health.Check { return func(ctx context.Context) error { // checking for the existence of the udev control socket is a faster way to check @@ -107,7 +113,7 @@ func (c *Udevd) HealthFunc(runtime.Runtime) health.Check { // udevadm trigger returns with an exit code of 0 even if udevd is not fully running, // so running `udevadm control --reload` to ensure that udevd is fully initialized - // which returns an exit code of 2 if udevd is not running. This complementes the previous check + // which returns an exit code of 2 if udevd is not running. This complements the previous check if _, err := cmd.RunContext(ctx, "/sbin/udevadm", "control", "--reload"); err != nil { return err } @@ -128,8 +134,38 @@ func (c *Udevd) HealthFunc(runtime.Runtime) health.Check { // `udevd trigger`, to prevent a race condition when a user specifies a path // under `/dev/disk/*` in any disk definitions. _, err := cmd.RunContext(ctx, "/sbin/udevadm", "settle", "--timeout=50") // timeout here should be less than health.Settings.Timeout + if err != nil { + return err + } + + // If we got to the point where everything is settled, and the healthcheck would report + // success, we start the extra settle timer. 
+ if c.extraSettleStart.IsZero() { + c.extraSettleStart = time.Now() + } + + // Wait for c.ExtraSettleTime before returning success (if configured). + if c.ExtraSettleTime <= 0 { + return nil + } + + settleEnd := c.extraSettleStart.Add(c.ExtraSettleTime) + + if time.Now().After(settleEnd) { + return nil + } + + // Can the extra settle time elapse before the health check deadline? + if deadline, ok := ctx.Deadline(); ok { + // if settleEnd is before the deadline, sleep until settleEnd and report success + if settleEnd.Before(deadline) { + time.Sleep(time.Until(settleEnd)) + + return nil + } + } - return err + return fmt.Errorf("waiting for udevd for extra settle timeout") } } diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index ce39a13604..c29bd7f92a 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -67,6 +67,10 @@ const ( // disk to wipe on the next boot and reboot. KernelParamWipe = "talos.experimental.wipe" + // KernelParamDeviceSettleTime is the kernel parameter name for specifying the + // extra device settle timeout. + KernelParamDeviceSettleTime = "talos.device.settle_time" + // KernelParamCGroups is the kernel parameter name for specifying the // cgroups version to use (default is cgroupsv2, setting this kernel arg to '0' forces cgroupsv1). KernelParamCGroups = "talos.unified_cgroup_hierarchy" diff --git a/website/content/v1.8/reference/kernel.md b/website/content/v1.8/reference/kernel.md index 790f0bf75d..6dbd4acf84 100644 --- a/website/content/v1.8/reference/kernel.md +++ b/website/content/v1.8/reference/kernel.md @@ -245,3 +245,17 @@ Example: ```text talos.environment=http_proxy=http://proxy.example.com:8080 talos.environment=https_proxy=http://proxy.example.com:8080 ``` + +#### `talos.device.settle_time` + +The time in Go duration format to wait for devices to settle before starting the boot process. 
+By default, Talos waits for `udevd` to scan and settle, but with some RAID controllers `udevd` might +report settled devices before they are actually ready. +Adding this kernel argument provides extra settle time on top of `udevd` settle time. +The maximum value is `10m` (10 minutes). + +Example: + +```text +talos.device.settle_time=3m +```