Skip to content

Commit

Permalink
feat: provide device extra settle timeout
Browse files Browse the repository at this point in the history
Fixes #9092

This is a workaround for broken hardware drivers (e.g. for RAID
controllers) which report the settled event too early.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
  • Loading branch information
smira committed Aug 14, 2024
1 parent 9e348ef commit 3c36c41
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 5 deletions.
6 changes: 6 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,12 @@ Talos Linux now bundles by default the following standard CNI plugins:
* `portmap`
The Talos bundled Flannel manifest was simplified to remove the `install-cni` step.
"""

[notes.udevd]
title = "Device Extra Settle Timeout"
description = """\
Talos Linux now supports a kernel command line argument `talos.device.settle_time=3m` to set the device extra settle timeout to work around issues with broken drivers.
"""

[make_deps]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -662,11 +662,25 @@ func StartUdevd(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) {
return err
}

svc := &services.Udevd{}
var extraSettleTime time.Duration

settleTimeStr := procfs.ProcCmdline().Get(constants.KernelParamDeviceSettleTime).First()
if settleTimeStr != nil {
extraSettleTime, err = time.ParseDuration(*settleTimeStr)
if err != nil {
return fmt.Errorf("failed to parse %s: %w", constants.KernelParamDeviceSettleTime, err)
}

logger.Printf("extra settle time: %s", extraSettleTime)
}

svc := &services.Udevd{
ExtraSettleTime: extraSettleTime,
}

system.Services(r).LoadAndStart(svc)

ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
defer cancel()

return system.WaitForService(system.StateEventUp, svc.ID(r)).Wait(ctx)
Expand Down
42 changes: 39 additions & 3 deletions internal/app/machined/pkg/system/services/udevd.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package services

import (
"context"
"fmt"
"time"

"github.com/siderolabs/go-cmd/pkg/cmd"
Expand All @@ -26,7 +27,10 @@ var _ system.HealthcheckedService = (*Udevd)(nil)
// Udevd implements the Service interface. It serves as the concrete type with
// the required methods.
type Udevd struct {
triggered bool
ExtraSettleTime time.Duration

triggered bool
extraSettleStart time.Time
}

// ID implements the Service interface.
Expand Down Expand Up @@ -96,6 +100,8 @@ func (c *Udevd) Runner(r runtime.Runtime) (runner.Runner, error) {
}

// HealthFunc implements the HealthcheckedService interface.
//
//nolint:gocyclo
func (c *Udevd) HealthFunc(runtime.Runtime) health.Check {
return func(ctx context.Context) error {
// checking for the existence of the udev control socket is a faster way to check
Expand All @@ -107,7 +113,7 @@ func (c *Udevd) HealthFunc(runtime.Runtime) health.Check {

// udevadm trigger returns with an exit code of 0 even if udevd is not fully running,
// so running `udevadm control --reload` to ensure that udevd is fully initialized
// which returns an exit code of 2 if udevd is not running. This complementes the previous check
// which returns an exit code of 2 if udevd is not running. This complements the previous check
if _, err := cmd.RunContext(ctx, "/sbin/udevadm", "control", "--reload"); err != nil {
return err
}
Expand All @@ -128,8 +134,38 @@ func (c *Udevd) HealthFunc(runtime.Runtime) health.Check {
// `udevd trigger`, to prevent a race condition when a user specifies a path
// under `/dev/disk/*` in any disk definitions.
_, err := cmd.RunContext(ctx, "/sbin/udevadm", "settle", "--timeout=50") // timeout here should be less than health.Settings.Timeout
if err != nil {
return err
}

// If we got to the point where everything is settled, and the healthcheck would report
// success, we start the extra settle timer.
if c.extraSettleStart.IsZero() {
c.extraSettleStart = time.Now()
}

// Wait for c.ExtraSettleTime before returning success (if configured).
if c.ExtraSettleTime <= 0 {
return nil
}

settleEnd := c.extraSettleStart.Add(c.ExtraSettleTime)

if time.Now().After(settleEnd) {
return nil
}

// Can we wait until the health check deadline?
if deadline, ok := ctx.Deadline(); ok {
// if settleEnd is before the deadline, we can sleep until settleEnd and report success
if settleEnd.Before(deadline) {
time.Sleep(time.Until(settleEnd))

return nil
}
}

return err
return fmt.Errorf("waiting for udevd for extra settle timeout")
}
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/machinery/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ const (
// disk to wipe on the next boot and reboot.
KernelParamWipe = "talos.experimental.wipe"

// KernelParamDeviceSettleTime is the kernel parameter name for specifying the
// extra device settle timeout.
KernelParamDeviceSettleTime = "talos.device.settle_time"

// KernelParamCGroups is the kernel parameter name for specifying the
// cgroups version to use (default is cgroupsv2, setting this kernel arg to '0' forces cgroupsv1).
KernelParamCGroups = "talos.unified_cgroup_hierarchy"
Expand Down
14 changes: 14 additions & 0 deletions website/content/v1.8/reference/kernel.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,17 @@ Example:
```text
talos.environment=http_proxy=http://proxy.example.com:8080 talos.environment=https_proxy=http://proxy.example.com:8080
```

#### `talos.device.settle_time`

The extra time, in Go duration format (e.g. `30s`, `3m`), to wait for devices to settle before continuing the boot process.
By default, Talos waits for `udevd` to scan and settle, but with some RAID controllers `udevd` might
report settled devices before they are actually ready.
Adding this kernel argument provides extra settle time on top of `udevd` settle time.
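The value must be below `10m` (10 minutes), since Talos waits at most 10 minutes in total for `udevd` to start and settle, including this extra settle time.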

Example:

```text
talos.device.settle_time=3m
```

0 comments on commit 3c36c41

Please sign in to comment.