From c5b59df6976095aca5c4bac367084874242e9e80 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Tue, 23 Apr 2024 15:05:14 +0400 Subject: [PATCH] fix: wait for devices to be discovered before probing filesystems With Talos 1.7+, more storage drivers are split out into modules, so the devices might not be discovered by the time the platform config is loaded. Explicitly wait for udevd to settle down before trying to probe a CD. Fixes #8625 Signed-off-by: Andrey Smirnov --- .../platform/internal/netutils/netutils.go | 22 ++++++++++-- .../runtime/v1alpha1/platform/metal/metal.go | 8 +++-- .../v1alpha1/platform/nocloud/metadata.go | 9 +++-- .../v1alpha1/platform/nocloud/nocloud_test.go | 6 ++++ .../v1alpha1/platform/opennebula/metadata.go | 9 ++++- .../platform/opennebula/opennebula.go | 4 +-- .../platform/opennebula/opennebula_test.go | 2 +- .../v1alpha1/platform/openstack/metadata.go | 9 ++++- .../v1alpha1/platform/openstack/openstack.go | 4 +-- pkg/machinery/resources/runtime/condition.go | 36 ++++++++++++++++++- 10 files changed, 95 insertions(+), 14 deletions(-) diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/internal/netutils/netutils.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/internal/netutils/netutils.go index a3f93778db..c358e99aee 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/internal/netutils/netutils.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/internal/netutils/netutils.go @@ -18,6 +18,7 @@ import ( "github.com/siderolabs/talos/pkg/machinery/constants" "github.com/siderolabs/talos/pkg/machinery/resources/network" + "github.com/siderolabs/talos/pkg/machinery/resources/runtime" ) // Wait for the network to be ready to interact with platform metadata services. 
@@ -39,11 +40,21 @@ func WaitInterfaces(ctx context.Context, r state.State) error { return fmt.Errorf("error listing host interfaces: %w", err) } - if hostInterfaces.Len() != 0 { + numPhysical := 0 + + for iter := hostInterfaces.Iterator(); iter.Next(); { + iface := iter.Value() + + if iface.TypedSpec().Physical() { + numPhysical++ + } + } + + if numPhysical > 0 { return nil } - log.Printf("waiting for network interface appearse...") + log.Printf("waiting for physical network interfaces to appear...") interval := backoff.NextBackOff() @@ -57,6 +68,13 @@ func WaitInterfaces(ctx context.Context, r state.State) error { return nil } +// WaitForDevicesReady waits for devices to be ready. +func WaitForDevicesReady(ctx context.Context, r state.State) error { + log.Printf("waiting for devices to be ready...") + + return runtime.NewDevicesStatusCondition(r).Wait(ctx) +} + // RetryFetch retries fetching from metadata service. func RetryFetch(ctx context.Context, f func(ctx context.Context) (string, error)) (string, error) { var ( diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/metal/metal.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/metal/metal.go index 03d6dd6e78..6ed75980a3 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/metal/metal.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/metal/metal.go @@ -76,7 +76,7 @@ func (m *Metal) Configuration(ctx context.Context, r state.State) ([]byte, error switch *option { case constants.MetalConfigISOLabel: - return readConfigFromISO() + return readConfigFromISO(ctx, r) default: if err := netutils.Wait(ctx, r); err != nil { return nil, err @@ -119,7 +119,11 @@ func (m *Metal) Mode() runtime.Mode { return runtime.ModeMetal } -func readConfigFromISO() ([]byte, error) { +func readConfigFromISO(ctx context.Context, r state.State) ([]byte, error) { + if err := netutils.WaitForDevicesReady(ctx, r); err != nil { + return nil, fmt.Errorf("failed to wait for devices: %w", err) + } + dev, 
err := probe.GetDevWithFileSystemLabel(constants.MetalConfigISOLabel) if err != nil { return nil, fmt.Errorf("failed to find %s iso: %w", constants.MetalConfigISOLabel, err) diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/metadata.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/metadata.go index ac18db71d4..7f44559d18 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/metadata.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/metadata.go @@ -144,7 +144,12 @@ func (n *Nocloud) configFromNetwork(ctx context.Context, metaBaseURL string, r s return metaConfig, networkConfig, machineConfig, err } -func (n *Nocloud) configFromCD() (metaConfig []byte, networkConfig []byte, machineConfig []byte, err error) { +//nolint:gocyclo +func (n *Nocloud) configFromCD(ctx context.Context, r state.State) (metaConfig []byte, networkConfig []byte, machineConfig []byte, err error) { + if err := netutils.WaitForDevicesReady(ctx, r); err != nil { + return nil, nil, nil, fmt.Errorf("failed to wait for devices: %w", err) + } + var dev *probe.ProbedBlockDevice dev, err = probe.GetDevWithFileSystemLabel(strings.ToLower(configISOLabel)) @@ -244,7 +249,7 @@ func (n *Nocloud) acquireConfig(ctx context.Context, r state.State) (metadataCon if networkSource && metaBaseURL != "" { metadataConfigDl, metadataNetworkConfigDl, machineConfigDl, err = n.configFromNetwork(ctx, metaBaseURL, r) } else { - metadataConfigDl, metadataNetworkConfigDl, machineConfigDl, err = n.configFromCD() + metadataConfigDl, metadataNetworkConfigDl, machineConfigDl, err = n.configFromCD(ctx, r) } metadata = &MetadataConfig{} diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/nocloud_test.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/nocloud_test.go index 956942817c..9a06cecbad 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/nocloud_test.go +++ 
b/internal/app/machined/pkg/runtime/v1alpha1/platform/nocloud/nocloud_test.go @@ -58,14 +58,20 @@ func TestParseMetadata(t *testing.T) { eth0 := network.NewLinkStatus(network.NamespaceName, "eth0") eth0.TypedSpec().PermanentAddr = nethelpers.HardwareAddr{0x68, 0x05, 0xca, 0xb8, 0xf1, 0xf7} + eth0.TypedSpec().Type = nethelpers.LinkEther + eth0.TypedSpec().Kind = "" require.NoError(t, st.Create(context.TODO(), eth0)) eth1 := network.NewLinkStatus(network.NamespaceName, "eth1") eth1.TypedSpec().PermanentAddr = nethelpers.HardwareAddr{0x68, 0x05, 0xca, 0xb8, 0xf1, 0xf8} + eth1.TypedSpec().Type = nethelpers.LinkEther + eth1.TypedSpec().Kind = "" require.NoError(t, st.Create(context.TODO(), eth1)) eth2 := network.NewLinkStatus(network.NamespaceName, "eth2") eth2.TypedSpec().PermanentAddr = nethelpers.HardwareAddr{0x68, 0x05, 0xca, 0xb8, 0xf1, 0xf9} + eth2.TypedSpec().Type = nethelpers.LinkEther + eth2.TypedSpec().Kind = "" require.NoError(t, st.Create(context.TODO(), eth2)) var m nocloud.NetworkConfig diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/metadata.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/metadata.go index d7e9d22f01..6e7b02ddeb 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/metadata.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/metadata.go @@ -6,17 +6,20 @@ package opennebula import ( + "context" "fmt" "log" "os" "path/filepath" "strings" + "github.com/cosi-project/runtime/pkg/state" "github.com/siderolabs/go-blockdevice/blockdevice/filesystem" "github.com/siderolabs/go-blockdevice/blockdevice/probe" "golang.org/x/sys/unix" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform/errors" + "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform/internal/netutils" ) const ( @@ -25,7 +28,11 @@ const ( mnt = "/mnt" ) -func (o *OpenNebula) contextFromCD() (oneContext []byte, err error) { +func (o *OpenNebula) 
contextFromCD(ctx context.Context, r state.State) (oneContext []byte, err error) { + if err := netutils.WaitForDevicesReady(ctx, r); err != nil { + return nil, fmt.Errorf("failed to wait for devices: %w", err) + } + var dev *probe.ProbedBlockDevice dev, err = probe.GetDevWithFileSystemLabel(strings.ToLower(configISOLabel)) diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula.go index 2ce3625721..306efbc3f0 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula.go @@ -196,7 +196,7 @@ func (o *OpenNebula) ParseMetadata(st state.State, oneContextPlain []byte) (*run // Configuration implements the runtime.Platform interface. func (o *OpenNebula) Configuration(ctx context.Context, r state.State) (machineConfig []byte, err error) { - oneContextPlain, err := o.contextFromCD() + oneContextPlain, err := o.contextFromCD(ctx, r) if err != nil { return nil, err } @@ -234,7 +234,7 @@ func (o *OpenNebula) KernelArgs(string) procfs.Parameters { // NetworkConfiguration implements the runtime.Platform interface. 
func (o *OpenNebula) NetworkConfiguration(ctx context.Context, st state.State, ch chan<- *runtime.PlatformNetworkConfig) error { - oneContext, err := o.contextFromCD() + oneContext, err := o.contextFromCD(ctx, st) if stderrors.Is(err, errors.ErrNoConfigSource) { err = nil } diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula_test.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula_test.go index bc7233997e..b0a73e810f 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula_test.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula/opennebula_test.go @@ -1,7 +1,7 @@ // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -// go test -v ./internal/app/machined/pkg/runtime/v1alpha1/platform/opennebula + package opennebula_test import ( diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/metadata.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/metadata.go index 093c007021..7a2ed0b62d 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/metadata.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/metadata.go @@ -12,11 +12,13 @@ import ( "os" "path/filepath" + "github.com/cosi-project/runtime/pkg/state" "github.com/siderolabs/go-blockdevice/blockdevice/filesystem" "github.com/siderolabs/go-blockdevice/blockdevice/probe" "golang.org/x/sys/unix" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform/errors" + "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform/internal/netutils" "github.com/siderolabs/talos/pkg/download" ) @@ -108,7 +110,12 @@ func (o *Openstack) configFromNetwork(ctx context.Context) (metaConfig []byte, n return metaConfig, networkConfig, machineConfig, err } -func (o 
*Openstack) configFromCD() (metaConfig []byte, networkConfig []byte, machineConfig []byte, err error) { +//nolint:gocyclo +func (o *Openstack) configFromCD(ctx context.Context, r state.State) (metaConfig []byte, networkConfig []byte, machineConfig []byte, err error) { + if err := netutils.WaitForDevicesReady(ctx, r); err != nil { + return nil, nil, nil, fmt.Errorf("failed to wait for devices: %w", err) + } + var dev *probe.ProbedBlockDevice dev, err = probe.GetDevWithFileSystemLabel(configISOLabel) diff --git a/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/openstack.go b/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/openstack.go index 21b56d13fe..3b1a6d1351 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/openstack.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/platform/openstack/openstack.go @@ -351,7 +351,7 @@ func (o *Openstack) ParseMetadata( // Configuration implements the runtime.Platform interface. func (o *Openstack) Configuration(ctx context.Context, r state.State) (machineConfig []byte, err error) { - _, _, machineConfig, err = o.configFromCD() + _, _, machineConfig, err = o.configFromCD(ctx, r) if err != nil { if err = netutils.Wait(ctx, r); err != nil { return nil, err @@ -389,7 +389,7 @@ func (o *Openstack) KernelArgs(string) procfs.Parameters { func (o *Openstack) NetworkConfiguration(ctx context.Context, st state.State, ch chan<- *runtime.PlatformNetworkConfig) error { networkSource := false - metadataConfigDl, metadataNetworkConfigDl, _, err := o.configFromCD() + metadataConfigDl, metadataNetworkConfigDl, _, err := o.configFromCD(ctx, st) if err != nil { metadataConfigDl, metadataNetworkConfigDl, _, err = o.configFromNetwork(ctx) if stderrors.Is(err, errors.ErrNoConfigSource) { diff --git a/pkg/machinery/resources/runtime/condition.go b/pkg/machinery/resources/runtime/condition.go index cd995aadac..979672a3ea 100644 --- a/pkg/machinery/resources/runtime/condition.go +++ 
b/pkg/machinery/resources/runtime/condition.go @@ -63,7 +63,7 @@ type ExtensionServiceConfigStatusCondition struct { serviceName string } -// NewExtensionServiceConfigStatusCondition builds a coondition which waits for extension service config to be available. +// NewExtensionServiceConfigStatusCondition builds a condition which waits for extension service config to be available. func NewExtensionServiceConfigStatusCondition(state state.State, serviceName string) *ExtensionServiceConfigStatusCondition { return &ExtensionServiceConfigStatusCondition{ state: state, @@ -85,3 +85,37 @@ func (condition *ExtensionServiceConfigStatusCondition) Wait(ctx context.Context return err } + +// DevicesStatusCondition implements condition which waits for devices to be ready. +type DevicesStatusCondition struct { + state state.State +} + +// NewDevicesStatusCondition builds a condition which waits for devices to be ready. +func NewDevicesStatusCondition(state state.State) *DevicesStatusCondition { + return &DevicesStatusCondition{ + state: state, + } +} + +func (condition *DevicesStatusCondition) String() string { + return "devices to be ready" +} + +// Wait implements condition interface. +func (condition *DevicesStatusCondition) Wait(ctx context.Context) error { + _, err := condition.state.WatchFor( + ctx, + resource.NewMetadata(NamespaceName, DevicesStatusType, DevicesID, resource.VersionUndefined), + state.WithEventTypes(state.Created, state.Updated), + state.WithCondition(func(r resource.Resource) (bool, error) { + if resource.IsTombstone(r) { + return false, nil + } + + return r.(*DevicesStatus).TypedSpec().Ready, nil + }), + ) + + return err +}