-
Notifications
You must be signed in to change notification settings - Fork 553
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Talos diagnostics analyzes the current system state and produces detailed warnings about system misconfigurations that might otherwise be tricky to figure out. Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
- Loading branch information
Showing
25 changed files
with
1,482 additions
and
270 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
135 changes: 135 additions & 0 deletions
135
internal/app/machined/pkg/controllers/runtime/diagnostics.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
// This Source Code Form is subject to the terms of the Mozilla Public | ||
// License, v. 2.0. If a copy of the MPL was not distributed with this | ||
// file, You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
||
package runtime | ||
|
||
import ( | ||
"context" | ||
"time" | ||
|
||
"github.com/cosi-project/runtime/pkg/controller" | ||
"github.com/cosi-project/runtime/pkg/safe" | ||
"go.uber.org/zap" | ||
|
||
"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/runtime/internal/diagnostics" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/config" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/k8s" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/network" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/runtime" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/v1alpha1" | ||
) | ||
|
||
// DiagnosticsController analyzes the state of a Talos Linux system and
// publishes warnings about common problems as Diagnostic resources.
type DiagnosticsController struct{}
|
||
// Name implements controller.Controller interface. | ||
func (ctrl *DiagnosticsController) Name() string { | ||
return "runtime.DiagnosticsController" | ||
} | ||
|
||
// Inputs implements controller.Controller interface. | ||
func (ctrl *DiagnosticsController) Inputs() []controller.Input { | ||
return []controller.Input{ | ||
{ | ||
Namespace: network.NamespaceName, | ||
Type: network.NodeAddressType, | ||
Kind: controller.InputWeak, | ||
}, | ||
{ | ||
Namespace: config.NamespaceName, | ||
Type: config.MachineConfigType, | ||
Kind: controller.InputWeak, | ||
}, | ||
{ | ||
Namespace: v1alpha1.NamespaceName, | ||
Type: v1alpha1.ServiceType, | ||
Kind: controller.InputWeak, | ||
}, | ||
{ | ||
Namespace: k8s.NamespaceName, | ||
Type: k8s.NodenameType, | ||
Kind: controller.InputWeak, | ||
}, | ||
} | ||
} | ||
|
||
// Outputs implements controller.Controller interface. | ||
func (ctrl *DiagnosticsController) Outputs() []controller.Output { | ||
return []controller.Output{ | ||
{ | ||
Type: runtime.DiagnosticType, | ||
Kind: controller.OutputExclusive, | ||
}, | ||
} | ||
} | ||
|
||
const ( | ||
diagnosticsCheckTimeout = time.Minute | ||
diagnostricsCheckInterval = time.Minute | ||
) | ||
|
||
// Run implements controller.Controller interface. | ||
// | ||
//nolint:gocyclo | ||
func (ctrl *DiagnosticsController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error { | ||
// firstDiscovery is used to track when a warning was first discovered. | ||
firstDiscovered := map[string]time.Time{} | ||
|
||
ticker := time.NewTicker(diagnostricsCheckInterval) | ||
defer ticker.Stop() | ||
|
||
for { | ||
select { | ||
case <-ctx.Done(): | ||
return nil | ||
case <-r.EventCh(): | ||
case <-ticker.C: | ||
} | ||
|
||
r.StartTrackingOutputs() | ||
|
||
for _, checkDescription := range diagnostics.Checks() { | ||
if err := func() error { | ||
checkCtx, checkCtxCancel := context.WithTimeout(ctx, diagnosticsCheckTimeout) | ||
defer checkCtxCancel() | ||
|
||
warning, err := checkDescription.Check(checkCtx, r, logger) | ||
if err != nil { | ||
logger.Debug("diagnostic check failed", zap.String("check", checkDescription.ID), zap.Error(err)) | ||
|
||
return nil | ||
} | ||
|
||
if warning == nil { | ||
delete(firstDiscovered, checkDescription.ID) | ||
|
||
return nil | ||
} | ||
|
||
firstDiscoveredTime, ok := firstDiscovered[checkDescription.ID] | ||
if !ok { | ||
firstDiscoveredTime = time.Now() | ||
firstDiscovered[checkDescription.ID] = firstDiscoveredTime | ||
} | ||
|
||
if time.Since(firstDiscoveredTime) < checkDescription.Hysteresis { | ||
// don't publish it yet | ||
return nil | ||
} | ||
|
||
return safe.WriterModify(ctx, r, runtime.NewDiagnstic(runtime.NamespaceName, checkDescription.ID), func(res *runtime.Diagnostic) error { | ||
*res.TypedSpec() = *warning | ||
|
||
return nil | ||
}) | ||
}(); err != nil { | ||
return err | ||
} | ||
} | ||
|
||
if err := safe.CleanupOutputs[*runtime.Diagnostic](ctx, r); err != nil { | ||
return err | ||
} | ||
} | ||
} |
114 changes: 114 additions & 0 deletions
114
internal/app/machined/pkg/controllers/runtime/diagnostics_logger.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
// This Source Code Form is subject to the terms of the Mozilla Public | ||
// License, v. 2.0. If a copy of the MPL was not distributed with this | ||
// file, You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
||
package runtime | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/cosi-project/runtime/pkg/controller" | ||
"github.com/cosi-project/runtime/pkg/safe" | ||
"go.uber.org/zap" | ||
|
||
"github.com/siderolabs/talos/pkg/machinery/resources/runtime" | ||
) | ||
|
||
// DiagnosticsLoggerController writes log messages for the Diagnostic
// resources (warnings) generated by DiagnosticsController.
type DiagnosticsLoggerController struct{}
|
||
// Name implements controller.Controller interface. | ||
func (ctrl *DiagnosticsLoggerController) Name() string { | ||
return "runtime.DiagnosticsLoggerController" | ||
} | ||
|
||
// Inputs implements controller.Controller interface. | ||
func (ctrl *DiagnosticsLoggerController) Inputs() []controller.Input { | ||
return []controller.Input{ | ||
{ | ||
Namespace: runtime.NamespaceName, | ||
Type: runtime.DiagnosticType, | ||
Kind: controller.InputWeak, | ||
}, | ||
} | ||
} | ||
|
||
// Outputs implements controller.Controller interface. | ||
func (ctrl *DiagnosticsLoggerController) Outputs() []controller.Output { | ||
return nil | ||
} | ||
|
||
// diagnosticsReportInterval is the period of the ticker on which diagnostics
// that are still present are logged again.
const diagnosticsReportInterval = 5 * time.Minute
|
||
// Run implements controller.Controller interface. | ||
// | ||
//nolint:gocyclo | ||
func (ctrl *DiagnosticsLoggerController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error { | ||
reportedWarnings := map[string]struct{}{} | ||
|
||
ticker := time.NewTicker(diagnosticsReportInterval) | ||
defer ticker.Stop() | ||
|
||
for { | ||
select { | ||
case <-ctx.Done(): | ||
return nil | ||
case <-r.EventCh(): | ||
warnings, err := safe.ReaderListAll[*runtime.Diagnostic](ctx, r) | ||
if err != nil { | ||
return fmt.Errorf("error listing diagnostics: %w", err) | ||
} | ||
|
||
seenWarnings := map[string]struct{}{} | ||
|
||
for iter := warnings.Iterator(); iter.Next(); { | ||
warning := iter.Value() | ||
|
||
seenWarnings[warning.Metadata().ID()] = struct{}{} | ||
|
||
if _, reported := reportedWarnings[warning.Metadata().ID()]; !reported { | ||
logger.Warn("new diagnostic", | ||
zap.String("id", warning.Metadata().ID()), | ||
zap.String("message", warning.TypedSpec().Message), | ||
zap.Strings("details", warning.TypedSpec().Details), | ||
zap.String("url", warning.TypedSpec().DocumentationURL(warning.Metadata().ID())), | ||
) | ||
|
||
reportedWarnings[warning.Metadata().ID()] = struct{}{} | ||
} | ||
} | ||
|
||
for id := range reportedWarnings { | ||
if _, seen := seenWarnings[id]; !seen { | ||
logger.Info("diagnostic resolved", zap.String("id", id)) | ||
|
||
delete(reportedWarnings, id) | ||
} | ||
} | ||
case <-ticker.C: | ||
if len(reportedWarnings) == 0 { | ||
continue | ||
} | ||
|
||
warnings, err := safe.ReaderListAll[*runtime.Diagnostic](ctx, r) | ||
if err != nil { | ||
return fmt.Errorf("error listing diagnostics: %w", err) | ||
} | ||
|
||
for iter := warnings.Iterator(); iter.Next(); { | ||
warning := iter.Value() | ||
|
||
logger.Warn("diagnostic still active", | ||
zap.String("id", warning.Metadata().ID()), | ||
zap.String("message", warning.TypedSpec().Message), | ||
zap.Strings("details", warning.TypedSpec().Details), | ||
zap.String("url", warning.TypedSpec().DocumentationURL(warning.Metadata().ID())), | ||
) | ||
} | ||
} | ||
|
||
r.ResetRestartBackoff() | ||
} | ||
} |
70 changes: 70 additions & 0 deletions
70
internal/app/machined/pkg/controllers/runtime/internal/diagnostics/address_overlap.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
// This Source Code Form is subject to the terms of the Mozilla Public | ||
// License, v. 2.0. If a copy of the MPL was not distributed with this | ||
// file, You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
||
package diagnostics | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"net/netip" | ||
|
||
"github.com/cosi-project/runtime/pkg/controller" | ||
"github.com/cosi-project/runtime/pkg/safe" | ||
"github.com/cosi-project/runtime/pkg/state" | ||
"github.com/siderolabs/gen/xslices" | ||
"go.uber.org/zap" | ||
|
||
"github.com/siderolabs/talos/pkg/machinery/resources/config" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/k8s" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/network" | ||
"github.com/siderolabs/talos/pkg/machinery/resources/runtime" | ||
) | ||
|
||
// AddressOverlapCheck checks for overlapping host and Kubernetes pod/service CIDR addresses. | ||
func AddressOverlapCheck(ctx context.Context, r controller.Reader, logger *zap.Logger) (*runtime.DiagnosticSpec, error) { | ||
hostAddresses, err := safe.ReaderGetByID[*network.NodeAddress](ctx, r, network.NodeAddressRoutedID) | ||
if err != nil { | ||
if state.IsNotFoundError(err) { | ||
return nil, nil | ||
} | ||
|
||
return nil, fmt.Errorf("error reading host addresses: %w", err) | ||
} | ||
|
||
hostMinusK8s, err := safe.ReaderGetByID[*network.NodeAddress](ctx, r, network.FilteredNodeAddressID(network.NodeAddressRoutedID, k8s.NodeAddressFilterNoK8s)) | ||
if err != nil { | ||
if state.IsNotFoundError(err) { | ||
return nil, nil | ||
} | ||
|
||
return nil, fmt.Errorf("error reading host minus k8s addresses: %w", err) | ||
} | ||
|
||
cfg, err := safe.ReaderGetByID[*config.MachineConfig](ctx, r, config.V1Alpha1ID) | ||
if err != nil { | ||
if state.IsNotFoundError(err) { | ||
return nil, nil | ||
} | ||
|
||
return nil, fmt.Errorf("error reading machine configuration: %w", err) | ||
} | ||
|
||
if len(hostAddresses.TypedSpec().Addresses) > 0 && len(hostMinusK8s.TypedSpec().Addresses) == 0 { | ||
details := []string{ | ||
fmt.Sprintf("host routed addresses: %q", xslices.Map(hostAddresses.TypedSpec().Addresses, netip.Prefix.String)), | ||
} | ||
|
||
if cfg.Config().Cluster() != nil { | ||
details = append(details, fmt.Sprintf("Kubernetes pod CIDRs: %q", cfg.Config().Cluster().Network().PodCIDRs())) | ||
details = append(details, fmt.Sprintf("Kubernetes service CIDRs: %q", cfg.Config().Cluster().Network().ServiceCIDRs())) | ||
} | ||
|
||
return &runtime.DiagnosticSpec{ | ||
Message: "host and Kubernetes pod/service CIDR addresses overlap", | ||
Details: details, | ||
}, nil | ||
} | ||
|
||
return nil, nil | ||
} |
Oops, something went wrong.