Skip to content

Commit

Permalink
feat: implement Talos diagnostics
Browse files Browse the repository at this point in the history
Talos diagnostics analyzes the current system state and produces detailed
warnings about system misconfiguration which might otherwise be tricky to
figure out.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
  • Loading branch information
smira committed Jun 5, 2024
1 parent 357d775 commit 8dbe212
Show file tree
Hide file tree
Showing 25 changed files with 1,482 additions and 270 deletions.
6 changes: 6 additions & 0 deletions api/resource/definitions/runtime/runtime.proto
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ message DevicesStatusSpec {
bool ready = 1;
}

// DiagnosticSpec is the spec for a diagnostic warning.
message DiagnosticSpec {
// message is the warning text.
string message = 1;
// details holds additional detail strings accompanying the message.
repeated string details = 2;
}

// EventSinkConfigSpec describes configuration of Talos event log streaming.
message EventSinkConfigSpec {
string endpoint = 1;
Expand Down
7 changes: 7 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ Please note that on running cluster you will have to kill CoreDNS pods for this
A list of PCI devices can now be obtained via `PCIDevices` resource, e.g. `talosctl get pcidevices`.
"""

[notes.diagnostics]
title = "Diagnostics"
description = """\
Talos Linux now shows diagnostics information for common problems related to misconfiguration via `talosctl health` and Talos dashboard.
"""


[make_deps]

[make_deps.tools]
Expand Down
135 changes: 135 additions & 0 deletions internal/app/machined/pkg/controllers/runtime/diagnostics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime

import (
"context"
"time"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"go.uber.org/zap"

"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/runtime/internal/diagnostics"
"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
"github.com/siderolabs/talos/pkg/machinery/resources/network"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
"github.com/siderolabs/talos/pkg/machinery/resources/v1alpha1"
)

// DiagnosticsController runs the registered diagnostic checks against the Talos Linux system state
// and publishes a warning resource for each common problem it detects.
type DiagnosticsController struct{}

// Name implements controller.Controller interface.
//
// It returns the fixed controller name.
func (ctrl *DiagnosticsController) Name() string {
	const controllerName = "runtime.DiagnosticsController"

	return controllerName
}

// Inputs implements controller.Controller interface.
//
// All four inputs are weak: node addresses, machine configuration, services and the Kubernetes nodename.
func (ctrl *DiagnosticsController) Inputs() []controller.Input {
	nodeAddresses := controller.Input{
		Kind:      controller.InputWeak,
		Namespace: network.NamespaceName,
		Type:      network.NodeAddressType,
	}

	machineConfig := controller.Input{
		Kind:      controller.InputWeak,
		Namespace: config.NamespaceName,
		Type:      config.MachineConfigType,
	}

	services := controller.Input{
		Kind:      controller.InputWeak,
		Namespace: v1alpha1.NamespaceName,
		Type:      v1alpha1.ServiceType,
	}

	nodename := controller.Input{
		Kind:      controller.InputWeak,
		Namespace: k8s.NamespaceName,
		Type:      k8s.NodenameType,
	}

	return []controller.Input{nodeAddresses, machineConfig, services, nodename}
}

// Outputs implements controller.Controller interface.
//
// The controller declares a single exclusive output of the Diagnostic resource type.
func (ctrl *DiagnosticsController) Outputs() []controller.Output {
	diagnosticsOutput := controller.Output{
		Kind: controller.OutputExclusive,
		Type: runtime.DiagnosticType,
	}

	return []controller.Output{diagnosticsOutput}
}

const (
// diagnosticsCheckTimeout bounds the context passed to each individual diagnostic check in Run.
diagnosticsCheckTimeout = time.Minute
// diagnostricsCheckInterval is the period of the ticker in Run which re-runs all checks
// even when no input event arrives.
//
// NOTE(review): the identifier is misspelled ("diagnostrics"); renaming it requires updating its use in Run as well.
diagnostricsCheckInterval = time.Minute
)

// Run implements controller.Controller interface.
//
// Each pass is triggered either by a runtime event or by the periodic ticker. On every pass
// all checks returned by diagnostics.Checks are executed, each under its own timeout:
//   - a check which returns an error is logged at debug level and skipped;
//   - a check which returns no warning clears the remembered first-discovery time for its ID;
//   - a warning is written as a Diagnostic resource only once it has been continuously observed
//     for at least the check's Hysteresis duration.
//
// Outputs are tracked for the duration of the pass and cleaned up at its end
// (presumably removing diagnostics which were not written during this pass — confirm against the COSI runtime).
//
// Run returns nil when ctx is done, or the first error from writing or cleaning up outputs.
//
//nolint:gocyclo
func (ctrl *DiagnosticsController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	// firstSeen maps a check ID to the moment its warning was first observed.
	firstSeen := map[string]time.Time{}

	ticker := time.NewTicker(diagnostricsCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-r.EventCh():
		case <-ticker.C:
		}

		r.StartTrackingOutputs()

		for _, check := range diagnostics.Checks() {
			// The closure scopes the deferred cancel of the per-check timeout to a single check.
			publishErr := func() error {
				timeoutCtx, cancel := context.WithTimeout(ctx, diagnosticsCheckTimeout)
				defer cancel()

				spec, checkErr := check.Check(timeoutCtx, r, logger)

				switch {
				case checkErr != nil:
					// a failing check is not fatal for the controller
					logger.Debug("diagnostic check failed", zap.String("check", check.ID), zap.Error(checkErr))

					return nil
				case spec == nil:
					// no warning: forget when it was first seen
					delete(firstSeen, check.ID)

					return nil
				}

				discoveredAt, known := firstSeen[check.ID]
				if !known {
					discoveredAt = time.Now()
					firstSeen[check.ID] = discoveredAt
				}

				if time.Since(discoveredAt) < check.Hysteresis {
					// the warning has not persisted long enough to be published
					return nil
				}

				return safe.WriterModify(ctx, r, runtime.NewDiagnstic(runtime.NamespaceName, check.ID), func(res *runtime.Diagnostic) error {
					*res.TypedSpec() = *spec

					return nil
				})
			}()
			if publishErr != nil {
				return publishErr
			}
		}

		if err := safe.CleanupOutputs[*runtime.Diagnostic](ctx, r); err != nil {
			return err
		}
	}
}
114 changes: 114 additions & 0 deletions internal/app/machined/pkg/controllers/runtime/diagnostics_logger.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime

import (
"context"
"fmt"
"time"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"go.uber.org/zap"

"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// DiagnosticsLoggerController writes the Diagnostic warnings it observes to the log.
type DiagnosticsLoggerController struct{}

// Name implements controller.Controller interface.
//
// It returns the fixed controller name.
func (ctrl *DiagnosticsLoggerController) Name() string {
	const controllerName = "runtime.DiagnosticsLoggerController"

	return controllerName
}

// Inputs implements controller.Controller interface.
//
// The only input is a weak one on Diagnostic resources in the runtime namespace.
func (ctrl *DiagnosticsLoggerController) Inputs() []controller.Input {
	diagnosticsInput := controller.Input{
		Kind:      controller.InputWeak,
		Namespace: runtime.NamespaceName,
		Type:      runtime.DiagnosticType,
	}

	return []controller.Input{diagnosticsInput}
}

// Outputs implements controller.Controller interface.
//
// The controller declares no outputs, so nil is returned.
func (ctrl *DiagnosticsLoggerController) Outputs() []controller.Output {
return nil
}

// diagnosticsReportInterval is the period of the ticker on which Run re-logs diagnostics which are still active.
const diagnosticsReportInterval = 5 * time.Minute

// Run implements controller.Controller interface.
//
// On every runtime event the current Diagnostic resources are listed: each diagnostic not yet
// announced is logged as "new diagnostic", and each previously announced diagnostic which is
// no longer listed is logged as "diagnostic resolved".
//
// On every report-interval tick, if any diagnostic has been announced and not resolved,
// all currently listed diagnostics are logged again as "diagnostic still active".
//
// Run returns nil when ctx is done, or a wrapped error if listing diagnostics fails.
//
//nolint:gocyclo
func (ctrl *DiagnosticsLoggerController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	// logFields builds the structured fields shared by the "new" and "still active" log entries.
	logFields := func(diagnostic *runtime.Diagnostic) []zap.Field {
		id := diagnostic.Metadata().ID()
		spec := diagnostic.TypedSpec()

		return []zap.Field{
			zap.String("id", id),
			zap.String("message", spec.Message),
			zap.Strings("details", spec.Details),
			zap.String("url", spec.DocumentationURL(id)),
		}
	}

	// announced holds the IDs of diagnostics which were logged as new and are not yet resolved.
	announced := map[string]struct{}{}

	ticker := time.NewTicker(diagnosticsReportInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-r.EventCh():
			list, err := safe.ReaderListAll[*runtime.Diagnostic](ctx, r)
			if err != nil {
				return fmt.Errorf("error listing diagnostics: %w", err)
			}

			present := map[string]struct{}{}

			for it := list.Iterator(); it.Next(); {
				diagnostic := it.Value()
				id := diagnostic.Metadata().ID()

				present[id] = struct{}{}

				if _, known := announced[id]; known {
					continue
				}

				logger.Warn("new diagnostic", logFields(diagnostic)...)

				announced[id] = struct{}{}
			}

			// anything announced earlier but missing from the current list has been resolved
			for id := range announced {
				if _, stillPresent := present[id]; stillPresent {
					continue
				}

				logger.Info("diagnostic resolved", zap.String("id", id))

				delete(announced, id)
			}
		case <-ticker.C:
			if len(announced) == 0 {
				// nothing to remind about; go straight back to waiting
				continue
			}

			list, err := safe.ReaderListAll[*runtime.Diagnostic](ctx, r)
			if err != nil {
				return fmt.Errorf("error listing diagnostics: %w", err)
			}

			for it := list.Iterator(); it.Next(); {
				logger.Warn("diagnostic still active", logFields(it.Value())...)
			}
		}

		r.ResetRestartBackoff()
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package diagnostics

import (
"context"
"fmt"
"net/netip"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/gen/xslices"
"go.uber.org/zap"

"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
"github.com/siderolabs/talos/pkg/machinery/resources/network"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// AddressOverlapCheck checks for overlapping host and Kubernetes pod/service CIDR addresses.
//
// It reads the routed node addresses, the routed node addresses filtered with the "no K8s" filter,
// and the v1alpha1 machine configuration. If any of these resources is not found, no warning is
// produced; any other read error is returned wrapped.
//
// A warning is returned when the routed address list is non-empty while the filtered list is empty
// (presumably meaning every host address falls within a Kubernetes CIDR — confirm against the filter
// implementation). The warning details list the host addresses and, when the machine configuration
// has a cluster section, the configured pod and service CIDRs.
//
// The logger argument is not used by this check.
func AddressOverlapCheck(ctx context.Context, r controller.Reader, logger *zap.Logger) (*runtime.DiagnosticSpec, error) {
	routed, err := safe.ReaderGetByID[*network.NodeAddress](ctx, r, network.NodeAddressRoutedID)
	if err != nil {
		if !state.IsNotFoundError(err) {
			return nil, fmt.Errorf("error reading host addresses: %w", err)
		}

		return nil, nil
	}

	filteredID := network.FilteredNodeAddressID(network.NodeAddressRoutedID, k8s.NodeAddressFilterNoK8s)

	routedNoK8s, err := safe.ReaderGetByID[*network.NodeAddress](ctx, r, filteredID)
	if err != nil {
		if !state.IsNotFoundError(err) {
			return nil, fmt.Errorf("error reading host minus k8s addresses: %w", err)
		}

		return nil, nil
	}

	machineConfig, err := safe.ReaderGetByID[*config.MachineConfig](ctx, r, config.V1Alpha1ID)
	if err != nil {
		if !state.IsNotFoundError(err) {
			return nil, fmt.Errorf("error reading machine configuration: %w", err)
		}

		return nil, nil
	}

	hostAddresses := routed.TypedSpec().Addresses

	// no overlap unless there are host addresses and none of them survive the K8s filter
	if len(hostAddresses) == 0 || len(routedNoK8s.TypedSpec().Addresses) != 0 {
		return nil, nil
	}

	details := []string{
		fmt.Sprintf("host routed addresses: %q", xslices.Map(hostAddresses, netip.Prefix.String)),
	}

	if cluster := machineConfig.Config().Cluster(); cluster != nil {
		details = append(details,
			fmt.Sprintf("Kubernetes pod CIDRs: %q", cluster.Network().PodCIDRs()),
			fmt.Sprintf("Kubernetes service CIDRs: %q", cluster.Network().ServiceCIDRs()),
		)
	}

	return &runtime.DiagnosticSpec{
		Message: "host and Kubernetes pod/service CIDR addresses overlap",
		Details: details,
	}, nil
}
Loading

0 comments on commit 8dbe212

Please sign in to comment.