From 5699fef911b810571f068e8c5dcdf0e31d4ecf1e Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Tue, 26 Mar 2024 13:29:04 -0600 Subject: [PATCH 1/8] feat(inputs.smartctl): Introduce smartctl JSON input plugin --- plugins/inputs/all/smartctl.go | 5 + plugins/inputs/smartctl/README.md | 100 ++++ plugins/inputs/smartctl/sample.conf | 30 ++ plugins/inputs/smartctl/smartctl.go | 93 ++++ plugins/inputs/smartctl/smartctl_device.go | 98 ++++ plugins/inputs/smartctl/smartctl_json.go | 184 +++++++ plugins/inputs/smartctl/smartctl_scan.go | 35 ++ plugins/inputs/smartctl/smartctl_test.go | 188 +++++++ .../smartctl/testcases_device/nvme/device | 1 + .../smartctl/testcases_device/nvme/deviceType | 1 + .../testcases_device/nvme/expected.out | 1 + .../testcases_device/nvme/response.json | 145 ++++++ .../smartctl/testcases_device/usb/device | 1 + .../smartctl/testcases_device/usb/deviceType | 1 + .../testcases_device/usb/expected.out | 14 + .../testcases_device/usb/response.json | 486 ++++++++++++++++++ .../smartctl/testcases_scan/all/expected.out | 1 + .../smartctl/testcases_scan/all/telegraf.toml | 1 + .../testcases_scan/exclude/expected.out | 1 + .../testcases_scan/exclude/telegraf.toml | 2 + .../testcases_scan/include/expected.out | 1 + .../testcases_scan/include/telegraf.toml | 2 + .../inputs/smartctl/testcases_scan/scan.json | 42 ++ 23 files changed, 1433 insertions(+) create mode 100644 plugins/inputs/all/smartctl.go create mode 100644 plugins/inputs/smartctl/README.md create mode 100644 plugins/inputs/smartctl/sample.conf create mode 100644 plugins/inputs/smartctl/smartctl.go create mode 100644 plugins/inputs/smartctl/smartctl_device.go create mode 100644 plugins/inputs/smartctl/smartctl_json.go create mode 100644 plugins/inputs/smartctl/smartctl_scan.go create mode 100644 plugins/inputs/smartctl/smartctl_test.go create mode 100644 plugins/inputs/smartctl/testcases_device/nvme/device create mode 100644 
plugins/inputs/smartctl/testcases_device/nvme/deviceType create mode 100644 plugins/inputs/smartctl/testcases_device/nvme/expected.out create mode 100644 plugins/inputs/smartctl/testcases_device/nvme/response.json create mode 100644 plugins/inputs/smartctl/testcases_device/usb/device create mode 100644 plugins/inputs/smartctl/testcases_device/usb/deviceType create mode 100644 plugins/inputs/smartctl/testcases_device/usb/expected.out create mode 100644 plugins/inputs/smartctl/testcases_device/usb/response.json create mode 100644 plugins/inputs/smartctl/testcases_scan/all/expected.out create mode 100644 plugins/inputs/smartctl/testcases_scan/all/telegraf.toml create mode 100644 plugins/inputs/smartctl/testcases_scan/exclude/expected.out create mode 100644 plugins/inputs/smartctl/testcases_scan/exclude/telegraf.toml create mode 100644 plugins/inputs/smartctl/testcases_scan/include/expected.out create mode 100644 plugins/inputs/smartctl/testcases_scan/include/telegraf.toml create mode 100644 plugins/inputs/smartctl/testcases_scan/scan.json diff --git a/plugins/inputs/all/smartctl.go b/plugins/inputs/all/smartctl.go new file mode 100644 index 0000000000000..69155b83bdebe --- /dev/null +++ b/plugins/inputs/all/smartctl.go @@ -0,0 +1,5 @@ +//go:build !custom || inputs || inputs.smartctl + +package all + +import _ "github.com/influxdata/telegraf/plugins/inputs/smartctl" // register plugin diff --git a/plugins/inputs/smartctl/README.md b/plugins/inputs/smartctl/README.md new file mode 100644 index 0000000000000..d0ddbbc81e30e --- /dev/null +++ b/plugins/inputs/smartctl/README.md @@ -0,0 +1,100 @@ +# smartctl JSON Input Plugin + +Get metrics using the command line utility `smartctl` for S.M.A.R.T. +(Self-Monitoring, Analysis and Reporting Technology) storage devices. 
SMART is a +monitoring system included in computer hard disk drives (HDDs), solid-state +drives (SSDs), and NVMe drives that detects and reports on various indicators of +drive reliability, with the intent of enabling the anticipation of hardware +failures. + +This version of the plugin requires support of the JSON flag from the `smartctl` +command. This flag was added in 7.0 (2019) and further enhanced in subsequent +releases. + +See smartmontools (<https://www.smartmontools.org/>) for more information. + +## Global configuration options <!-- @/docs/includes/plugin_config.md --> + +In addition to the plugin-specific configuration settings, plugins support +additional global and plugin configuration settings. These settings are used to +modify metrics, tags, and field or create aliases and configure ordering, etc. +See the [CONFIGURATION.md][CONFIGURATION.md] for more details. + +[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins + +## Configuration + +```toml @sample.conf +# Read metrics from SMART storage devices using smartctl's JSON output +[[inputs.smartctl]] + ## Optionally specify the path to the smartctl executable + # path = "/usr/bin/smartctl" + + ## Use sudo + ## On most platforms used, smartctl requires root access. Setting 'use_sudo' + ## to true will make use of sudo to run smartctl. Sudo must be configured to + ## allow the telegraf user to run smartctl without a password. + # use_sudo = false + + ## Devices to include or exclude + ## By default, the plugin will use all devices found in the output of + ## `smartctl --scan`. Only one option is allowed at a time. If set, include + ## sets the specific devices to scan, while exclude omits specific devices. + # devices_include = [] + # devices_exclude = [] + + ## Skip checking disks in specified power mode + ## Defaults to "standby" to not wake up disks that have stopped rotating. + ## For full details on the options here, see the --nocheck section in the + ## smartctl man page. 
Choose from: + ## * never: always check the device + ## * sleep: check the device unless it is in sleep mode + ## * standby: check the device unless it is in sleep or standby mode + ## * idle: check the device unless it is in sleep, standby, or idle mode + # no_check = "standby" + + ## Timeout for the cli command to complete + # timeout = "30s" +``` + +## Permissions + +It is important to note that this plugin references `smartctl`, which may +require additional permissions to execute successfully. Depending on the +user/group permissions of the telegraf user executing this plugin, users may +need to use sudo. + +Users need the following in the Telegraf config: + +```toml +[[inputs.smartctl]] + use_sudo = true +``` + +And to update the `/etc/sudoers` file to allow running smartctl: + +```bash +$ visudo +# Add the following lines: +Cmnd_Alias SMARTCTL = /usr/bin/smartctl +telegraf ALL=(ALL) NOPASSWD: SMARTCTL +Defaults!SMARTCTL !logfile, !syslog, !pam_session +``` + +## Debugging Issues + +This plugin uses the following commands to determine devices and collect +metrics: + +* `smartctl --json --scan` +* `smartctl --json --all $DEVICE --device $TYPE --nocheck=$NOCHECK` + +Please include the output of the above two commands for all devices that are +having issues. + +## Metrics + +## Example Output + +```text +``` diff --git a/plugins/inputs/smartctl/sample.conf b/plugins/inputs/smartctl/sample.conf new file mode 100644 index 0000000000000..5b73ef012b8c5 --- /dev/null +++ b/plugins/inputs/smartctl/sample.conf @@ -0,0 +1,30 @@ +# Read metrics from SMART storage devices using smartctl's JSON output +[[inputs.smartctl]] + ## Optionally specify the path to the smartctl executable + # path = "/usr/bin/smartctl" + + ## Use sudo + ## On most platforms used, smartctl requires root access. Setting 'use_sudo' + ## to true will make use of sudo to run smartctl. Sudo must be configured to + ## allow the telegraf user to run smartctl without a password. 
+ # use_sudo = false + + ## Devices to include or exclude + ## By default, the plugin will use all devices found in the output of + ## `smartctl --scan`. Only one option is allowed at a time. If set, include + ## sets the specific devices to scan, while exclude omits specific devices. + # devices_include = [] + # devices_exclude = [] + + ## Skip checking disks in specified power mode + ## Defaults to "standby" to not wake up disks that have stopped rotating. + ## For full details on the options here, see the --nocheck section in the + ## smartctl man page. Choose from: + ## * never: always check the device + ## * sleep: check the device unless it is in sleep mode + ## * standby: check the device unless it is in sleep or standby mode + ## * idle: check the device unless it is in sleep, standby, or idle mode + # no_check = "standby" + + ## Timeout for the cli command to complete + # timeout = "30s" diff --git a/plugins/inputs/smartctl/smartctl.go b/plugins/inputs/smartctl/smartctl.go new file mode 100644 index 0000000000000..60a4320309257 --- /dev/null +++ b/plugins/inputs/smartctl/smartctl.go @@ -0,0 +1,93 @@ +//go:generate ../../../tools/readme_config_includer/generator +package smartctl + +import ( + _ "embed" + "errors" + "fmt" + "os" + "os/exec" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/config" + "github.com/influxdata/telegraf/filter" + "github.com/influxdata/telegraf/plugins/inputs" +) + +//go:embed sample.conf +var sampleConfig string + +// execCommand is used to mock commands in tests. 
+var execCommand = exec.Command + +type Smartctl struct { + Path string `toml:"path"` + NoCheck string `toml:"no_check"` + UseSudo bool `toml:"use_sudo"` + Timeout config.Duration `toml:"timeout"` + DevicesInclude []string `toml:"devices_include"` + DevicesExclude []string `toml:"devices_exclude"` + Log telegraf.Logger `toml:"-"` + + deviceFilter filter.Filter +} + +func (*Smartctl) SampleConfig() string { + return sampleConfig +} + +func (s *Smartctl) Init() error { + if s.Path == "" { + s.Path = "/usr/bin/smartctl" + } + + switch s.NoCheck { + case "never", "sleep", "standby", "idle": + case "": + s.NoCheck = "standby" + default: + return fmt.Errorf("invalid no_check value: %s", s.NoCheck) + } + + if s.Timeout == 0 { + s.Timeout = config.Duration(time.Second * 30) + } + + if len(s.DevicesInclude) != 0 && len(s.DevicesExclude) != 0 { + return errors.New("cannot specify both devices_include and devices_exclude") + } + + var err error + s.deviceFilter, err = filter.NewIncludeExcludeFilter(s.DevicesInclude, s.DevicesExclude) + if err != nil { + return err + } + + return nil +} + +func (s *Smartctl) Gather(acc telegraf.Accumulator) error { + devices, err := s.scan() + if err != nil { + return fmt.Errorf("Error scanning system: %w", err) + } + + for device, deviceType := range devices { + if err := s.scanDevice(acc, device, deviceType); err != nil { + return fmt.Errorf("Error getting device %s: %w", device, err) + } + } + + return nil +} + +func init() { + // Set LC_NUMERIC to uniform numeric output from cli tools + _ = os.Setenv("LC_NUMERIC", "en_US.UTF-8") + inputs.Add("smartctl", func() telegraf.Input { + return &Smartctl{ + Timeout: config.Duration(time.Second * 30), + } + }) +} diff --git a/plugins/inputs/smartctl/smartctl_device.go b/plugins/inputs/smartctl/smartctl_device.go new file mode 100644 index 0000000000000..c49cb36d52287 --- /dev/null +++ b/plugins/inputs/smartctl/smartctl_device.go @@ -0,0 +1,98 @@ +package smartctl + +import ( + "encoding/json" + 
"fmt" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" +) + +func (s *Smartctl) scanDevice(acc telegraf.Accumulator, deviceName string, deviceType string) error { + args := []string{"--json", "--all", deviceName, "--device", deviceType, "--nocheck=" + s.NoCheck} + cmd := execCommand(s.Path, args...) + if s.UseSudo { + cmd = execCommand("sudo", append([]string{"-n", s.Path}, args...)...) + } + + var device smartctlDeviceJSON + out, err := internal.CombinedOutputTimeout(cmd, time.Duration(s.Timeout)) + if err != nil { + // Try to still unmarshal the output to see if a specific message can + // be extracted from the output. + if err := json.Unmarshal(out, &device); err == nil { + if len(device.Smartctl.Messages) > 0 && device.Smartctl.Messages[0].String != "" { + return fmt.Errorf("error running smartctl with %s: %s", args, device.Smartctl.Messages[0].String) + } + } + return fmt.Errorf("error running smartctl with %s: %w", args, err) + } + t := time.Now() + + if err := json.Unmarshal(out, &device); err != nil { + return fmt.Errorf("error unmarshalling smartctl output: %w", err) + } + + tags := map[string]string{ + "name": device.Device.Name, + "type": device.Device.Type, + "model": device.ModelName, + "serial": device.SerialNumber, + } + + // The JSON WWN is in decimal and needs to be converted to hex + if device.Wwn.ID != 0 && device.Wwn.Naa != 0 && device.Wwn.Oui != 0 { + tags["wwn"] = fmt.Sprintf("%01x%06x%09x", device.Wwn.Naa, device.Wwn.Oui, device.Wwn.ID) + } + + fields := map[string]interface{}{ + "capacity": device.UserCapacity.Bytes, + "health_ok": device.SmartStatus.Passed, + "temperature": device.Temperature.Current, + "firmware": device.FirmwareVersion, + } + + // Add NVMe specific fields + if device.Device.Type == "nvme" { + fields["critical_warning"] = device.NvmeSmartHealthInformationLog.CriticalWarning + fields["temperature"] = device.NvmeSmartHealthInformationLog.Temperature + fields["available_spare"] = 
device.NvmeSmartHealthInformationLog.AvailableSpare + fields["available_spare_threshold"] = device.NvmeSmartHealthInformationLog.AvailableSpareThreshold + fields["percentage_used"] = device.NvmeSmartHealthInformationLog.PercentageUsed + fields["data_units_read"] = device.NvmeSmartHealthInformationLog.DataUnitsRead + fields["data_units_written"] = device.NvmeSmartHealthInformationLog.DataUnitsWritten + fields["host_reads"] = device.NvmeSmartHealthInformationLog.HostReads + fields["host_writes"] = device.NvmeSmartHealthInformationLog.HostWrites + fields["controller_busy_time"] = device.NvmeSmartHealthInformationLog.ControllerBusyTime + fields["power_cycles"] = device.NvmeSmartHealthInformationLog.PowerCycles + fields["power_on_hours"] = device.NvmeSmartHealthInformationLog.PowerOnHours + fields["unsafe_shutdowns"] = device.NvmeSmartHealthInformationLog.UnsafeShutdowns + fields["media_errors"] = device.NvmeSmartHealthInformationLog.MediaErrors + fields["num_err_log_entries"] = device.NvmeSmartHealthInformationLog.NumErrLogEntries + fields["warning_temp_time"] = device.NvmeSmartHealthInformationLog.WarningTempTime + fields["critical_comp_time"] = device.NvmeSmartHealthInformationLog.CriticalCompTime + } + + acc.AddFields("smartctl", fields, tags, t) + + // Check for ATA specific attribute fields + for _, attribute := range device.AtaSmartAttributes.Table { + attributeTags := make(map[string]string, len(tags)+1) + for k, v := range tags { + attributeTags[k] = v + } + attributeTags["name"] = attribute.Name + + fields := map[string]interface{}{ + "raw_value": attribute.Raw.Value, + "worst": attribute.Worst, + "threshold": attribute.Thresh, + "value": attribute.Value, + } + + acc.AddFields("smartctl_attributes", fields, attributeTags, t) + } + + return nil +} diff --git a/plugins/inputs/smartctl/smartctl_json.go b/plugins/inputs/smartctl/smartctl_json.go new file mode 100644 index 0000000000000..6e53db00b9b40 --- /dev/null +++ b/plugins/inputs/smartctl/smartctl_json.go @@ 
-0,0 +1,184 @@ +package smartctl + +type smartctlDeviceJSON struct { + JSONFormatVersion []int `json:"json_format_version"` + Smartctl struct { + Version []int `json:"version"` + PreRelease bool `json:"pre_release"` + SvnRevision string `json:"svn_revision"` + PlatformInfo string `json:"platform_info"` + BuildInfo string `json:"build_info"` + Argv []string `json:"argv"` + Messages []struct { + Severity string `json:"severity"` + String string `json:"string"` + } `json:"messages"` + ExitStatus int `json:"exit_status"` + } `json:"smartctl"` + LocalTime struct { + TimeT int `json:"time_t"` + Asctime string `json:"asctime"` + } `json:"local_time"` + Device struct { + Name string `json:"name"` + InfoName string `json:"info_name"` + Type string `json:"type"` + Protocol string `json:"protocol"` + } `json:"device"` + ModelFamily string `json:"model_family"` + ModelName string `json:"model_name"` + SerialNumber string `json:"serial_number"` + FirmwareVersion string `json:"firmware_version"` + Wwn struct { + Naa int `json:"naa"` + Oui int `json:"oui"` + ID int64 `json:"id"` + } `json:"wwn"` + NvmePciVendor struct { + ID int `json:"id"` + SubsystemID int `json:"subsystem_id"` + } `json:"nvme_pci_vendor"` + NvmeIeeeOuiIdentifier int `json:"nvme_ieee_oui_identifier"` + NvmeTotalCapacity int64 `json:"nvme_total_capacity"` + NvmeUnallocatedCapacity int `json:"nvme_unallocated_capacity"` + NvmeControllerID int `json:"nvme_controller_id"` + NvmeVersion struct { + String string `json:"string"` + Value int `json:"value"` + } `json:"nvme_version"` + NvmeNumberOfNamespaces int `json:"nvme_number_of_namespaces"` + NvmeNamespaces []struct { + ID int `json:"id"` + Size struct { + Blocks int `json:"blocks"` + Bytes int64 `json:"bytes"` + } `json:"size"` + Capacity struct { + Blocks int `json:"blocks"` + Bytes int64 `json:"bytes"` + } `json:"capacity"` + Utilization struct { + Blocks int `json:"blocks"` + Bytes int64 `json:"bytes"` + } `json:"utilization"` + FormattedLbaSize int 
`json:"formatted_lba_size"` + Eui64 struct { + Oui int `json:"oui"` + ExtID int64 `json:"ext_id"` + } `json:"eui64"` + } `json:"nvme_namespaces"` + UserCapacity struct { + Blocks int `json:"blocks"` + Bytes int64 `json:"bytes"` + } `json:"user_capacity"` + LogicalBlockSize int `json:"logical_block_size"` + SmartSupport struct { + Available bool `json:"available"` + Enabled bool `json:"enabled"` + } `json:"smart_support"` + SmartStatus struct { + Passed bool `json:"passed"` + Nvme struct { + Value int `json:"value"` + } `json:"nvme"` + } `json:"smart_status"` + NvmeSmartHealthInformationLog struct { + CriticalWarning int `json:"critical_warning"` + Temperature int `json:"temperature"` + AvailableSpare int `json:"available_spare"` + AvailableSpareThreshold int `json:"available_spare_threshold"` + PercentageUsed int `json:"percentage_used"` + DataUnitsRead int `json:"data_units_read"` + DataUnitsWritten int `json:"data_units_written"` + HostReads int `json:"host_reads"` + HostWrites int `json:"host_writes"` + ControllerBusyTime int `json:"controller_busy_time"` + PowerCycles int `json:"power_cycles"` + PowerOnHours int `json:"power_on_hours"` + UnsafeShutdowns int `json:"unsafe_shutdowns"` + MediaErrors int `json:"media_errors"` + NumErrLogEntries int `json:"num_err_log_entries"` + WarningTempTime int `json:"warning_temp_time"` + CriticalCompTime int `json:"critical_comp_time"` + } `json:"nvme_smart_health_information_log"` + Temperature struct { + Current int `json:"current"` + } `json:"temperature"` + PowerCycleCount int `json:"power_cycle_count"` + PowerOnTime struct { + Hours int `json:"hours"` + } `json:"power_on_time"` + NvmeErrorInformationLog struct { + Size int `json:"size"` + Read int `json:"read"` + Unread int `json:"unread"` + Table []struct { + ErrorCount int `json:"error_count"` + SubmissionQueueID int `json:"submission_queue_id"` + CommandID int `json:"command_id"` + StatusField struct { + Value int `json:"value"` + DoNotRetry bool `json:"do_not_retry"` 
+ StatusCodeType int `json:"status_code_type"` + StatusCode int `json:"status_code"` + String string `json:"string"` + } `json:"status_field"` + PhaseTag bool `json:"phase_tag"` + ParmErrorLocation int `json:"parm_error_location"` + Lba struct { + Value int `json:"value"` + } `json:"lba"` + Nsid int `json:"nsid"` + } `json:"table"` + } `json:"nvme_error_information_log"` + NvmeSelfTestLog struct { + CurrentSelfTestOperation struct { + Value int `json:"value"` + String string `json:"string"` + } `json:"current_self_test_operation"` + } `json:"nvme_self_test_log"` + AtaSmartAttributes struct { + Revision int `json:"revision"` + Table []struct { + ID int `json:"id"` + Name string `json:"name"` + Value int `json:"value"` + Worst int `json:"worst"` + Thresh int `json:"thresh"` + WhenFailed string `json:"when_failed"` + Flags struct { + Value int `json:"value"` + String string `json:"string"` + Prefailure bool `json:"prefailure"` + UpdatedOnline bool `json:"updated_online"` + Performance bool `json:"performance"` + ErrorRate bool `json:"error_rate"` + EventCount bool `json:"event_count"` + AutoKeep bool `json:"auto_keep"` + } `json:"flags"` + Raw struct { + Value int `json:"value"` + String string `json:"string"` + } `json:"raw"` + } `json:"table"` + } `json:"ata_smart_attributes"` +} + +type smartctlScanJSON struct { + JSONFormatVersion []int `json:"json_format_version"` + Smartctl struct { + Version []int `json:"version"` + PreRelease bool `json:"pre_release"` + SvnRevision string `json:"svn_revision"` + PlatformInfo string `json:"platform_info"` + BuildInfo string `json:"build_info"` + Argv []string `json:"argv"` + ExitStatus int `json:"exit_status"` + } `json:"smartctl"` + Devices []struct { + Name string `json:"name"` + InfoName string `json:"info_name"` + Type string `json:"type"` + Protocol string `json:"protocol"` + } `json:"devices"` +} diff --git a/plugins/inputs/smartctl/smartctl_scan.go b/plugins/inputs/smartctl/smartctl_scan.go new file mode 100644 index 
0000000000000..c96b07f6e4509 --- /dev/null +++ b/plugins/inputs/smartctl/smartctl_scan.go @@ -0,0 +1,35 @@ +package smartctl + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/influxdata/telegraf/internal" +) + +func (s *Smartctl) scan() (map[string]string, error) { + args := []string{"--json", "--scan"} + cmd := execCommand(s.Path, args...) + if s.UseSudo { + cmd = execCommand("sudo", append([]string{"-n", s.Path}, args...)...) + } + out, err := internal.CombinedOutputTimeout(cmd, time.Duration(s.Timeout)) + if err != nil { + return nil, fmt.Errorf("error running smartctl with %s: %w", args, err) + } + + var scan smartctlScanJSON + if err := json.Unmarshal(out, &scan); err != nil { + return nil, fmt.Errorf("error unmarshalling smartctl scan output: %w", err) + } + + devices := make(map[string]string, len(scan.Devices)) + for _, device := range scan.Devices { + if s.deviceFilter.Match(device.Name) { + devices[device.Name] = device.Type + } + } + + return devices, nil +} diff --git a/plugins/inputs/smartctl/smartctl_test.go b/plugins/inputs/smartctl/smartctl_test.go new file mode 100644 index 0000000000000..c4a48edd110f2 --- /dev/null +++ b/plugins/inputs/smartctl/smartctl_test.go @@ -0,0 +1,188 @@ +package smartctl + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "slices" + "strconv" + "strings" + "testing" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/config" + "github.com/influxdata/telegraf/plugins/inputs" + "github.com/influxdata/telegraf/plugins/parsers/influx" + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/require" +) + +func TestCasesScan(t *testing.T) { + // Get all directories in testdata + folders, err := os.ReadDir("testcases_scan") + require.NoError(t, err) + + // Register the plugin + inputs.Add("smartctl", func() telegraf.Input { + return &Smartctl{} + }) + + for _, f := range folders { + if !f.IsDir() { + continue + } + testcasePath := filepath.Join("testcases_scan", f.Name()) 
+ configFilename := filepath.Join(testcasePath, "telegraf.toml") + expectedFilename := filepath.Join(testcasePath, "expected.out") + + t.Run(f.Name(), func(t *testing.T) { + parser := &influx.Parser{} + require.NoError(t, parser.Init()) + + // Read the expected output if any + var expected int + if _, err := os.Stat(expectedFilename); err == nil { + var err error + expectedBytes, err := os.ReadFile(expectedFilename) + require.NoError(t, err) + expected, err = strconv.Atoi(strings.TrimSpace(string(expectedBytes))) + require.NoError(t, err) + } + + // Update exec to return fake data. + execCommand = fakeScanExecCommand + defer func() { execCommand = exec.Command }() + + // Configure the plugin + cfg := config.NewConfig() + require.NoError(t, cfg.LoadConfig(configFilename)) + require.Len(t, cfg.Inputs, 1) + plugin := cfg.Inputs[0].Input.(*Smartctl) + require.NoError(t, plugin.Init()) + + devices, err := plugin.scan() + require.NoError(t, err) + require.Len(t, devices, expected) + }) + } +} + +func fakeScanExecCommand(command string, args ...string) *exec.Cmd { + cs := []string{"-test.run=TestScanHelperProcess", "--", command} + cs = append(cs, args...) + cmd := exec.Command(os.Args[0], cs...) 
+ cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"} + return cmd +} + +func TestScanHelperProcess(t *testing.T) { + if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { + return + } + + scanBytes, err := os.ReadFile("testcases_scan/scan.json") + require.NoError(t, err) + fmt.Fprint(os.Stdout, string(scanBytes)) + + //nolint:revive // os.Exit called intentionally + os.Exit(0) +} + +func TestCasesDevices(t *testing.T) { + // Get all directories in testdata + folders, err := os.ReadDir("testcases_device") + require.NoError(t, err) + + // Register the plugin + inputs.Add("smartctl", func() telegraf.Input { + return &Smartctl{} + }) + + for _, f := range folders { + if !f.IsDir() { + continue + } + testcasePath := filepath.Join("testcases_device", f.Name()) + deviceFilename := filepath.Join(testcasePath, "device") + deviceTypeFilename := filepath.Join(testcasePath, "deviceType") + expectedFilename := filepath.Join(testcasePath, "expected.out") + + t.Run(f.Name(), func(t *testing.T) { + parser := &influx.Parser{} + require.NoError(t, parser.Init()) + + // Read the expected output if any + var expected []telegraf.Metric + if _, err := os.Stat(expectedFilename); err == nil { + var err error + expected, err = testutil.ParseMetricsFromFile(expectedFilename, parser) + require.NoError(t, err) + } + + // Read the devices to scan + deviceBytes, err := os.ReadFile(deviceFilename) + require.NoError(t, err) + deviceTypeBytes, err := os.ReadFile(deviceTypeFilename) + require.NoError(t, err) + + // Update exec to return fake data. + execCommand = fakeDeviceExecCommand + defer func() { execCommand = exec.Command }() + + // Configure the plugin + plugin := Smartctl{} + require.NoError(t, plugin.Init()) + + var acc testutil.Accumulator + require.NoError(t, + plugin.scanDevice( + &acc, + strings.TrimSpace(string(deviceBytes)), + strings.TrimSpace(string(deviceTypeBytes)), + ), + ) + + // Check the metric nevertheless as we might get some metrics despite errors. 
+ actual := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, expected, actual, testutil.IgnoreTime()) + acc.Lock() + defer acc.Unlock() + require.Empty(t, acc.Errors) + }) + } +} + +func fakeDeviceExecCommand(command string, args ...string) *exec.Cmd { + cs := []string{"-test.run=TestDeviceHelperProcess", "--", command} + cs = append(cs, args...) + cmd := exec.Command(os.Args[0], cs...) + cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"} + return cmd +} + +func TestDeviceHelperProcess(t *testing.T) { + if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { + return + } + args := os.Args + + var filename string + if slices.Contains(args, "/dev/nvme0") { + filename = "testcases_device/nvme/response.json" + } else if slices.Contains(args, "/dev/sda") { + filename = "testcases_device/usb/response.json" + } else { + panic("unknown device") + } + + if filename == "" { + fmt.Fprint(os.Stdout, "unknown filename") + os.Exit(1) //nolint:revive // os.Exit called intentionally + } + + scanBytes, err := os.ReadFile(filename) + require.NoError(t, err) + fmt.Fprint(os.Stdout, string(scanBytes)) + os.Exit(0) //nolint:revive // os.Exit called intentionally +} diff --git a/plugins/inputs/smartctl/testcases_device/nvme/device b/plugins/inputs/smartctl/testcases_device/nvme/device new file mode 100644 index 0000000000000..a6a0c2fbc79b5 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/nvme/device @@ -0,0 +1 @@ +/dev/nvme0 diff --git a/plugins/inputs/smartctl/testcases_device/nvme/deviceType b/plugins/inputs/smartctl/testcases_device/nvme/deviceType new file mode 100644 index 0000000000000..9158fff478b2a --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/nvme/deviceType @@ -0,0 +1 @@ +nvme diff --git a/plugins/inputs/smartctl/testcases_device/nvme/expected.out b/plugins/inputs/smartctl/testcases_device/nvme/expected.out new file mode 100644 index 0000000000000..f8316ab3708af --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/nvme/expected.out @@ -0,0 +1 
@@ +smartctl,model=Sabrent\ Rocket\ 4.0\ 1TB,name=/dev/nvme0,serial=6D1107091C9583054511,type=nvme available_spare=100i,available_spare_threshold=5i,capacity=1000204886016i,controller_busy_time=1635i,critical_comp_time=0i,critical_warning=0i,data_units_read=28337502i,data_units_written=76471882i,firmware="RKT401.3",health_ok=true,host_reads=294243226i,host_writes=733021025i,media_errors=0i,num_err_log_entries=4871i,percentage_used=4i,power_cycles=1815i,power_on_hours=8733i,temperature=48i,unsafe_shutdowns=39i,warning_temp_time=0i 1711480345635747372 diff --git a/plugins/inputs/smartctl/testcases_device/nvme/response.json b/plugins/inputs/smartctl/testcases_device/nvme/response.json new file mode 100644 index 0000000000000..814527a750836 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/nvme/response.json @@ -0,0 +1,145 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 4 + ], + "pre_release": false, + "svn_revision": "5530", + "platform_info": "x86_64-linux-6.8.1-arch1-1", + "build_info": "(local build)", + "argv": [ + "smartctl", + "-a", + "-d", + "nvme", + "/dev/nvme0", + "--json" + ], + "exit_status": 0 + }, + "local_time": { + "time_t": 1711371013, + "asctime": "Mon Mar 25 06:50:13 2024 MDT" + }, + "device": { + "name": "/dev/nvme0", + "info_name": "/dev/nvme0", + "type": "nvme", + "protocol": "NVMe" + }, + "model_name": "Sabrent Rocket 4.0 1TB", + "serial_number": "6D1107091C9583054511", + "firmware_version": "RKT401.3", + "nvme_pci_vendor": { + "id": 6535, + "subsystem_id": 6535 + }, + "nvme_ieee_oui_identifier": 6584743, + "nvme_total_capacity": 1000204886016, + "nvme_unallocated_capacity": 0, + "nvme_controller_id": 1, + "nvme_version": { + "string": "1.3", + "value": 66304 + }, + "nvme_number_of_namespaces": 1, + "nvme_namespaces": [ + { + "id": 1, + "size": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "capacity": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "utilization": { + "blocks": 
1953525168, + "bytes": 1000204886016 + }, + "formatted_lba_size": 512, + "eui64": { + "oui": 6584743, + "ext_id": 268705991866 + } + } + ], + "user_capacity": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "logical_block_size": 512, + "smart_support": { + "available": true, + "enabled": true + }, + "smart_status": { + "passed": true, + "nvme": { + "value": 0 + } + }, + "nvme_smart_health_information_log": { + "critical_warning": 0, + "temperature": 48, + "available_spare": 100, + "available_spare_threshold": 5, + "percentage_used": 4, + "data_units_read": 28337502, + "data_units_written": 76471882, + "host_reads": 294243226, + "host_writes": 733021025, + "controller_busy_time": 1635, + "power_cycles": 1815, + "power_on_hours": 8733, + "unsafe_shutdowns": 39, + "media_errors": 0, + "num_err_log_entries": 4871, + "warning_temp_time": 0, + "critical_comp_time": 0 + }, + "temperature": { + "current": 48 + }, + "power_cycle_count": 1815, + "power_on_time": { + "hours": 8733 + }, + "nvme_error_information_log": { + "size": 63, + "read": 16, + "unread": 0, + "table": [ + { + "error_count": 4871, + "submission_queue_id": 0, + "command_id": 20495, + "status_field": { + "value": 8194, + "do_not_retry": false, + "status_code_type": 0, + "status_code": 2, + "string": "Invalid Field in Command" + }, + "phase_tag": false, + "parm_error_location": 40, + "lba": { + "value": 0 + }, + "nsid": 0 + } + ] + }, + "nvme_self_test_log": { + "current_self_test_operation": { + "value": 0, + "string": "No self-test in progress" + } + } +} diff --git a/plugins/inputs/smartctl/testcases_device/usb/device b/plugins/inputs/smartctl/testcases_device/usb/device new file mode 100644 index 0000000000000..c7b6a1bc03332 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/usb/device @@ -0,0 +1 @@ +/dev/sda diff --git a/plugins/inputs/smartctl/testcases_device/usb/deviceType b/plugins/inputs/smartctl/testcases_device/usb/deviceType new file mode 100644 index 
0000000000000..6b8a2c3d2dbc9 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/usb/deviceType @@ -0,0 +1 @@ +sat diff --git a/plugins/inputs/smartctl/testcases_device/usb/expected.out b/plugins/inputs/smartctl/testcases_device/usb/expected.out new file mode 100644 index 0000000000000..3eadb53dde1d2 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/usb/expected.out @@ -0,0 +1,14 @@ +smartctl,model=SanDisk\ pSSD,name=/dev/sda,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c capacity=15693664256i,firmware="3",health_ok=true,temperature=0i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Reallocated_Sector_Ct,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=0i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Power_On_Hours,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=11i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Power_Cycle_Count,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=223i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Program_Fail_Count,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=0i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Erase_Fail_Count,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=0i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Avg_Write/Erase_Count,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=3i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Unexpect_Power_Loss_Ct,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=114i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Reported_Uncorrect,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c 
raw_value=0i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Perc_Write/Erase_Count,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=10i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Perc_Avail_Resrvd_Space,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=0i,threshold=5i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Perc_Write/Erase_Ct_BC,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=0i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Total_LBAs_Written,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=10171055i,threshold=0i,value=100i,worst=100i 1711480345675066854 +smartctl_attributes,model=SanDisk\ pSSD,name=Total_LBAs_Read,serial=06c9f4c44,type=sat,wwn=5001b4409f6c444c raw_value=94845144i,threshold=0i,value=100i,worst=100i 1711480345675066854 diff --git a/plugins/inputs/smartctl/testcases_device/usb/response.json b/plugins/inputs/smartctl/testcases_device/usb/response.json new file mode 100644 index 0000000000000..dedeef0460f96 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/usb/response.json @@ -0,0 +1,486 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 4 + ], + "pre_release": false, + "svn_revision": "5530", + "platform_info": "x86_64-linux-6.8.1-arch1-1", + "build_info": "(local build)", + "argv": [ + "smartctl", + "-a", + "-d", + "sat", + "/dev/sda", + "--json" + ], + "drive_database_version": { + "string": "7.3/5528" + }, + "exit_status": 0 + }, + "local_time": { + "time_t": 1711370961, + "asctime": "Mon Mar 25 06:49:21 2024 MDT" + }, + "device": { + "name": "/dev/sda", + "info_name": "/dev/sda [SAT]", + "type": "sat", + "protocol": "ATA" + }, + "model_family": "SanDisk based SSDs", + "model_name": "SanDisk pSSD", + "serial_number": "06c9f4c44", + "wwn": { + "naa": 5, + 
"oui": 6980, + "id": 2674672716 + }, + "firmware_version": "3", + "user_capacity": { + "blocks": 30651688, + "bytes": 15693664256 + }, + "logical_block_size": 512, + "physical_block_size": 512, + "rotation_rate": 0, + "form_factor": { + "ata_value": 4, + "name": "1.8 inches" + }, + "trim": { + "supported": true, + "deterministic": true, + "zeroed": true + }, + "in_smartctl_database": true, + "ata_version": { + "string": "ATA8-ACS T13/1699-D revision 2d", + "major_value": 496, + "minor_value": 263 + }, + "sata_version": { + "string": "SATA 2.6", + "value": 17 + }, + "interface_speed": { + "max": { + "sata_value": 14, + "string": "6.0 Gb/s", + "units_per_second": 60, + "bits_per_unit": 100000000 + }, + "current": { + "sata_value": 2, + "string": "3.0 Gb/s", + "units_per_second": 30, + "bits_per_unit": 100000000 + } + }, + "smart_support": { + "available": true, + "enabled": true + }, + "smart_status": { + "passed": true + }, + "ata_smart_data": { + "offline_data_collection": { + "status": { + "value": 0, + "string": "was never started" + }, + "completion_seconds": 120 + }, + "self_test": { + "status": { + "value": 0, + "string": "completed without error", + "passed": true + }, + "polling_minutes": { + "short": 2, + "extended": 3 + } + }, + "capabilities": { + "values": [ + 81, + 3 + ], + "exec_offline_immediate_supported": true, + "offline_is_aborted_upon_new_cmd": false, + "offline_surface_scan_supported": false, + "self_tests_supported": true, + "conveyance_self_test_supported": false, + "selective_self_test_supported": true, + "attribute_autosave_enabled": true, + "error_logging_supported": true, + "gp_logging_supported": true + } + }, + "ata_smart_attributes": { + "revision": 1, + "table": [ + { + "id": 5, + "name": "Reallocated_Sector_Ct", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + 
"event_count": false, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 9, + "name": "Power_On_Hours", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 11, + "string": "11" + } + }, + { + "id": 12, + "name": "Power_Cycle_Count", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 223, + "string": "223" + } + }, + { + "id": 171, + "name": "Program_Fail_Count", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 172, + "name": "Erase_Fail_Count", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 173, + "name": "Avg_Write/Erase_Count", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 3, + "string": "3" + } + }, + { + "id": 174, + "name": "Unexpect_Power_Loss_Ct", + "value": 100, + "worst": 100, + 
"thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 114, + "string": "114" + } + }, + { + "id": 187, + "name": "Reported_Uncorrect", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 230, + "name": "Perc_Write/Erase_Count", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 10, + "string": "10" + } + }, + { + "id": 232, + "name": "Perc_Avail_Resrvd_Space", + "value": 100, + "worst": 100, + "thresh": 5, + "when_failed": "", + "flags": { + "value": 3, + "string": "PO---- ", + "prefailure": true, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 234, + "name": "Perc_Write/Erase_Ct_BC", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 241, + "name": "Total_LBAs_Written", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + 
"error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 10171055, + "string": "10171055" + } + }, + { + "id": 242, + "name": "Total_LBAs_Read", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 2, + "string": "-O---- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 94845144, + "string": "94845144" + } + } + ] + }, + "power_on_time": { + "hours": 11 + }, + "power_cycle_count": 223, + "ata_smart_error_log": { + "summary": { + "revision": 1, + "count": 0 + } + }, + "ata_smart_self_test_log": { + "standard": { + "revision": 1, + "count": 0 + } + }, + "ata_smart_selective_self_test_log": { + "revision": 1, + "table": [ + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + } + ], + "flags": { + "value": 0, + "remainder_scan_enabled": false + }, + "power_up_scan_resume_minutes": 0 + } +} diff --git a/plugins/inputs/smartctl/testcases_scan/all/expected.out b/plugins/inputs/smartctl/testcases_scan/all/expected.out new file mode 100644 index 0000000000000..00750edc07d64 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/all/expected.out @@ -0,0 +1 @@ +3 diff --git a/plugins/inputs/smartctl/testcases_scan/all/telegraf.toml b/plugins/inputs/smartctl/testcases_scan/all/telegraf.toml new file mode 100644 index 0000000000000..6cd853f61d473 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/all/telegraf.toml @@ -0,0 +1 @@ 
+[[inputs.smartctl]] diff --git a/plugins/inputs/smartctl/testcases_scan/exclude/expected.out b/plugins/inputs/smartctl/testcases_scan/exclude/expected.out new file mode 100644 index 0000000000000..d00491fd7e5bb --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/exclude/expected.out @@ -0,0 +1 @@ +1 diff --git a/plugins/inputs/smartctl/testcases_scan/exclude/telegraf.toml b/plugins/inputs/smartctl/testcases_scan/exclude/telegraf.toml new file mode 100644 index 0000000000000..0a3f7b2987527 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/exclude/telegraf.toml @@ -0,0 +1,2 @@ +[[inputs.smartctl]] + devices_exclude = ["/dev/nvme0", "/dev/nvme1"] diff --git a/plugins/inputs/smartctl/testcases_scan/include/expected.out b/plugins/inputs/smartctl/testcases_scan/include/expected.out new file mode 100644 index 0000000000000..d00491fd7e5bb --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/include/expected.out @@ -0,0 +1 @@ +1 diff --git a/plugins/inputs/smartctl/testcases_scan/include/telegraf.toml b/plugins/inputs/smartctl/testcases_scan/include/telegraf.toml new file mode 100644 index 0000000000000..82d4e111476df --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/include/telegraf.toml @@ -0,0 +1,2 @@ +[[inputs.smartctl]] + devices_include = ["/dev/sda"] diff --git a/plugins/inputs/smartctl/testcases_scan/scan.json b/plugins/inputs/smartctl/testcases_scan/scan.json new file mode 100644 index 0000000000000..8cb1cf4298893 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/scan.json @@ -0,0 +1,42 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 4 + ], + "pre_release": false, + "svn_revision": "5530", + "platform_info": "x86_64-linux-6.8.1-arch1-1", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--scan", + "--json" + ], + "exit_status": 0 + }, + "devices": [ + { + "name": "/dev/sda", + "info_name": "/dev/sda [SAT]", + "type": "sat", + "protocol": "ATA" + }, + { + "name": "/dev/nvme0", 
+ "info_name": "/dev/nvme0", + "type": "nvme", + "protocol": "NVMe" + }, + { + "name": "/dev/nvme1", + "info_name": "/dev/nvme1", + "type": "nvme", + "protocol": "NVMe" + } + ] +} From b0dd372b98c590b3fc7c66cf48ffec2916ccabfa Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Fri, 29 Mar 2024 09:46:51 -0600 Subject: [PATCH 2/8] Add megaraid test case, and continue to parse if only warning --- plugins/inputs/smartctl/smartctl_device.go | 23 +- plugins/inputs/smartctl/smartctl_json.go | 81 +- plugins/inputs/smartctl/smartctl_test.go | 8 +- .../smartctl/testcases_device/megaraid/device | 1 + .../testcases_device/megaraid/deviceType | 1 + .../testcases_device/megaraid/expected.out | 25 + .../testcases_device/megaraid/response.json | 733 ++++++++++++++++++ 7 files changed, 776 insertions(+), 96 deletions(-) create mode 100644 plugins/inputs/smartctl/testcases_device/megaraid/device create mode 100644 plugins/inputs/smartctl/testcases_device/megaraid/deviceType create mode 100644 plugins/inputs/smartctl/testcases_device/megaraid/expected.out create mode 100644 plugins/inputs/smartctl/testcases_device/megaraid/response.json diff --git a/plugins/inputs/smartctl/smartctl_device.go b/plugins/inputs/smartctl/smartctl_device.go index c49cb36d52287..9bb6b2023c469 100644 --- a/plugins/inputs/smartctl/smartctl_device.go +++ b/plugins/inputs/smartctl/smartctl_device.go @@ -18,22 +18,23 @@ func (s *Smartctl) scanDevice(acc telegraf.Accumulator, deviceName string, devic var device smartctlDeviceJSON out, err := internal.CombinedOutputTimeout(cmd, time.Duration(s.Timeout)) - if err != nil { - // Try to still unmarshal the output to see if a specific message can - // be extracted from the output. 
- if err := json.Unmarshal(out, &device); err == nil { - if len(device.Smartctl.Messages) > 0 && device.Smartctl.Messages[0].String != "" { - return fmt.Errorf("error running smartctl with %s: %s", args, device.Smartctl.Messages[0].String) - } - } + + // Error running the command and unable to parse the JSON, then bail + if jsonErr := json.Unmarshal(out, &device); jsonErr != nil { return fmt.Errorf("error running smartctl with %s: %w", args, err) } - t := time.Now() - if err := json.Unmarshal(out, &device); err != nil { - return fmt.Errorf("error unmarshalling smartctl output: %w", err) + // If we were able to parse the result, then only exit if we get an error + // as sometimes we can get warnings, that still produce data. + if err != nil && + len(device.Smartctl.Messages) > 0 && + device.Smartctl.Messages[0].Severity == "error" && + device.Smartctl.Messages[0].String != "" { + return fmt.Errorf("error running smartctl with %s got smartctl error message: %s", args, device.Smartctl.Messages[0].String) } + t := time.Now() + tags := map[string]string{ "name": device.Device.Name, "type": device.Device.Type, diff --git a/plugins/inputs/smartctl/smartctl_json.go b/plugins/inputs/smartctl/smartctl_json.go index 6e53db00b9b40..760cecd633c9f 100644 --- a/plugins/inputs/smartctl/smartctl_json.go +++ b/plugins/inputs/smartctl/smartctl_json.go @@ -15,10 +15,6 @@ type smartctlDeviceJSON struct { } `json:"messages"` ExitStatus int `json:"exit_status"` } `json:"smartctl"` - LocalTime struct { - TimeT int `json:"time_t"` - Asctime string `json:"asctime"` - } `json:"local_time"` Device struct { Name string `json:"name"` InfoName string `json:"info_name"` @@ -34,53 +30,11 @@ type smartctlDeviceJSON struct { Oui int `json:"oui"` ID int64 `json:"id"` } `json:"wwn"` - NvmePciVendor struct { - ID int `json:"id"` - SubsystemID int `json:"subsystem_id"` - } `json:"nvme_pci_vendor"` - NvmeIeeeOuiIdentifier int `json:"nvme_ieee_oui_identifier"` - NvmeTotalCapacity int64 
`json:"nvme_total_capacity"` - NvmeUnallocatedCapacity int `json:"nvme_unallocated_capacity"` - NvmeControllerID int `json:"nvme_controller_id"` - NvmeVersion struct { - String string `json:"string"` - Value int `json:"value"` - } `json:"nvme_version"` - NvmeNumberOfNamespaces int `json:"nvme_number_of_namespaces"` - NvmeNamespaces []struct { - ID int `json:"id"` - Size struct { - Blocks int `json:"blocks"` - Bytes int64 `json:"bytes"` - } `json:"size"` - Capacity struct { - Blocks int `json:"blocks"` - Bytes int64 `json:"bytes"` - } `json:"capacity"` - Utilization struct { - Blocks int `json:"blocks"` - Bytes int64 `json:"bytes"` - } `json:"utilization"` - FormattedLbaSize int `json:"formatted_lba_size"` - Eui64 struct { - Oui int `json:"oui"` - ExtID int64 `json:"ext_id"` - } `json:"eui64"` - } `json:"nvme_namespaces"` UserCapacity struct { - Blocks int `json:"blocks"` - Bytes int64 `json:"bytes"` + Bytes int64 `json:"bytes"` } `json:"user_capacity"` - LogicalBlockSize int `json:"logical_block_size"` - SmartSupport struct { - Available bool `json:"available"` - Enabled bool `json:"enabled"` - } `json:"smart_support"` SmartStatus struct { Passed bool `json:"passed"` - Nvme struct { - Value int `json:"value"` - } `json:"nvme"` } `json:"smart_status"` NvmeSmartHealthInformationLog struct { CriticalWarning int `json:"critical_warning"` @@ -104,39 +58,6 @@ type smartctlDeviceJSON struct { Temperature struct { Current int `json:"current"` } `json:"temperature"` - PowerCycleCount int `json:"power_cycle_count"` - PowerOnTime struct { - Hours int `json:"hours"` - } `json:"power_on_time"` - NvmeErrorInformationLog struct { - Size int `json:"size"` - Read int `json:"read"` - Unread int `json:"unread"` - Table []struct { - ErrorCount int `json:"error_count"` - SubmissionQueueID int `json:"submission_queue_id"` - CommandID int `json:"command_id"` - StatusField struct { - Value int `json:"value"` - DoNotRetry bool `json:"do_not_retry"` - StatusCodeType int 
`json:"status_code_type"` - StatusCode int `json:"status_code"` - String string `json:"string"` - } `json:"status_field"` - PhaseTag bool `json:"phase_tag"` - ParmErrorLocation int `json:"parm_error_location"` - Lba struct { - Value int `json:"value"` - } `json:"lba"` - Nsid int `json:"nsid"` - } `json:"table"` - } `json:"nvme_error_information_log"` - NvmeSelfTestLog struct { - CurrentSelfTestOperation struct { - Value int `json:"value"` - String string `json:"string"` - } `json:"current_self_test_operation"` - } `json:"nvme_self_test_log"` AtaSmartAttributes struct { Revision int `json:"revision"` Table []struct { diff --git a/plugins/inputs/smartctl/smartctl_test.go b/plugins/inputs/smartctl/smartctl_test.go index c4a48edd110f2..5de7d7e51352d 100644 --- a/plugins/inputs/smartctl/smartctl_test.go +++ b/plugins/inputs/smartctl/smartctl_test.go @@ -172,13 +172,11 @@ func TestDeviceHelperProcess(t *testing.T) { filename = "testcases_device/nvme/response.json" } else if slices.Contains(args, "/dev/sda") { filename = "testcases_device/usb/response.json" + } else if slices.Contains(args, "/dev/bus/6") { + filename = "testcases_device/megaraid/response.json" } else { - panic("unknown device") - } - - if filename == "" { fmt.Fprint(os.Stdout, "unknown filename") - os.Exit(1) //nolint:revive // os.Exit called intentionally + os.Exit(42) //nolint:revive // os.Exit called intentionally } scanBytes, err := os.ReadFile(filename) diff --git a/plugins/inputs/smartctl/testcases_device/megaraid/device b/plugins/inputs/smartctl/testcases_device/megaraid/device new file mode 100644 index 0000000000000..474992ed4deb7 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/megaraid/device @@ -0,0 +1 @@ +/dev/bus/6 diff --git a/plugins/inputs/smartctl/testcases_device/megaraid/deviceType b/plugins/inputs/smartctl/testcases_device/megaraid/deviceType new file mode 100644 index 0000000000000..62c2cd720a5ca --- /dev/null +++ 
b/plugins/inputs/smartctl/testcases_device/megaraid/deviceType @@ -0,0 +1 @@ +megaraid,14 diff --git a/plugins/inputs/smartctl/testcases_device/megaraid/expected.out b/plugins/inputs/smartctl/testcases_device/megaraid/expected.out new file mode 100644 index 0000000000000..4d242d4747ef1 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/megaraid/expected.out @@ -0,0 +1,25 @@ +smartctl,model=ST6000NM0115-1YZ110,name=/dev/bus/6,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d capacity=6001175126016i,firmware="SN04",health_ok=true,temperature=25i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Raw_Read_Error_Rate,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=181426040i,threshold=44i,value=83i,worst=64i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Spin_Up_Time,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=0i,value=91i,worst=91i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Start_Stop_Count,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=62i,threshold=20i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Reallocated_Sector_Ct,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=10i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Seek_Error_Rate,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=2827960730i,threshold=45i,value=95i,worst=60i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Power_On_Hours,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=44316i,threshold=0i,value=50i,worst=50i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Spin_Retry_Count,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=97i,value=100i,worst=100i 1711726425026052398 
+smartctl_attributes,model=ST6000NM0115-1YZ110,name=Power_Cycle_Count,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=62i,threshold=20i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=End-to-End_Error,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=99i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Reported_Uncorrect,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=0i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Command_Timeout,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=0i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=High_Fly_Writes,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=0i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Airflow_Temperature_Cel,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=504627225i,threshold=40i,value=75i,worst=64i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=G-Sense_Error_Rate,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=3295i,threshold=0i,value=99i,worst=99i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Power-Off_Retract_Count,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=1865i,threshold=0i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Load_Cycle_Count,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=1894i,threshold=0i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Temperature_Celsius,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=30064771097i,threshold=0i,value=25i,worst=40i 
1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Hardware_ECC_Recovered,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=181426040i,threshold=0i,value=83i,worst=64i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Current_Pending_Sector,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=0i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Offline_Uncorrectable,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=0i,value=100i,worst=100i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=UDMA_CRC_Error_Count,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=0i,threshold=0i,value=200i,worst=200i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Head_Flying_Hours,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=1265348905053451i,threshold=0i,value=100i,worst=253i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Total_LBAs_Written,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=28111553262i,threshold=0i,value=100i,worst=253i 1711726425026052398 +smartctl_attributes,model=ST6000NM0115-1YZ110,name=Total_LBAs_Read,serial=ZAD2C11G,type=sat+megaraid\,14,wwn=5000c500a496983d raw_value=3197497186480i,threshold=0i,value=100i,worst=253i 1711726425026052398 diff --git a/plugins/inputs/smartctl/testcases_device/megaraid/response.json b/plugins/inputs/smartctl/testcases_device/megaraid/response.json new file mode 100644 index 0000000000000..9c92c077bc968 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/megaraid/response.json @@ -0,0 +1,733 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 1 + ], + "svn_revision": "5022", + "platform_info": "x86_64-linux-5.4.0-172-generic", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--json", + 
"--all", + "/dev/bus/6", + "--device", + "megaraid,14", + "--nocheck=standby" + ], + "messages": [ + { + "string": "Warning: This result is based on an Attribute check.", + "severity": "warning" + } + ], + "exit_status": 4 + }, + "device": { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_14] [SAT]", + "type": "sat+megaraid,14", + "protocol": "ATA" + }, + "model_family": "Seagate Enterprise Capacity 3.5 HDD", + "model_name": "ST6000NM0115-1YZ110", + "serial_number": "ZAD2C11G", + "wwn": { + "naa": 5, + "oui": 3152, + "id": 2761332797 + }, + "firmware_version": "SN04", + "user_capacity": { + "blocks": 11721045168, + "bytes": 6001175126016 + }, + "logical_block_size": 512, + "physical_block_size": 4096, + "rotation_rate": 7200, + "form_factor": { + "ata_value": 2, + "name": "3.5 inches" + }, + "in_smartctl_database": true, + "ata_version": { + "string": "ACS-3 T13/2161-D revision 5", + "major_value": 2032, + "minor_value": 109 + }, + "sata_version": { + "string": "SATA 3.1", + "value": 127 + }, + "interface_speed": { + "max": { + "sata_value": 14, + "string": "6.0 Gb/s", + "units_per_second": 60, + "bits_per_unit": 100000000 + }, + "current": { + "sata_value": 3, + "string": "6.0 Gb/s", + "units_per_second": 60, + "bits_per_unit": 100000000 + } + }, + "local_time": { + "time_t": 1711639509, + "asctime": "Thu Mar 28 16:25:09 2024 CET" + }, + "smart_status": { + "passed": true + }, + "ata_smart_data": { + "offline_data_collection": { + "status": { + "value": 130, + "string": "was completed without error", + "passed": true + }, + "completion_seconds": 567 + }, + "self_test": { + "status": { + "value": 0, + "string": "completed without error", + "passed": true + }, + "polling_minutes": { + "short": 1, + "extended": 584, + "conveyance": 2 + } + }, + "capabilities": { + "values": [ + 123, + 3 + ], + "exec_offline_immediate_supported": true, + "offline_is_aborted_upon_new_cmd": false, + "offline_surface_scan_supported": true, + "self_tests_supported": 
true, + "conveyance_self_test_supported": true, + "selective_self_test_supported": true, + "attribute_autosave_enabled": true, + "error_logging_supported": true, + "gp_logging_supported": true + } + }, + "ata_sct_capabilities": { + "value": 28861, + "error_recovery_control_supported": true, + "feature_control_supported": true, + "data_table_supported": true + }, + "ata_smart_attributes": { + "revision": 10, + "table": [ + { + "id": 1, + "name": "Raw_Read_Error_Rate", + "value": 83, + "worst": 64, + "thresh": 44, + "when_failed": "", + "flags": { + "value": 15, + "string": "POSR-- ", + "prefailure": true, + "updated_online": true, + "performance": true, + "error_rate": true, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 181426040, + "string": "181426040" + } + }, + { + "id": 3, + "name": "Spin_Up_Time", + "value": 91, + "worst": 91, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 3, + "string": "PO---- ", + "prefailure": true, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 4, + "name": "Start_Stop_Count", + "value": 100, + "worst": 100, + "thresh": 20, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 62, + "string": "62" + } + }, + { + "id": 5, + "name": "Reallocated_Sector_Ct", + "value": 100, + "worst": 100, + "thresh": 10, + "when_failed": "", + "flags": { + "value": 51, + "string": "PO--CK ", + "prefailure": true, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 7, + "name": "Seek_Error_Rate", + "value": 95, + "worst": 60, + "thresh": 45, + "when_failed": "", + "flags": { 
+ "value": 15, + "string": "POSR-- ", + "prefailure": true, + "updated_online": true, + "performance": true, + "error_rate": true, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 2827960730, + "string": "2827960730" + } + }, + { + "id": 9, + "name": "Power_On_Hours", + "value": 50, + "worst": 50, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 44316, + "string": "44316" + } + }, + { + "id": 10, + "name": "Spin_Retry_Count", + "value": 100, + "worst": 100, + "thresh": 97, + "when_failed": "", + "flags": { + "value": 19, + "string": "PO--C- ", + "prefailure": true, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 12, + "name": "Power_Cycle_Count", + "value": 100, + "worst": 100, + "thresh": 20, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 62, + "string": "62" + } + }, + { + "id": 184, + "name": "End-to-End_Error", + "value": 100, + "worst": 100, + "thresh": 99, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 187, + "name": "Reported_Uncorrect", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": 
true + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 188, + "name": "Command_Timeout", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 0, + "string": "0 0 0" + } + }, + { + "id": 189, + "name": "High_Fly_Writes", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 58, + "string": "-O-RCK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": true, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 190, + "name": "Airflow_Temperature_Cel", + "value": 75, + "worst": 64, + "thresh": 40, + "when_failed": "", + "flags": { + "value": 34, + "string": "-O---K ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": true + }, + "raw": { + "value": 504627225, + "string": "25 (Min/Max 20/30)" + } + }, + { + "id": 191, + "name": "G-Sense_Error_Rate", + "value": 99, + "worst": 99, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 3295, + "string": "3295" + } + }, + { + "id": 192, + "name": "Power-Off_Retract_Count", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 1865, + "string": "1865" + } + }, + { + "id": 193, + "name": "Load_Cycle_Count", + "value": 100, + "worst": 100, + 
"thresh": 0, + "when_failed": "", + "flags": { + "value": 50, + "string": "-O--CK ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 1894, + "string": "1894" + } + }, + { + "id": 194, + "name": "Temperature_Celsius", + "value": 25, + "worst": 40, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 34, + "string": "-O---K ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": true + }, + "raw": { + "value": 30064771097, + "string": "25 (0 7 0 0 0)" + } + }, + { + "id": 195, + "name": "Hardware_ECC_Recovered", + "value": 83, + "worst": 64, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 26, + "string": "-O-RC- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": true, + "event_count": true, + "auto_keep": false + }, + "raw": { + "value": 181426040, + "string": "181426040" + } + }, + { + "id": 197, + "name": "Current_Pending_Sector", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 18, + "string": "-O--C- ", + "prefailure": false, + "updated_online": true, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 198, + "name": "Offline_Uncorrectable", + "value": 100, + "worst": 100, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 16, + "string": "----C- ", + "prefailure": false, + "updated_online": false, + "performance": false, + "error_rate": false, + "event_count": true, + "auto_keep": false + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 199, + "name": "UDMA_CRC_Error_Count", + "value": 200, + "worst": 200, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 62, + "string": "-OSRCK ", + "prefailure": false, + "updated_online": 
true, + "performance": true, + "error_rate": true, + "event_count": true, + "auto_keep": true + }, + "raw": { + "value": 0, + "string": "0" + } + }, + { + "id": 240, + "name": "Head_Flying_Hours", + "value": 100, + "worst": 253, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 0, + "string": "------ ", + "prefailure": false, + "updated_online": false, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 1265348905053451, + "string": "44299h+04m+54.612s" + } + }, + { + "id": 241, + "name": "Total_LBAs_Written", + "value": 100, + "worst": 253, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 0, + "string": "------ ", + "prefailure": false, + "updated_online": false, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 28111553262, + "string": "28111553262" + } + }, + { + "id": 242, + "name": "Total_LBAs_Read", + "value": 100, + "worst": 253, + "thresh": 0, + "when_failed": "", + "flags": { + "value": 0, + "string": "------ ", + "prefailure": false, + "updated_online": false, + "performance": false, + "error_rate": false, + "event_count": false, + "auto_keep": false + }, + "raw": { + "value": 3197497186480, + "string": "3197497186480" + } + } + ] + }, + "power_on_time": { + "hours": 44316 + }, + "power_cycle_count": 62, + "temperature": { + "current": 25 + }, + "ata_smart_error_log": { + "summary": { + "revision": 1, + "count": 0 + } + }, + "ata_smart_self_test_log": { + "standard": { + "revision": 1, + "count": 0 + } + }, + "ata_smart_selective_self_test_log": { + "revision": 1, + "table": [ + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 
0, + "status": { + "value": 0, + "string": "Not_testing" + } + }, + { + "lba_min": 0, + "lba_max": 0, + "status": { + "value": 0, + "string": "Not_testing" + } + } + ], + "flags": { + "value": 0, + "remainder_scan_enabled": false + }, + "power_up_scan_resume_minutes": 0 + } +} From e73eb02318e063c6e8286c11f7d48a246d1725dc Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Fri, 29 Mar 2024 10:23:09 -0600 Subject: [PATCH 3/8] int64, correct message --- plugins/inputs/smartctl/smartctl_device.go | 24 ++++++----- plugins/inputs/smartctl/smartctl_json.go | 46 +++++++++++----------- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/plugins/inputs/smartctl/smartctl_device.go b/plugins/inputs/smartctl/smartctl_device.go index 9bb6b2023c469..976561ef9b287 100644 --- a/plugins/inputs/smartctl/smartctl_device.go +++ b/plugins/inputs/smartctl/smartctl_device.go @@ -18,19 +18,23 @@ func (s *Smartctl) scanDevice(acc telegraf.Accumulator, deviceName string, devic var device smartctlDeviceJSON out, err := internal.CombinedOutputTimeout(cmd, time.Duration(s.Timeout)) + if err != nil { + // Error running the command and unable to parse the JSON, then bail + if jsonErr := json.Unmarshal(out, &device); jsonErr != nil { + return fmt.Errorf("error running smartctl with %s: %w", args, err) + } - // Error running the command and unable to parse the JSON, then bail - if jsonErr := json.Unmarshal(out, &device); jsonErr != nil { - return fmt.Errorf("error running smartctl with %s: %w", args, err) + // If we were able to parse the result, then only exit if we get an error + // as sometimes we can get warnings, that still produce data. 
+ if len(device.Smartctl.Messages) > 0 && + device.Smartctl.Messages[0].Severity == "error" && + device.Smartctl.Messages[0].String != "" { + return fmt.Errorf("error running smartctl with %s got smartctl error message: %s", args, device.Smartctl.Messages[0].String) + } } - // If we were able to parse the result, then only exit if we get an error - // as sometimes we can get warnings, that still produce data. - if err != nil && - len(device.Smartctl.Messages) > 0 && - device.Smartctl.Messages[0].Severity == "error" && - device.Smartctl.Messages[0].String != "" { - return fmt.Errorf("error running smartctl with %s got smartctl error message: %s", args, device.Smartctl.Messages[0].String) + if err := json.Unmarshal(out, &device); err != nil { + return fmt.Errorf("error unable to unmarshall response %s: %w", args, err) } t := time.Now() diff --git a/plugins/inputs/smartctl/smartctl_json.go b/plugins/inputs/smartctl/smartctl_json.go index 760cecd633c9f..1758c1d55c0f5 100644 --- a/plugins/inputs/smartctl/smartctl_json.go +++ b/plugins/inputs/smartctl/smartctl_json.go @@ -37,23 +37,23 @@ type smartctlDeviceJSON struct { Passed bool `json:"passed"` } `json:"smart_status"` NvmeSmartHealthInformationLog struct { - CriticalWarning int `json:"critical_warning"` - Temperature int `json:"temperature"` - AvailableSpare int `json:"available_spare"` - AvailableSpareThreshold int `json:"available_spare_threshold"` - PercentageUsed int `json:"percentage_used"` - DataUnitsRead int `json:"data_units_read"` - DataUnitsWritten int `json:"data_units_written"` - HostReads int `json:"host_reads"` - HostWrites int `json:"host_writes"` - ControllerBusyTime int `json:"controller_busy_time"` - PowerCycles int `json:"power_cycles"` - PowerOnHours int `json:"power_on_hours"` - UnsafeShutdowns int `json:"unsafe_shutdowns"` - MediaErrors int `json:"media_errors"` - NumErrLogEntries int `json:"num_err_log_entries"` - WarningTempTime int `json:"warning_temp_time"` - CriticalCompTime int 
`json:"critical_comp_time"` + CriticalWarning int64 `json:"critical_warning"` + Temperature int64 `json:"temperature"` + AvailableSpare int64 `json:"available_spare"` + AvailableSpareThreshold int64 `json:"available_spare_threshold"` + PercentageUsed int64 `json:"percentage_used"` + DataUnitsRead int64 `json:"data_units_read"` + DataUnitsWritten int64 `json:"data_units_written"` + HostReads int64 `json:"host_reads"` + HostWrites int64 `json:"host_writes"` + ControllerBusyTime int64 `json:"controller_busy_time"` + PowerCycles int64 `json:"power_cycles"` + PowerOnHours int64 `json:"power_on_hours"` + UnsafeShutdowns int64 `json:"unsafe_shutdowns"` + MediaErrors int64 `json:"media_errors"` + NumErrLogEntries int64 `json:"num_err_log_entries"` + WarningTempTime int64 `json:"warning_temp_time"` + CriticalCompTime int64 `json:"critical_comp_time"` } `json:"nvme_smart_health_information_log"` Temperature struct { Current int `json:"current"` @@ -61,14 +61,14 @@ type smartctlDeviceJSON struct { AtaSmartAttributes struct { Revision int `json:"revision"` Table []struct { - ID int `json:"id"` + ID int64 `json:"id"` Name string `json:"name"` - Value int `json:"value"` - Worst int `json:"worst"` - Thresh int `json:"thresh"` + Value int64 `json:"value"` + Worst int64 `json:"worst"` + Thresh int64 `json:"thresh"` WhenFailed string `json:"when_failed"` Flags struct { - Value int `json:"value"` + Value int64 `json:"value"` String string `json:"string"` Prefailure bool `json:"prefailure"` UpdatedOnline bool `json:"updated_online"` @@ -78,7 +78,7 @@ type smartctlDeviceJSON struct { AutoKeep bool `json:"auto_keep"` } `json:"flags"` Raw struct { - Value int `json:"value"` + Value int64 `json:"value"` String string `json:"string"` } `json:"raw"` } `json:"table"` From 3aa973e2ef9c773b5165cdc1514fbb8f14c97a02 Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Mon, 1 Apr 2024 09:32:18 -0600 Subject: [PATCH 4/8] scan: do not assume the same name --- 
plugins/inputs/smartctl/smartctl.go | 4 +- plugins/inputs/smartctl/smartctl_scan.go | 25 ++++-- plugins/inputs/smartctl/smartctl_test.go | 13 ++- .../{scan.json => all/response.json} | 0 .../testcases_scan/exclude/response.json | 42 ++++++++++ .../testcases_scan/include/response.json | 42 ++++++++++ .../testcases_scan/megaraid/expected.out | 1 + .../testcases_scan/megaraid/response.json | 83 +++++++++++++++++++ .../testcases_scan/megaraid/telegraf.toml | 1 + 9 files changed, 199 insertions(+), 12 deletions(-) rename plugins/inputs/smartctl/testcases_scan/{scan.json => all/response.json} (100%) create mode 100644 plugins/inputs/smartctl/testcases_scan/exclude/response.json create mode 100644 plugins/inputs/smartctl/testcases_scan/include/response.json create mode 100644 plugins/inputs/smartctl/testcases_scan/megaraid/expected.out create mode 100644 plugins/inputs/smartctl/testcases_scan/megaraid/response.json create mode 100644 plugins/inputs/smartctl/testcases_scan/megaraid/telegraf.toml diff --git a/plugins/inputs/smartctl/smartctl.go b/plugins/inputs/smartctl/smartctl.go index 60a4320309257..1456ee99c6f46 100644 --- a/plugins/inputs/smartctl/smartctl.go +++ b/plugins/inputs/smartctl/smartctl.go @@ -73,8 +73,8 @@ func (s *Smartctl) Gather(acc telegraf.Accumulator) error { return fmt.Errorf("Error scanning system: %w", err) } - for device, deviceType := range devices { - if err := s.scanDevice(acc, device, deviceType); err != nil { + for _, device := range devices { + if err := s.scanDevice(acc, device.Name, device.Type); err != nil { return fmt.Errorf("Error getting device %s: %w", device, err) } } diff --git a/plugins/inputs/smartctl/smartctl_scan.go b/plugins/inputs/smartctl/smartctl_scan.go index c96b07f6e4509..36784b61acff1 100644 --- a/plugins/inputs/smartctl/smartctl_scan.go +++ b/plugins/inputs/smartctl/smartctl_scan.go @@ -8,15 +8,22 @@ import ( "github.com/influxdata/telegraf/internal" ) -func (s *Smartctl) scan() (map[string]string, error) { - args := 
[]string{"--json", "--scan"} - cmd := execCommand(s.Path, args...) +// This is here so we can override it during testing +var scanArgs = []string{"--json", "--scan"} + +type scanDevice struct { + Name string + Type string +} + +func (s *Smartctl) scan() ([]scanDevice, error) { + cmd := execCommand(s.Path, scanArgs...) if s.UseSudo { - cmd = execCommand("sudo", append([]string{"-n", s.Path}, args...)...) + cmd = execCommand("sudo", append([]string{"-n", s.Path}, scanArgs...)...) } out, err := internal.CombinedOutputTimeout(cmd, time.Duration(s.Timeout)) if err != nil { - return nil, fmt.Errorf("error running smartctl with %s: %w", args, err) + return nil, fmt.Errorf("error running smartctl with %s: %w", scanArgs, err) } var scan smartctlScanJSON @@ -24,10 +31,14 @@ func (s *Smartctl) scan() (map[string]string, error) { return nil, fmt.Errorf("error unmarshalling smartctl scan output: %w", err) } - devices := make(map[string]string, len(scan.Devices)) + devices := make([]scanDevice, 0) for _, device := range scan.Devices { if s.deviceFilter.Match(device.Name) { - devices[device.Name] = device.Type + device := scanDevice{ + Name: device.Name, + Type: device.Type, + } + devices = append(devices, device) } } diff --git a/plugins/inputs/smartctl/smartctl_test.go b/plugins/inputs/smartctl/smartctl_test.go index 5de7d7e51352d..b15ecc0973bcc 100644 --- a/plugins/inputs/smartctl/smartctl_test.go +++ b/plugins/inputs/smartctl/smartctl_test.go @@ -34,6 +34,7 @@ func TestCasesScan(t *testing.T) { } testcasePath := filepath.Join("testcases_scan", f.Name()) configFilename := filepath.Join(testcasePath, "telegraf.toml") + scanFilename := filepath.Join(testcasePath, "response.json") expectedFilename := filepath.Join(testcasePath, "expected.out") t.Run(f.Name(), func(t *testing.T) { @@ -61,6 +62,7 @@ func TestCasesScan(t *testing.T) { plugin := cfg.Inputs[0].Input.(*Smartctl) require.NoError(t, plugin.Init()) + scanArgs = append(scanArgs, scanFilename) devices, err := plugin.scan() 
require.NoError(t, err) require.Len(t, devices, expected) @@ -80,11 +82,16 @@ func TestScanHelperProcess(t *testing.T) { if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { return } + args := os.Args - scanBytes, err := os.ReadFile("testcases_scan/scan.json") - require.NoError(t, err) - fmt.Fprint(os.Stdout, string(scanBytes)) + scanBytes, err := os.ReadFile(args[len(args)-1]) + if err != nil { + //nolint:revive // os.Exit called intentionally + fmt.Fprint(os.Stdout, "unknown filename") + os.Exit(42) + } + fmt.Fprint(os.Stdout, string(scanBytes)) //nolint:revive // os.Exit called intentionally os.Exit(0) } diff --git a/plugins/inputs/smartctl/testcases_scan/scan.json b/plugins/inputs/smartctl/testcases_scan/all/response.json similarity index 100% rename from plugins/inputs/smartctl/testcases_scan/scan.json rename to plugins/inputs/smartctl/testcases_scan/all/response.json diff --git a/plugins/inputs/smartctl/testcases_scan/exclude/response.json b/plugins/inputs/smartctl/testcases_scan/exclude/response.json new file mode 100644 index 0000000000000..8cb1cf4298893 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/exclude/response.json @@ -0,0 +1,42 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 4 + ], + "pre_release": false, + "svn_revision": "5530", + "platform_info": "x86_64-linux-6.8.1-arch1-1", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--scan", + "--json" + ], + "exit_status": 0 + }, + "devices": [ + { + "name": "/dev/sda", + "info_name": "/dev/sda [SAT]", + "type": "sat", + "protocol": "ATA" + }, + { + "name": "/dev/nvme0", + "info_name": "/dev/nvme0", + "type": "nvme", + "protocol": "NVMe" + }, + { + "name": "/dev/nvme1", + "info_name": "/dev/nvme1", + "type": "nvme", + "protocol": "NVMe" + } + ] +} diff --git a/plugins/inputs/smartctl/testcases_scan/include/response.json b/plugins/inputs/smartctl/testcases_scan/include/response.json new file mode 100644 index 0000000000000..8cb1cf4298893 --- 
/dev/null +++ b/plugins/inputs/smartctl/testcases_scan/include/response.json @@ -0,0 +1,42 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 4 + ], + "pre_release": false, + "svn_revision": "5530", + "platform_info": "x86_64-linux-6.8.1-arch1-1", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--scan", + "--json" + ], + "exit_status": 0 + }, + "devices": [ + { + "name": "/dev/sda", + "info_name": "/dev/sda [SAT]", + "type": "sat", + "protocol": "ATA" + }, + { + "name": "/dev/nvme0", + "info_name": "/dev/nvme0", + "type": "nvme", + "protocol": "NVMe" + }, + { + "name": "/dev/nvme1", + "info_name": "/dev/nvme1", + "type": "nvme", + "protocol": "NVMe" + } + ] +} diff --git a/plugins/inputs/smartctl/testcases_scan/megaraid/expected.out b/plugins/inputs/smartctl/testcases_scan/megaraid/expected.out new file mode 100644 index 0000000000000..f599e28b8ab0d --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/megaraid/expected.out @@ -0,0 +1 @@ +10 diff --git a/plugins/inputs/smartctl/testcases_scan/megaraid/response.json b/plugins/inputs/smartctl/testcases_scan/megaraid/response.json new file mode 100644 index 0000000000000..b41f469fb4888 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/megaraid/response.json @@ -0,0 +1,83 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 1 + ], + "svn_revision": "5022", + "platform_info": "x86_64-linux-5.4.0-172-generic", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--scan", + "--json" + ], + "exit_status": 0 + }, + "devices": [ + { + "name": "/dev/sda", + "info_name": "/dev/sda", + "type": "scsi", + "protocol": "SCSI" + }, + { + "name": "/dev/sdb", + "info_name": "/dev/sdb", + "type": "scsi", + "protocol": "SCSI" + }, + { + "name": "/dev/sdc", + "info_name": "/dev/sdc [SAT]", + "type": "sat", + "protocol": "ATA" + }, + { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_08]", + "type": "megaraid,8", + 
"protocol": "SCSI" + }, + { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_09]", + "type": "megaraid,9", + "protocol": "SCSI" + }, + { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_10]", + "type": "megaraid,10", + "protocol": "SCSI" + }, + { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_11]", + "type": "megaraid,11", + "protocol": "SCSI" + }, + { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_12]", + "type": "megaraid,12", + "protocol": "SCSI" + }, + { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_13]", + "type": "megaraid,13", + "protocol": "SCSI" + }, + { + "name": "/dev/bus/6", + "info_name": "/dev/bus/6 [megaraid_disk_14]", + "type": "megaraid,14", + "protocol": "SCSI" + } + ] +} diff --git a/plugins/inputs/smartctl/testcases_scan/megaraid/telegraf.toml b/plugins/inputs/smartctl/testcases_scan/megaraid/telegraf.toml new file mode 100644 index 0000000000000..6cd853f61d473 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_scan/megaraid/telegraf.toml @@ -0,0 +1 @@ +[[inputs.smartctl]] From dc9a4421b784fcdcb54d6a7952b2ae38dd3776d7 Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Mon, 1 Apr 2024 09:35:25 -0600 Subject: [PATCH 5/8] lint clean up --- plugins/inputs/smartctl/smartctl_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/inputs/smartctl/smartctl_test.go b/plugins/inputs/smartctl/smartctl_test.go index b15ecc0973bcc..63dcda52b6c2f 100644 --- a/plugins/inputs/smartctl/smartctl_test.go +++ b/plugins/inputs/smartctl/smartctl_test.go @@ -78,7 +78,7 @@ func fakeScanExecCommand(command string, args ...string) *exec.Cmd { return cmd } -func TestScanHelperProcess(t *testing.T) { +func TestScanHelperProcess(_ *testing.T) { if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { return } @@ -86,8 +86,8 @@ func TestScanHelperProcess(t *testing.T) { scanBytes, err := os.ReadFile(args[len(args)-1]) if err != nil { - 
//nolint:revive // os.Exit called intentionally fmt.Fprint(os.Stdout, "unknown filename") + //nolint:revive // os.Exit called intentionally os.Exit(42) } From 22967b14c0cf54c1dc40705cfb210353a4ca8674 Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Tue, 2 Apr 2024 09:30:53 -0600 Subject: [PATCH 6/8] Add vendor and scsi error counter logs --- plugins/inputs/smartctl/smartctl_device.go | 48 ++++++++++ plugins/inputs/smartctl/smartctl_json.go | 31 ++++++ plugins/inputs/smartctl/smartctl_test.go | 2 + .../smartctl/testcases_device/scsi/device | 1 + .../smartctl/testcases_device/scsi/deviceType | 1 + .../testcases_device/scsi/expected.out | 4 + .../testcases_device/scsi/response.json | 96 +++++++++++++++++++ 7 files changed, 183 insertions(+) create mode 100644 plugins/inputs/smartctl/testcases_device/scsi/device create mode 100644 plugins/inputs/smartctl/testcases_device/scsi/deviceType create mode 100644 plugins/inputs/smartctl/testcases_device/scsi/expected.out create mode 100644 plugins/inputs/smartctl/testcases_device/scsi/response.json diff --git a/plugins/inputs/smartctl/smartctl_device.go b/plugins/inputs/smartctl/smartctl_device.go index 976561ef9b287..03ef6d60eaeb1 100644 --- a/plugins/inputs/smartctl/smartctl_device.go +++ b/plugins/inputs/smartctl/smartctl_device.go @@ -46,6 +46,10 @@ func (s *Smartctl) scanDevice(acc telegraf.Accumulator, deviceName string, devic "serial": device.SerialNumber, } + if device.Vendor != "" { + tags["vendor"] = device.Vendor + } + // The JSON WWN is in decimal and needs to be converted to hex if device.Wwn.ID != 0 && device.Wwn.Naa != 0 && device.Wwn.Oui != 0 { tags["wwn"] = fmt.Sprintf("%01x%06x%09x", device.Wwn.Naa, device.Wwn.Oui, device.Wwn.ID) @@ -99,5 +103,49 @@ func (s *Smartctl) scanDevice(acc telegraf.Accumulator, deviceName string, devic acc.AddFields("smartctl_attributes", fields, attributeTags, t) } + // Check for SCSI error counter entries + if device.Device.Type == "scsi" { + counterTags := 
make(map[string]string, len(tags)+1) + for k, v := range tags { + counterTags[k] = v + } + + counterTags["page"] = "read" + fields := map[string]interface{}{ + "errors_corrected_by_eccfast": device.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, + "errors_corrected_by_eccdelayed": device.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, + "errors_corrected_by_rereads_rewrites": device.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, + "total_errors_corrected": device.ScsiErrorCounterLog.Read.TotalErrorsCorrected, + "correction_algorithm_invocations": device.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, + "gigabytes_processed": device.ScsiErrorCounterLog.Read.GigabytesProcessed, + "total_uncorrected_errors": device.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, + } + acc.AddFields("smartctl_scsi_error_counter_log", fields, counterTags, t) + + counterTags["page"] = "write" + fields = map[string]interface{}{ + "errors_corrected_by_eccfast": device.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, + "errors_corrected_by_eccdelayed": device.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, + "errors_corrected_by_rereads_rewrites": device.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, + "total_errors_corrected": device.ScsiErrorCounterLog.Write.TotalErrorsCorrected, + "correction_algorithm_invocations": device.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, + "gigabytes_processed": device.ScsiErrorCounterLog.Write.GigabytesProcessed, + "total_uncorrected_errors": device.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, + } + acc.AddFields("smartctl_scsi_error_counter_log", fields, counterTags, t) + + counterTags["page"] = "verify" + fields = map[string]interface{}{ + "errors_corrected_by_eccfast": device.ScsiErrorCounterLog.Verify.ErrorsCorrectedByEccfast, + "errors_corrected_by_eccdelayed": device.ScsiErrorCounterLog.Verify.ErrorsCorrectedByEccdelayed, + "errors_corrected_by_rereads_rewrites": 
device.ScsiErrorCounterLog.Verify.ErrorsCorrectedByRereadsRewrites, + "total_errors_corrected": device.ScsiErrorCounterLog.Verify.TotalErrorsCorrected, + "correction_algorithm_invocations": device.ScsiErrorCounterLog.Verify.CorrectionAlgorithmInvocations, + "gigabytes_processed": device.ScsiErrorCounterLog.Verify.GigabytesProcessed, + "total_uncorrected_errors": device.ScsiErrorCounterLog.Verify.TotalUncorrectedErrors, + } + acc.AddFields("smartctl_scsi_error_counter_log", fields, counterTags, t) + } + return nil } diff --git a/plugins/inputs/smartctl/smartctl_json.go b/plugins/inputs/smartctl/smartctl_json.go index 1758c1d55c0f5..a5bdb5ce8467b 100644 --- a/plugins/inputs/smartctl/smartctl_json.go +++ b/plugins/inputs/smartctl/smartctl_json.go @@ -21,6 +21,8 @@ type smartctlDeviceJSON struct { Type string `json:"type"` Protocol string `json:"protocol"` } `json:"device"` + Vendor string `json:"vendor"` + Product string `json:"product"` ModelFamily string `json:"model_family"` ModelName string `json:"model_name"` SerialNumber string `json:"serial_number"` @@ -83,6 +85,35 @@ type smartctlDeviceJSON struct { } `json:"raw"` } `json:"table"` } `json:"ata_smart_attributes"` + ScsiErrorCounterLog struct { + Read struct { + ErrorsCorrectedByEccfast int `json:"errors_corrected_by_eccfast"` + ErrorsCorrectedByEccdelayed int `json:"errors_corrected_by_eccdelayed"` + ErrorsCorrectedByRereadsRewrites int `json:"errors_corrected_by_rereads_rewrites"` + TotalErrorsCorrected int `json:"total_errors_corrected"` + CorrectionAlgorithmInvocations int `json:"correction_algorithm_invocations"` + GigabytesProcessed string `json:"gigabytes_processed"` + TotalUncorrectedErrors int `json:"total_uncorrected_errors"` + } `json:"read"` + Write struct { + ErrorsCorrectedByEccfast int `json:"errors_corrected_by_eccfast"` + ErrorsCorrectedByEccdelayed int `json:"errors_corrected_by_eccdelayed"` + ErrorsCorrectedByRereadsRewrites int `json:"errors_corrected_by_rereads_rewrites"` + 
TotalErrorsCorrected int `json:"total_errors_corrected"` + CorrectionAlgorithmInvocations int `json:"correction_algorithm_invocations"` + GigabytesProcessed string `json:"gigabytes_processed"` + TotalUncorrectedErrors int `json:"total_uncorrected_errors"` + } `json:"write"` + Verify struct { + ErrorsCorrectedByEccfast int `json:"errors_corrected_by_eccfast"` + ErrorsCorrectedByEccdelayed int `json:"errors_corrected_by_eccdelayed"` + ErrorsCorrectedByRereadsRewrites int `json:"errors_corrected_by_rereads_rewrites"` + TotalErrorsCorrected int `json:"total_errors_corrected"` + CorrectionAlgorithmInvocations int `json:"correction_algorithm_invocations"` + GigabytesProcessed string `json:"gigabytes_processed"` + TotalUncorrectedErrors int `json:"total_uncorrected_errors"` + } `json:"verify"` + } `json:"scsi_error_counter_log"` } type smartctlScanJSON struct { diff --git a/plugins/inputs/smartctl/smartctl_test.go b/plugins/inputs/smartctl/smartctl_test.go index 63dcda52b6c2f..7e3dde07a7ec7 100644 --- a/plugins/inputs/smartctl/smartctl_test.go +++ b/plugins/inputs/smartctl/smartctl_test.go @@ -181,6 +181,8 @@ func TestDeviceHelperProcess(t *testing.T) { filename = "testcases_device/usb/response.json" } else if slices.Contains(args, "/dev/bus/6") { filename = "testcases_device/megaraid/response.json" + } else if slices.Contains(args, "/dev/sdb") { + filename = "testcases_device/scsi/response.json" } else { fmt.Fprint(os.Stdout, "unknown filename") os.Exit(42) //nolint:revive // os.Exit called intentionally diff --git a/plugins/inputs/smartctl/testcases_device/scsi/device b/plugins/inputs/smartctl/testcases_device/scsi/device new file mode 100644 index 0000000000000..32962ffd4f83a --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/scsi/device @@ -0,0 +1 @@ +/dev/sdb diff --git a/plugins/inputs/smartctl/testcases_device/scsi/deviceType b/plugins/inputs/smartctl/testcases_device/scsi/deviceType new file mode 100644 index 0000000000000..7e90c4b42440a --- /dev/null 
+++ b/plugins/inputs/smartctl/testcases_device/scsi/deviceType @@ -0,0 +1 @@ +scsi diff --git a/plugins/inputs/smartctl/testcases_device/scsi/expected.out b/plugins/inputs/smartctl/testcases_device/scsi/expected.out new file mode 100644 index 0000000000000..1994b51ccc6a9 --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/scsi/expected.out @@ -0,0 +1,4 @@ +smartctl,model=XXXX\ XX0000NM123,name=/dev/sdb,serial=XXXXXXX,type=scsi,vendor=XXXXXXX capacity=13715978077777i,firmware="",health_ok=true,temperature=24i 1712071085987864368 +smartctl_scsi_error_counter_log,model=XXXX\ XX0000NM123,name=/dev/sdb,serial=XXXXXXX,type=scsi,page=read,vendor=XXXXXXX correction_algorithm_invocations=0i,errors_corrected_by_eccdelayed=0i,errors_corrected_by_eccfast=1i,errors_corrected_by_rereads_rewrites=5i,gigabytes_processed="315926.142",total_errors_corrected=3i,total_uncorrected_errors=0i 1712071085987864368 +smartctl_scsi_error_counter_log,model=XXXX\ XX0000NM123,name=/dev/sdb,serial=XXXXXXX,type=scsi,page=write,vendor=XXXXXXX correction_algorithm_invocations=20i,errors_corrected_by_eccdelayed=0i,errors_corrected_by_eccfast=0i,errors_corrected_by_rereads_rewrites=20i,gigabytes_processed="132513.233",total_errors_corrected=20i,total_uncorrected_errors=0i 1712071085987864368 +smartctl_scsi_error_counter_log,model=XXXX\ XX0000NM123,name=/dev/sdb,serial=XXXXXXX,type=scsi,page=verify,vendor=XXXXXXX correction_algorithm_invocations=0i,errors_corrected_by_eccdelayed=0i,errors_corrected_by_eccfast=12i,errors_corrected_by_rereads_rewrites=0i,gigabytes_processed="1437.032",total_errors_corrected=3i,total_uncorrected_errors=0i 1712071085987864368 diff --git a/plugins/inputs/smartctl/testcases_device/scsi/response.json b/plugins/inputs/smartctl/testcases_device/scsi/response.json new file mode 100644 index 0000000000000..bc821a6bb035d --- /dev/null +++ b/plugins/inputs/smartctl/testcases_device/scsi/response.json @@ -0,0 +1,96 @@ +{ + "json_format_version": [ + 1, + 0 + ], + 
"smartctl": { + "version": [ + 7, + 2 + ], + "svn_revision": "5123", + "platform_info": "x86_64-linux-4.12.0-1-amd64", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--json", + "--all", + "/dev/sdb", + "--device", + "scsi" + ], + "exit_status": 0 + }, + "device": { + "name": "/dev/sdb", + "info_name": "/dev/sdb", + "type": "scsi", + "protocol": "SCSI" + }, + "vendor": "XXXXXXX", + "product": "XXXX000OOOO", + "model_name": "XXXX XX0000NM123", + "revision": "RSL5", + "scsi_version": "SPC-5", + "user_capacity": { + "blocks": 26789019888, + "bytes": 13715978077777 + }, + "logical_block_size": 512, + "physical_block_size": 4096, + "rotation_rate": 7200, + "form_factor": { + "scsi_value": 3, + "name": "3.5 inches" + }, + "serial_number": "XXXXXXX", + "device_type": { + "scsi_value": 0, + "name": "disk" + }, + "local_time": { + "time_t": 1711977687, + "asctime": "Sun Mar 31 13:21:27 2024 UTC" + }, + "smart_status": { + "passed": true + }, + "temperature": { + "current": 24, + "drive_trip": 60 + }, + "power_on_time": { + "hours": 32978, + "minutes": 46 + }, + "scsi_grown_defect_list": 0, + "scsi_error_counter_log": { + "read": { + "errors_corrected_by_eccfast": 1, + "errors_corrected_by_eccdelayed": 0, + "errors_corrected_by_rereads_rewrites": 5, + "total_errors_corrected": 3, + "correction_algorithm_invocations": 0, + "gigabytes_processed": "315926.142", + "total_uncorrected_errors": 0 + }, + "write": { + "errors_corrected_by_eccfast": 0, + "errors_corrected_by_eccdelayed": 0, + "errors_corrected_by_rereads_rewrites": 20, + "total_errors_corrected": 20, + "correction_algorithm_invocations": 20, + "gigabytes_processed": "132513.233", + "total_uncorrected_errors": 0 + }, + "verify": { + "errors_corrected_by_eccfast": 12, + "errors_corrected_by_eccdelayed": 0, + "errors_corrected_by_rereads_rewrites": 0, + "total_errors_corrected": 3, + "correction_algorithm_invocations": 0, + "gigabytes_processed": "1437.032", + "total_uncorrected_errors": 0 + } + } +} From 
f4a1878f706accc7aa549ab51dc06e2c69a39604 Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Thu, 4 Apr 2024 14:18:41 -0600 Subject: [PATCH 7/8] Update default path to sbin --- plugins/inputs/smartctl/README.md | 4 ++-- plugins/inputs/smartctl/sample.conf | 2 +- plugins/inputs/smartctl/smartctl.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/plugins/inputs/smartctl/README.md b/plugins/inputs/smartctl/README.md index d0ddbbc81e30e..57b303f7911a1 100644 --- a/plugins/inputs/smartctl/README.md +++ b/plugins/inputs/smartctl/README.md @@ -28,7 +28,7 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. # Read metrics from SMART storage devices using smartclt's JSON output [[inputs.smartctl]] ## Optionally specify the path to the smartctl executable - # path = "/usr/bin/smartctl" + # path = "/usr/sbin/smartctl" ## Use sudo ## On most platforms used, smartctl requires root access. Setting 'use_sudo' @@ -76,7 +76,7 @@ And to update the `/etc/sudoers` file to allow running smartctl: ```bash $ visudo # Add the following lines: -Cmnd_Alias SMARTCTL = /usr/bin/smartctl +Cmnd_Alias SMARTCTL = /usr/sbin/smartctl telegraf ALL=(ALL) NOPASSWD: SMARTCTL Defaults!SMARTCTL !logfile, !syslog, !pam_session ``` diff --git a/plugins/inputs/smartctl/sample.conf b/plugins/inputs/smartctl/sample.conf index 5b73ef012b8c5..e4bbe243e7656 100644 --- a/plugins/inputs/smartctl/sample.conf +++ b/plugins/inputs/smartctl/sample.conf @@ -1,7 +1,7 @@ # Read metrics from SMART storage devices using smartclt's JSON output [[inputs.smartctl]] ## Optionally specify the path to the smartctl executable - # path = "/usr/bin/smartctl" + # path = "/usr/sbin/smartctl" ## Use sudo ## On most platforms used, smartctl requires root access. 
Setting 'use_sudo' diff --git a/plugins/inputs/smartctl/smartctl.go b/plugins/inputs/smartctl/smartctl.go index 1456ee99c6f46..7c82ac78d544b 100644 --- a/plugins/inputs/smartctl/smartctl.go +++ b/plugins/inputs/smartctl/smartctl.go @@ -39,7 +39,7 @@ func (*Smartctl) SampleConfig() string { func (s *Smartctl) Init() error { if s.Path == "" { - s.Path = "/usr/bin/smartctl" + s.Path = "/usr/sbin/smartctl" } switch s.NoCheck { From 212c21006e4fd34b37e627072dfd65b298ed0dff Mon Sep 17 00:00:00 2001 From: Josh Powers <powersj@fastmail.com> Date: Thu, 11 Apr 2024 07:27:12 -0600 Subject: [PATCH 8/8] docs: Readme clarify differences --- plugins/inputs/smartctl/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/plugins/inputs/smartctl/README.md b/plugins/inputs/smartctl/README.md index 57b303f7911a1..a43f5876b07a1 100644 --- a/plugins/inputs/smartctl/README.md +++ b/plugins/inputs/smartctl/README.md @@ -13,6 +13,16 @@ releases. See smartmontools (<https://www.smartmontools.org/>) for more information. +## smart vs smartctl + +The smartctl plugin is an alternative to the smart plugin. The biggest +difference is that the smart plugin can also call `nvmectl` to collect +additional details about NVMe devices as well as some vendor-specific device +information. + +This plugin also requires a version of the `smartctl` command that supports +JSON output, whereas the smart plugin parses the raw text output. + ## Global configuration options <!-- @/docs/includes/plugin_config.md --> In addition to the plugin-specific configuration settings, plugins support