Skip to content

Commit

Permalink
smart: add a "device_type" tag to tell devices apart
Browse files Browse the repository at this point in the history
when they're standing behind the same controller
  • Loading branch information
Thomas Delbende committed Jan 23, 2024
1 parent 57021be commit e0f3dd4
Show file tree
Hide file tree
Showing 4 changed files with 193 additions and 15 deletions.
6 changes: 6 additions & 0 deletions plugins/inputs/smart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
## without a password.
# use_sudo = false

## Adds an extra tag "device_type", which can be used to differentiate
## multiple disks behind the same controller (e.g., MegaRAID).
# tag_with_device_type = false

## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stopped rotating.
Expand Down Expand Up @@ -182,6 +186,7 @@ execute this script.
- tags:
- capacity
- device
- device_type (only emitted if `tag_with_device_type` is set to `true`)
- enabled
- model
- serial_no
Expand All @@ -201,6 +206,7 @@ execute this script.
- tags:
- capacity
- device
- device_type (only emitted if `tag_with_device_type` is set to `true`)
- enabled
- fail
- flags
Expand Down
4 changes: 4 additions & 0 deletions plugins/inputs/smart/sample.conf
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
## without a password.
# use_sudo = false

## Adds an extra tag "device_type", which can be used to differentiate
## multiple disks behind the same controller (e.g., MegaRAID).
# tag_with_device_type = false

## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stopped rotating.
## See --nocheck in the man pages for smartctl.
Expand Down
39 changes: 24 additions & 15 deletions plugins/inputs/smart/smart.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,18 +353,19 @@ var (

// Smart plugin reads metrics from storage devices supporting S.M.A.R.T.
// Each field carrying a toml tag corresponds to the configuration option of
// that name; Log is excluded from configuration (toml:"-").
type Smart struct {
Path string `toml:"path" deprecated:"1.16.0;use 'path_smartctl' instead"`
PathSmartctl string `toml:"path_smartctl"`
PathNVMe string `toml:"path_nvme"`
Nocheck string `toml:"nocheck"`
EnableExtensions []string `toml:"enable_extensions"`
Attributes bool `toml:"attributes"`
Excludes []string `toml:"excludes"`
Devices []string `toml:"devices"`
UseSudo bool `toml:"use_sudo"`
Timeout config.Duration `toml:"timeout"`
ReadMethod string `toml:"read_method"`
Log telegraf.Logger `toml:"-"`
Path string `toml:"path" deprecated:"1.16.0;use 'path_smartctl' instead"`
PathSmartctl string `toml:"path_smartctl"`
PathNVMe string `toml:"path_nvme"`
Nocheck string `toml:"nocheck"`
EnableExtensions []string `toml:"enable_extensions"`
Attributes bool `toml:"attributes"`
Excludes []string `toml:"excludes"`
Devices []string `toml:"devices"`
UseSudo bool `toml:"use_sudo"`
// TagWithDeviceType, when true, makes gatherDisk add a "device_type" tag
// built from whatever follows the device node in the device string (with a
// leading "-d " removed), so disks sharing one device node can be told apart.
TagWithDeviceType bool `toml:"tag_with_device_type"`
Timeout config.Duration `toml:"timeout"`
ReadMethod string `toml:"read_method"`
Log telegraf.Logger `toml:"-"`
}

type nvmeDevice struct {
Expand Down Expand Up @@ -741,8 +742,16 @@ func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.Wai
}

deviceTags := map[string]string{}
deviceNode := strings.Split(device, " ")[0]
deviceTags["device"] = path.Base(deviceNode)
if m.TagWithDeviceType {
deviceNode := strings.SplitN(device, " ", 2)
deviceTags["device"] = path.Base(deviceNode[0])
if len(deviceNode) == 2 && deviceNode[1] != "" {
deviceTags["device_type"] = strings.TrimPrefix(deviceNode[1], "-d ")
}
} else {
deviceNode := strings.Split(device, " ")[0]
deviceTags["device"] = path.Base(deviceNode)
}
deviceFields := make(map[string]interface{})
deviceFields["exit_status"] = exitStatus

Expand Down Expand Up @@ -798,7 +807,7 @@ func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.Wai

if m.Attributes {
//add power mode
keys := [...]string{"device", "model", "serial_no", "wwn", "capacity", "enabled", "power"}
keys := [...]string{"device", "device_type", "model", "serial_no", "wwn", "capacity", "enabled", "power"}
for _, key := range keys {
if value, ok := deviceTags[key]; ok {
tags[key] = value
Expand Down
159 changes: 159 additions & 0 deletions plugins/inputs/smart/smart_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package smart

import (
"errors"
"fmt"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -313,6 +314,38 @@ func TestGatherSSDRaid(t *testing.T) {
require.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered")
}

// TestGatherDeviceTypeTag checks that, with TagWithDeviceType enabled, two
// configured devices sharing the node /dev/bus/0 but using different "-d"
// device types produce separate metrics distinguished by "device_type".
func TestGatherDeviceTypeTag(t *testing.T) {
	// Stub the command runner: only "--info" calls are expected, and the
	// canned output is selected by the last argument (the device type).
	runCmd = func(_ config.Duration, _ bool, _ string, args ...string) ([]byte, error) {
		subcommand := args[0]
		switch subcommand {
		case "--scan":
			return nil, errors.New("scan command should not be run, since devices are provided in config")
		case "--info":
			// Handled below, once the device type has been extracted.
		default:
			return nil, fmt.Errorf("unexpected command %q", subcommand)
		}

		deviceType := args[len(args)-1]
		switch deviceType {
		case "megaraid,0":
			return []byte(smartctlMegaraidInfo1), nil
		case "megaraid,1":
			return []byte(smartctlMegaraidInfo2), nil
		}
		return nil, fmt.Errorf("unexpected device type %q", deviceType)
	}

	plugin := newSmart()
	plugin.TagWithDeviceType = true
	plugin.Devices = []string{"/dev/bus/0 -d megaraid,0", "/dev/bus/0 -d megaraid,1"}

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))
	require.NoError(t, errors.Join(acc.Errors...))

	// Metric order and timestamps are not significant for this comparison.
	gathered := acc.GetTelegrafMetrics()
	testutil.RequireMetricsEqual(t, testSmartctlDeviceTypeTag, gathered, testutil.SortMetrics(), testutil.IgnoreTime())
}

func TestGatherNVMe(t *testing.T) {
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
return []byte(smartctlNVMeInfoData), nil
Expand Down Expand Up @@ -792,6 +825,45 @@ var (
mockModel = "INTEL SSDPEDABCDEFG"
mockSerial = "CVFT5123456789ABCD"

// testSmartctlDeviceTypeTag is the metric set TestGatherDeviceTypeTag compares
// against: two "smart_device" metrics that share the same "device" tag and
// are told apart by their "device_type" tag (and by the reported temperature).
testSmartctlDeviceTypeTag = []telegraf.Metric{
// Expected metric for the "megaraid,0" device type.
testutil.MustMetric(
"smart_device",
map[string]string{
"capacity": "600000000000",
"device": "0",
"device_type": "megaraid,0",
"enabled": "Enabled",
"model": "ST3450857SS",
"power": "ACTIVE",
"serial_no": "xxx",
},
map[string]any{
"exit_status": int64(0),
"health_ok": true,
"temp_c": int64(37),
},
// Placeholder timestamp; the test compares with testutil.IgnoreTime().
time.Unix(0, 0),
),
// Expected metric for the "megaraid,1" device type.
testutil.MustMetric(
"smart_device",
map[string]string{
"capacity": "600000000000",
"device": "0",
"device_type": "megaraid,1",
"enabled": "Enabled",
"model": "ST3450857SS",
"power": "ACTIVE",
"serial_no": "xxx",
},
map[string]any{
"exit_status": int64(0),
"health_ok": true,
"temp_c": int64(47),
},
// Placeholder timestamp; the test compares with testutil.IgnoreTime().
time.Unix(0, 0),
),
}

testSmartctlNVMeAttributes = []telegraf.Metric{
testutil.MustMetric("smart_device",
map[string]string{
Expand Down Expand Up @@ -2237,6 +2309,93 @@ Selective self-test flags (0x0):
After scanning selected spans, do NOT read-scan remainder of disk.
If Selective self-test is pending on power-up, resume after 0 minute delay.
`

// smartctlMegaraidInfo1 is the canned smartctl output that the runCmd stub in
// TestGatherDeviceTypeTag returns for the "megaraid,0" device type
// (drive temperature 37 C).
smartctlMegaraidInfo1 = `smartctl 7.3 2022-02-28 r5338 [x86_64-linux-6.2.16-12-pve] (local build)
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Vendor: SEAGATE
Product: ST3450857SS
Revision: ES12
Compliance: SPC-3
User Capacity: 600,000,000,000 bytes [600 GB]
Logical block size: 512 bytes
Rotation Rate: 15000 rpm
Form Factor: 3.5 inches
Logical Unit id: 0x6000c60641d10397
Serial number: xxx
Device type: disk
Transport protocol: SAS (SPL-4)
Local Time is: Fri Jan 12 11:43:49 2024 CET
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Temperature Warning: Disabled or Not Supported
Power mode is: ACTIVE
=== START OF READ SMART DATA SECTION ===
SMART Health Status: OK
Current Drive Temperature: 37 C
Drive Trip Temperature: 63 C
Accumulated power on time, hours:minutes 16003:18
Elements in grown defect list: 0
Vendor (Seagate Cache) information
Blocks sent to initiator = 3000000000
Blocks received from initiator = 3000000000
Blocks read from cache and sent to initiator = 3000000000
Number of read and write commands whose size <= segment size = 3000000000
Number of read and write commands whose size > segment size = 300
Vendor (Seagate/Hitachi) factory information
number of hours powered up = 30000.30
number of minutes until next internal SMART test = 7
`

// smartctlMegaraidInfo2 is the canned smartctl output that the runCmd stub in
// TestGatherDeviceTypeTag returns for the "megaraid,1" device type
// (drive temperature 47 C).
smartctlMegaraidInfo2 = `smartctl 7.3 2022-02-28 r5338 [x86_64-linux-6.2.16-12-pve] (local build)
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Vendor: SEAGATE
Product: ST3450857SS
Revision: ES12
Compliance: SPC-3
User Capacity: 600,000,000,000 bytes [600 GB]
Logical block size: 512 bytes
Rotation Rate: 15000 rpm
Form Factor: 3.5 inches
Logical Unit id: 0x6000c60641d10497
Serial number: xxx
Device type: disk
Transport protocol: SAS (SPL-4)
Local Time is: Fri Jan 12 11:44:49 2024 CET
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Temperature Warning: Disabled or Not Supported
Power mode is: ACTIVE
=== START OF READ SMART DATA SECTION ===
SMART Health Status: OK
Current Drive Temperature: 47 C
Drive Trip Temperature: 64 C
Accumulated power on time, hours:minutes 16004:18
Elements in grown defect list: 0
Vendor (Seagate Cache) information
Blocks sent to initiator = 4000000000
Blocks received from initiator = 4000000000
Blocks read from cache and sent to initiator = 4000000000
Number of read and write commands whose size <= segment size = 4000000000
Number of read and write commands whose size > segment size = 400
Vendor (Seagate/Hitachi) factory information
number of hours powered up = 30000.30
number of minutes until next internal SMART test = 7
`

smartctlNVMeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
Expand Down

0 comments on commit e0f3dd4

Please sign in to comment.