From 2ff869ee918268f509ef42d849d57a1224d65df7 Mon Sep 17 00:00:00 2001 From: Jonathan G Date: Tue, 31 Jul 2018 17:56:03 -0600 Subject: [PATCH] Split multiple sensor keys in ipmi input (#4450) --- etc/telegraf.conf | 3 + plugins/inputs/ipmi_sensor/README.md | 63 ++++++-- plugins/inputs/ipmi_sensor/ipmi.go | 137 ++++++++++++++--- plugins/inputs/ipmi_sensor/ipmi_test.go | 196 +++++++++++++++++++++++- 4 files changed, 361 insertions(+), 38 deletions(-) diff --git a/etc/telegraf.conf b/etc/telegraf.conf index 38942adee0a6a..912a93d1062a8 100644 --- a/etc/telegraf.conf +++ b/etc/telegraf.conf @@ -1976,6 +1976,9 @@ # ## Timeout for the ipmitool command to complete # timeout = "20s" +# ## Schema Version: (Optional, defaults to version 1) +# metric_version = 2 + # # Gather packets and bytes counters from Linux ipsets # [[inputs.ipset]] diff --git a/plugins/inputs/ipmi_sensor/README.md b/plugins/inputs/ipmi_sensor/README.md index 74cfe3bc5eb98..fb2e8f26e0c07 100644 --- a/plugins/inputs/ipmi_sensor/README.md +++ b/plugins/inputs/ipmi_sensor/README.md @@ -8,6 +8,10 @@ If no servers are specified, the plugin will query the local machine sensor stat ``` ipmitool sdr ``` +or with the version 2 schema: +``` +ipmitool sdr elist +``` When one or more servers are specified, the plugin will use the following command to collect remote host sensor stats: @@ -41,19 +45,36 @@ ipmitool -I lan -H SERVER -U USERID -P PASSW0RD sdr ## Timeout for the ipmitool command to complete. Default is 20 seconds.
timeout = "20s" + + ## Schema Version: (Optional, defaults to version 1) + metric_version = 2 ``` ### Measurements +Version 1 schema: - ipmi_sensor: - tags: - name - unit + - host - server (only when retrieving stats from remote servers) - fields: - - status (int) + - status (int, 1=ok status_code/0=anything else) - value (float) +Version 2 schema: +- ipmi_sensor: + - tags: + - name + - entity_id (can help uniquify duplicate names) + - status_code (two letter code from IPMI documentation) + - status_desc (extended status description field) + - unit (only on analog values) + - host + - server (only when retrieving stats from remote) + - fields: + - value (float) #### Permissions @@ -68,24 +89,36 @@ KERNEL=="ipmi*", MODE="660", GROUP="telegraf" ### Example Output +#### Version 1 Schema When retrieving stats from a remote server: ``` -ipmi_sensor,server=10.20.2.203,unit=degrees_c,name=ambient_temp status=1i,value=20 1458488465012559455 -ipmi_sensor,server=10.20.2.203,unit=feet,name=altitude status=1i,value=80 1458488465012688613 -ipmi_sensor,server=10.20.2.203,unit=watts,name=avg_power status=1i,value=220 1458488465012776511 -ipmi_sensor,server=10.20.2.203,unit=volts,name=planar_3.3v status=1i,value=3.28 1458488465012861875 -ipmi_sensor,server=10.20.2.203,unit=volts,name=planar_vbat status=1i,value=3.04 1458488465013072508 -ipmi_sensor,server=10.20.2.203,unit=rpm,name=fan_1a_tach status=1i,value=2610 1458488465013137932 -ipmi_sensor,server=10.20.2.203,unit=rpm,name=fan_1b_tach status=1i,value=1775 1458488465013279896 +ipmi_sensor,server=10.20.2.203,name=uid_light value=0,status=1i 1517125513000000000 +ipmi_sensor,server=10.20.2.203,name=sys._health_led status=1i,value=0 1517125513000000000 +ipmi_sensor,server=10.20.2.203,name=power_supply_1,unit=watts status=1i,value=110 1517125513000000000 +ipmi_sensor,server=10.20.2.203,name=power_supply_2,unit=watts status=1i,value=120 1517125513000000000 +ipmi_sensor,server=10.20.2.203,name=power_supplies value=0,status=1i 
1517125513000000000 +ipmi_sensor,server=10.20.2.203,name=fan_1,unit=percent status=1i,value=43.12 1517125513000000000 ``` + +When retrieving stats from the local machine (no server specified): +``` +ipmi_sensor,name=uid_light value=0,status=1i 1517125513000000000 +ipmi_sensor,name=sys._health_led status=1i,value=0 1517125513000000000 +ipmi_sensor,name=power_supply_1,unit=watts status=1i,value=110 1517125513000000000 +ipmi_sensor,name=power_supply_2,unit=watts status=1i,value=120 1517125513000000000 +ipmi_sensor,name=power_supplies value=0,status=1i 1517125513000000000 +ipmi_sensor,name=fan_1,unit=percent status=1i,value=43.12 1517125513000000000 +``` + +#### Version 2 Schema + When retrieving stats from the local machine (no server specified): ``` -ipmi_sensor,unit=degrees_c,name=ambient_temp status=1i,value=20 1458488465012559455 -ipmi_sensor,unit=feet,name=altitude status=1i,value=80 1458488465012688613 -ipmi_sensor,unit=watts,name=avg_power status=1i,value=220 1458488465012776511 -ipmi_sensor,unit=volts,name=planar_3.3v status=1i,value=3.28 1458488465012861875 -ipmi_sensor,unit=volts,name=planar_vbat status=1i,value=3.04 1458488465013072508 -ipmi_sensor,unit=rpm,name=fan_1a_tach status=1i,value=2610 1458488465013137932 -ipmi_sensor,unit=rpm,name=fan_1b_tach status=1i,value=1775 1458488465013279896 +ipmi_sensor,name=uid_light,entity_id=23.1,status_code=ok,status_desc=ok value=0 1517125474000000000 +ipmi_sensor,name=sys._health_led,entity_id=23.2,status_code=ok,status_desc=ok value=0 1517125474000000000 +ipmi_sensor,entity_id=10.1,name=power_supply_1,status_code=ok,status_desc=presence_detected,unit=watts value=110 1517125474000000000 +ipmi_sensor,name=power_supply_2,entity_id=10.2,status_code=ok,unit=watts,status_desc=presence_detected value=125 1517125474000000000 +ipmi_sensor,name=power_supplies,entity_id=10.3,status_code=ok,status_desc=fully_redundant value=0 1517125474000000000 
+ipmi_sensor,entity_id=7.1,name=fan_1,status_code=ok,status_desc=transition_to_running,unit=percent value=43.12 1517125474000000000 ``` diff --git a/plugins/inputs/ipmi_sensor/ipmi.go b/plugins/inputs/ipmi_sensor/ipmi.go index ee99b0a3d15fe..65506e11835c7 100644 --- a/plugins/inputs/ipmi_sensor/ipmi.go +++ b/plugins/inputs/ipmi_sensor/ipmi.go @@ -1,8 +1,11 @@ package ipmi_sensor import ( + "bufio" + "bytes" "fmt" "os/exec" + "regexp" "strconv" "strings" "sync" @@ -14,14 +17,20 @@ import ( ) var ( - execCommand = exec.Command // execCommand is used to mock commands in tests. + execCommand = exec.Command // execCommand is used to mock commands in tests. + re_v1_parse_line = regexp.MustCompile(`^(?P<name>[^|]*)\|(?P<description>[^|]*)\|(?P<status_code>.*)`) // sdr line: name | description | status_code + re_v2_parse_line = regexp.MustCompile(`^(?P<name>[^|]*)\|[^|]+\|(?P<status_code>[^|]*)\|(?P<entity_id>[^|]*)\|(?:(?P<description>[^|]+))?`) // sdr elist line: name | (skipped column) | status_code | entity_id | optional description + re_v2_parse_description = regexp.MustCompile(`^(?P<analogValue>[0-9.]+)\s(?P<analogUnit>.*)|(?P<status>.+)|^$`) // "VALUE UNIT" reading, a plain status text, or empty + re_v2_parse_unit = regexp.MustCompile(`^(?P<realAnalogUnit>[^,]+)(?:,\s*(?P<statusDesc>.*))?`) // unit, optionally followed by ", status description" ) +// Ipmi stores the configuration values for the ipmi_sensor input plugin type Ipmi struct { - Path string - Privilege string - Servers []string - Timeout internal.Duration + Path string + Privilege string + Servers []string + Timeout internal.Duration + MetricVersion int } var sampleConfig = ` @@ -46,16 +55,22 @@ var sampleConfig = ` ## Timeout for the ipmitool command to complete timeout = "20s" + + ## Schema Version: (Optional, defaults to version 1) + metric_version = 2 ` +// SampleConfig returns the documentation about the sample configuration func (m *Ipmi) SampleConfig() string { return sampleConfig } +// Description returns a basic description for the plugin functions func (m *Ipmi) Description() string { return "Read metrics from the bare metal servers via IPMI" } +// Gather is the main execution function for the plugin func (m *Ipmi) Gather(acc telegraf.Accumulator) error { if len(m.Path) == 0 { return fmt.Errorf("ipmitool not found: verify that ipmitool is installed and that ipmitool is in
your PATH") @@ -93,23 +108,33 @@ func (m *Ipmi) parse(acc telegraf.Accumulator, server string) error { opts = conn.options() } opts = append(opts, "sdr") + if m.MetricVersion == 2 { + opts = append(opts, "elist") + } cmd := execCommand(m.Path, opts...) out, err := internal.CombinedOutputTimeout(cmd, m.Timeout.Duration) + timestamp := time.Now() if err != nil { return fmt.Errorf("failed to run command %s: %s - %s", strings.Join(cmd.Args, " "), err, string(out)) } + if m.MetricVersion == 2 { + return parseV2(acc, hostname, out, timestamp) + } + return parseV1(acc, hostname, out, timestamp) +} +func parseV1(acc telegraf.Accumulator, hostname string, cmdOut []byte, measured_at time.Time) error { // each line will look something like // Planar VBAT | 3.05 Volts | ok - lines := strings.Split(string(out), "\n") - for i := 0; i < len(lines); i++ { - vals := strings.Split(lines[i], "|") - if len(vals) != 3 { + scanner := bufio.NewScanner(bytes.NewReader(cmdOut)) + for scanner.Scan() { + ipmiFields := extractFieldsFromRegex(re_v1_parse_line, scanner.Text()) + if len(ipmiFields) != 3 { continue } tags := map[string]string{ - "name": transform(vals[0]), + "name": transform(ipmiFields["name"]), } // tag the server is we have one @@ -118,18 +143,20 @@ func (m *Ipmi) parse(acc telegraf.Accumulator, server string) error { } fields := make(map[string]interface{}) - if strings.EqualFold("ok", trim(vals[2])) { + if strings.EqualFold("ok", trim(ipmiFields["status_code"])) { fields["status"] = 1 } else { fields["status"] = 0 } - val1 := trim(vals[1]) - - if strings.Index(val1, " ") > 0 { + if strings.Index(ipmiFields["description"], " ") > 0 { // split middle column into value and unit - valunit := strings.SplitN(val1, " ", 2) - fields["value"] = Atofloat(valunit[0]) + valunit := strings.SplitN(ipmiFields["description"], " ", 2) + var err error + fields["value"], err = aToFloat(valunit[0]) + if err != nil { + continue + } if len(valunit) > 1 { tags["unit"] = transform(valunit[1]) } @@ 
-137,19 +164,85 @@ func (m *Ipmi) parse(acc telegraf.Accumulator, server string) error { fields["value"] = 0.0 } - acc.AddFields("ipmi_sensor", fields, tags, time.Now()) + acc.AddFields("ipmi_sensor", fields, tags, measured_at) } - return nil + return scanner.Err() } -func Atofloat(val string) float64 { +func parseV2(acc telegraf.Accumulator, hostname string, cmdOut []byte, measured_at time.Time) error { + // each line will look something like + // CMOS Battery | 65h | ok | 7.1 | + // Temp | 0Eh | ok | 3.1 | 55 degrees C + // Drive 0 | A0h | ok | 7.1 | Drive Present + scanner := bufio.NewScanner(bytes.NewReader(cmdOut)) + for scanner.Scan() { + ipmiFields := extractFieldsFromRegex(re_v2_parse_line, scanner.Text()) + if len(ipmiFields) < 3 || len(ipmiFields) > 4 { + continue + } + + tags := map[string]string{ + "name": transform(ipmiFields["name"]), + } + + // tag the server is we have one + if hostname != "" { + tags["server"] = hostname + } + tags["entity_id"] = transform(ipmiFields["entity_id"]) + tags["status_code"] = trim(ipmiFields["status_code"]) + fields := make(map[string]interface{}) + descriptionResults := extractFieldsFromRegex(re_v2_parse_description, trim(ipmiFields["description"])) + // This is an analog value with a unit + if descriptionResults["analogValue"] != "" && len(descriptionResults["analogUnit"]) >= 1 { + var err error + fields["value"], err = aToFloat(descriptionResults["analogValue"]) + if err != nil { + continue + } + // Some implementations add an extra status to their analog units + unitResults := extractFieldsFromRegex(re_v2_parse_unit, descriptionResults["analogUnit"]) + tags["unit"] = transform(unitResults["realAnalogUnit"]) + if unitResults["statusDesc"] != "" { + tags["status_desc"] = transform(unitResults["statusDesc"]) + } + } else { + // This is a status value + fields["value"] = 0.0 + // Extended status descriptions aren't required, in which case for consistency re-use the status code + if descriptionResults["status"] != "" { + 
tags["status_desc"] = transform(descriptionResults["status"]) + } else { + tags["status_desc"] = transform(ipmiFields["status_code"]) + } + } + + acc.AddFields("ipmi_sensor", fields, tags, measured_at) + } + + return scanner.Err() +} + +// extractFieldsFromRegex consumes a regex with named capture groups and returns a kvp map of strings with the results; the map is empty when input is empty or does not match re +func extractFieldsFromRegex(re *regexp.Regexp, input string) map[string]string { + submatches := re.FindStringSubmatch(input) + results := make(map[string]string) + for i, name := range re.SubexpNames() { + if name != input && name != "" && input != "" && i < len(submatches) { // submatches is nil when input does not match re + results[name] = trim(submatches[i]) + } + } + return results +} + +// aToFloat converts string representations of numbers to float64 values +func aToFloat(val string) (float64, error) { f, err := strconv.ParseFloat(val, 64) if err != nil { - return 0.0 - } else { - return f + return 0.0, err } + return f, nil } func trim(s string) string { diff --git a/plugins/inputs/ipmi_sensor/ipmi_test.go b/plugins/inputs/ipmi_sensor/ipmi_test.go index 3d45f2fa843b0..d781ce7b51d25 100644 --- a/plugins/inputs/ipmi_sensor/ipmi_test.go +++ b/plugins/inputs/ipmi_sensor/ipmi_test.go @@ -28,7 +28,7 @@ func TestGather(t *testing.T) { require.NoError(t, err) - assert.Equal(t, acc.NFields(), 266, "non-numeric measurements should be ignored") + assert.Equal(t, acc.NFields(), 262, "non-numeric measurements should be ignored") conn := NewConnection(i.Servers[0], i.Privilege) assert.Equal(t, "USERID", conn.Username) @@ -127,6 +127,7 @@ func TestGather(t *testing.T) { } err = acc.GatherError(i.Gather) + require.NoError(t, err) var testsWithoutServer = []struct { fields map[string]interface{} @@ -378,3 +379,196 @@ OS RealTime Mod | 0x00 | ok } os.Exit(0) } + +func TestGatherV2(t *testing.T) { + i := &Ipmi{ + Servers: []string{"USERID:PASSW0RD@lan(192.168.1.1)"}, + Path: "ipmitool", + Privilege: "USER", + Timeout: internal.Duration{Duration: time.Second * 5}, + MetricVersion: 2, + } + //
overwriting exec commands with mock commands + execCommand = fakeExecCommandV2 + var acc testutil.Accumulator + + err := acc.GatherError(i.Gather) + + require.NoError(t, err) + + conn := NewConnection(i.Servers[0], i.Privilege) + assert.Equal(t, "USERID", conn.Username) + assert.Equal(t, "lan", conn.Interface) + + var testsWithServer = []struct { + fields map[string]interface{} + tags map[string]string + }{ + //SEL | 72h | ns | 7.1 | No Reading + { + map[string]interface{}{ + "value": float64(0), + }, + map[string]string{ + "name": "sel", + "entity_id": "7.1", + "status_code": "ns", + "status_desc": "no_reading", + "server": "192.168.1.1", + }, + }, + } + + for _, test := range testsWithServer { + acc.AssertContainsTaggedFields(t, "ipmi_sensor", test.fields, test.tags) + } + + i = &Ipmi{ + Path: "ipmitool", + Timeout: internal.Duration{Duration: time.Second * 5}, + MetricVersion: 2, + } + + err = acc.GatherError(i.Gather) + require.NoError(t, err) + + var testsWithoutServer = []struct { + fields map[string]interface{} + tags map[string]string + }{ + //SEL | 72h | ns | 7.1 | No Reading + { + map[string]interface{}{ + "value": float64(0), + }, + map[string]string{ + "name": "sel", + "entity_id": "7.1", + "status_code": "ns", + "status_desc": "no_reading", + }, + }, + //Intrusion | 73h | ok | 7.1 | + { + map[string]interface{}{ + "value": float64(0), + }, + map[string]string{ + "name": "intrusion", + "entity_id": "7.1", + "status_code": "ok", + "status_desc": "ok", + }, + }, + //Fan1 | 30h | ok | 7.1 | 5040 RPM + { + map[string]interface{}{ + "value": float64(5040), + }, + map[string]string{ + "name": "fan1", + "entity_id": "7.1", + "status_code": "ok", + "unit": "rpm", + }, + }, + //Inlet Temp | 04h | ok | 7.1 | 25 degrees C + { + map[string]interface{}{ + "value": float64(25), + }, + map[string]string{ + "name": "inlet_temp", + "entity_id": "7.1", + "status_code": "ok", + "unit": "degrees_c", + }, + }, + //USB Cable Pres | 50h | ok | 7.1 | Connected + { + 
map[string]interface{}{ + "value": float64(0), + }, + map[string]string{ + "name": "usb_cable_pres", + "entity_id": "7.1", + "status_code": "ok", + "status_desc": "connected", + }, + }, + //Current 1 | 6Ah | ok | 10.1 | 7.20 Amps + { + map[string]interface{}{ + "value": float64(7.2), + }, + map[string]string{ + "name": "current_1", + "entity_id": "10.1", + "status_code": "ok", + "unit": "amps", + }, + }, + //Power Supply 1 | 03h | ok | 10.1 | 110 Watts, Presence detected + { + map[string]interface{}{ + "value": float64(110), + }, + map[string]string{ + "name": "power_supply_1", + "entity_id": "10.1", + "status_code": "ok", + "unit": "watts", + "status_desc": "presence_detected", + }, + }, + } + + for _, test := range testsWithoutServer { + acc.AssertContainsTaggedFields(t, "ipmi_sensor", test.fields, test.tags) + } +} + +// fakeExecCommandV2 is a helper function that mocks +// the exec.Command call (and calls the test binary) +func fakeExecCommandV2(command string, args ...string) *exec.Cmd { + cs := []string{"-test.run=TestHelperProcessV2", "--", command} + cs = append(cs, args...) + cmd := exec.Command(os.Args[0], cs...) + cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"} + return cmd +} + +// TestHelperProcessV2 isn't a real test. It's used to mock exec.Command. +// For example, if you run: +// GO_WANT_HELPER_PROCESS=1 go test -test.run=TestHelperProcessV2 -- ipmitool sdr elist +// it returns the mockData below.
+func TestHelperProcessV2(t *testing.T) { + if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { + return + } + + // Curated list of use cases instead of full dumps + mockData := `SEL | 72h | ns | 7.1 | No Reading +Intrusion | 73h | ok | 7.1 | +Fan1 | 30h | ok | 7.1 | 5040 RPM +Inlet Temp | 04h | ok | 7.1 | 25 degrees C +USB Cable Pres | 50h | ok | 7.1 | Connected +Current 1 | 6Ah | ok | 10.1 | 7.20 Amps +Power Supply 1 | 03h | ok | 10.1 | 110 Watts, Presence detected +` + + args := os.Args + + // The preceding arguments are test-runner plumbing that looks like: + // /tmp/go-build970079519/…/_test/integration.test -test.run=TestHelperProcessV2 -- + cmd, args := args[3], args[4:] + + if cmd == "ipmitool" { + fmt.Fprint(os.Stdout, mockData) + } else { + fmt.Fprint(os.Stdout, "command not found") + os.Exit(1) + + } + os.Exit(0) +}