Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: pollers should collect and export their status and memory #2944

Merged
merged 1 commit into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 12 additions & 40 deletions cmd/collectors/unix/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,24 +51,21 @@ var _Histograms = map[string]func(*matrix.Metric, string, *matrix.Instance, *Pro

// list of (scalar) metrics
var _Metrics = map[string]func(*matrix.Metric, *matrix.Instance, *Process, *System){
"start_time": setStartTime,
"cpu_percent": setCPUPercent,
"memory_percent": setMemoryPercent,
"threads": setNumThreads,
"fds": setNumFds,
"start_time": setStartTime,
"cpu_percent": setCPUPercent,
"threads": setNumThreads,
"fds": setNumFds,
}

var _DataTypes = map[string]string{
"cpu": "float64",
"memory": "uint64",
"io": "uint64",
"net": "uint64",
"ctx": "uint64",
"start_time": "float64",
"cpu_percent": "float64",
"memory_percent": "float64",
"threads": "uint64",
"fds": "uint64",
"cpu": "float64",
"io": "uint64",
"net": "uint64",
"ctx": "uint64",
"start_time": "float64",
"cpu_percent": "float64",
"threads": "uint64",
"fds": "uint64",
}

func init() {
Expand Down Expand Up @@ -263,10 +260,6 @@ func (u *Unix) loadMetrics(counters *node.Node) error {
}
}

if _, err = mat.NewMetricUint8("status"); err != nil {
return err
}

u.Logger.Debug().Msgf("initialized cache with %d metrics", len(mat.GetMetrics()))
return nil
}
Expand Down Expand Up @@ -352,12 +345,6 @@ func (u *Unix) PollData() (map[string]*matrix.Matrix, error) {

for key, instance := range mat.GetInstances() {

// assume not running
err = mat.LazySetValueUint8("status", key, 0)
if err != nil {
u.Logger.Error().Stack().Err(err).Msgf("error while parsing metric key [%s]", key)
}

if proc, ok = u.processes[key]; ok {
if err = proc.Reload(); err != nil {
delete(u.processes, key)
Expand Down Expand Up @@ -389,14 +376,6 @@ func (u *Unix) PollData() (map[string]*matrix.Matrix, error) {
continue
}

// if we got here poller is running
err = mat.LazySetValueUint64("status", key, 1)
if err != nil {
u.Logger.Error().Stack().Err(err).Msgf("error while parsing metric key [%s]", key)
}

u.Logger.Debug().Msgf("populating instance [%s]: PID (%d) with [%s]\n", key, pid, cmd)

// process scalar metrics
for key, foo := range _Metrics {
if metric := mat.GetMetric(key); metric != nil {
Expand Down Expand Up @@ -444,13 +423,6 @@ func setNumFds(m *matrix.Metric, i *matrix.Instance, p *Process, _ *System) {
}
}

// setMemoryPercent stores, on metric m for instance i, the process's "rss"
// memory counter expressed as a percentage of the system's total memory.
//
// Nothing is written when the system total is zero: the division would
// otherwise produce NaN or +Inf and that value would be stored on the metric.
// This mirrors the elapsed-time guard in setCPUPercent.
func setMemoryPercent(m *matrix.Metric, i *matrix.Instance, p *Process, s *System) {
	if s.memTotal == 0 {
		return
	}
	// A lookup on a nil or empty p.mem yields 0, so a process without memory
	// counters is reported as 0 percent rather than failing.
	if err := m.SetValueFloat64(i, float64(p.mem["rss"])/float64(s.memTotal)*100); err != nil {
		logging.Get().Error().Stack().Err(err).Msg("error")
	}
}

func setCPUPercent(m *matrix.Metric, i *matrix.Instance, p *Process, _ *System) {
if p.elapsedTime != 0 {
err := m.SetValueFloat64(i, p.cpuTotal/p.elapsedTime*100)
Expand Down
41 changes: 0 additions & 41 deletions cmd/collectors/unix/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,6 @@ func (p *Process) Reload() error {
return err
}

if err := p.loadSmaps(); err != nil {
return err
}

if err := p.loadIo(); err != nil {
return err
}
Expand Down Expand Up @@ -224,43 +220,6 @@ func (p *Process) loadStat() error {
return err
}

// loadSmaps aggregates the memory counters found in the process's "smaps"
// file into p.mem.
//
// Only lines made of exactly three whitespace-separated fields whose second
// field parses as an unsigned integer are considered. The counter name is the
// first field up to its first underscore, without a trailing colon and
// lower-cased. The counters rss, swap, anonymous, shared and private are
// summed under their own names, size is summed under "vms", and every other
// counter is ignored.
//
// A failure to read the file is deliberately not reported: the function
// returns nil and leaves p.mem untouched.
func (p *Process) loadSmaps() error {
	// this may fail see https://github.com/NetApp/harvest/issues/249
	// when it does, ignore so the other /proc checks are given a chance to run
	raw, readErr := os.ReadFile(path.Join(p.dirpath, "smaps"))
	if readErr != nil {
		return nil //nolint:nilerr
	}

	p.mem = make(map[string]uint64)

	for _, row := range strings.Split(string(raw), "\n") {
		cols := strings.Fields(row)
		if len(cols) != 3 {
			continue
		}

		value, parseErr := strconv.ParseUint(cols[1], 10, 64)
		if parseErr != nil {
			continue
		}

		// keep only the part of the counter name before the first underscore
		name := cols[0]
		if cut := strings.IndexByte(name, '_'); cut >= 0 {
			name = name[:cut]
		}

		switch counter := strings.ToLower(strings.TrimSuffix(name, ":")); counter {
		case "rss", "swap", "anonymous", "shared", "private":
			p.mem[counter] += value
		case "size":
			p.mem["vms"] += value
		}
	}

	return nil
}

func (p *Process) loadIo() error {

var (
Expand Down
84 changes: 77 additions & 7 deletions cmd/poller/poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ import (
"github.com/netapp/harvest/v2/pkg/requests"
"github.com/netapp/harvest/v2/pkg/tree/node"
"github.com/netapp/harvest/v2/pkg/util"
"github.com/shirou/gopsutil/v3/mem"
"github.com/shirou/gopsutil/v3/process"
"github.com/spf13/cobra"
"gopkg.in/yaml.v3"
"io"
Expand Down Expand Up @@ -118,7 +120,8 @@ type Poller struct {
exporterParams map[string]conf.Exporter
params *conf.Poller
metadata *matrix.Matrix
status *matrix.Matrix
status *matrix.Matrix // exported as metadata_target_
status2 *matrix.Matrix // exported as poller_status
certPool *x509.CertPool
client *http.Client
auth *auth.Credentials
Expand Down Expand Up @@ -444,16 +447,22 @@ func (p *Poller) Run() {
task.Start()
// flush metadata
p.status.Reset()
p.status2.Reset()
p.metadata.Reset()

// ping target system
if ping, ok := p.ping(); ok {
_ = p.status.LazySetValueUint8("status", "host", 0)
_ = p.status.LazySetValueFloat64("ping", "host", float64(ping))
_ = p.status2.LazySetValueUint8("status", "host", 1)
_ = p.status2.LazySetValueFloat64("ping", "host", float64(ping))
} else {
_ = p.status.LazySetValueUint8("status", "host", 1)
_ = p.status2.LazySetValueUint8("status", "host", 0)
}

p.addMemoryMetadata()

// add number of goroutines to metadata
// @TODO: cleanup, does not belong to "status"
_ = p.status.LazySetValueInt64("goroutines", "host", int64(runtime.NumGoroutine()))
Expand Down Expand Up @@ -511,6 +520,9 @@ func (p *Poller) Run() {
if _, err := ee.Export(p.status); err != nil {
logger.Error().Err(err).Msg("export target metadata:")
}
if _, err := ee.Export(p.status2); err != nil {
logger.Error().Err(err).Msg("export poller status:")
}
}

// only log when there are changes, which we expect to be infrequent
Expand Down Expand Up @@ -549,7 +561,7 @@ func (p *Poller) handleSignals(signalChannel chan os.Signal) {
// and if available, response time
func (p *Poller) ping() (float32, bool) {

cmd := exec.Command("ping", p.target, "-w", "5", "-c", "1", "-q") //nolint:gosec
cmd := exec.Command("ping", p.target, "-W", "5", "-c", "1", "-q") //nolint:gosec
output, err := cmd.Output()
if err != nil {
return 0, false
Expand Down Expand Up @@ -920,29 +932,58 @@ func (p *Poller) loadMetadata() {
}
p.metadata.SetExportOptions(matrix.DefaultExportOptions())

// metadata for target system
// metadata for the target system
p.status = matrix.New("poller", "metadata_target", "metadata_component")
_, _ = p.status.NewMetricUint8("status")
_, _ = p.status.NewMetricFloat64("ping")
_, _ = p.status.NewMetricUint64("goroutines")

// metadata for the poller itself
p.status2 = matrix.New("poller", "poller", "poller_target")
_, _ = p.status2.NewMetricUint8("status")
_, _ = p.status2.NewMetricFloat64("memory_percent")
newMemoryMetric(p.status2, "memory", "rss")
newMemoryMetric(p.status2, "memory", "vms")
newMemoryMetric(p.status2, "memory", "swap")

instance, _ := p.status.NewInstance("host")
pInstance, _ := p.status2.NewInstance("host")
instance.SetLabel("addr", p.target)
p.status.SetGlobalLabel("poller", p.name)
p.status.SetGlobalLabel("version", p.options.Version)
p.status.SetGlobalLabel("datacenter", p.params.Datacenter)
p.status.SetGlobalLabel("hostname", p.options.Hostname)
pInstance.SetLabel("addr", p.target)

globalKVs := []string{
"poller", p.name,
"version", p.options.Version,
"datacenter", p.params.Datacenter,
"hostname", p.options.Hostname,
}

for i := 0; i < len(globalKVs); i += 2 {
p.status.SetGlobalLabel(globalKVs[i], globalKVs[i+1])
p.status2.SetGlobalLabel(globalKVs[i], globalKVs[i+1])
}

if p.options.PromPort != 0 {
p.status.SetGlobalLabel("promport", strconv.Itoa(p.options.PromPort))
p.status2.SetGlobalLabel("promport", strconv.Itoa(p.options.PromPort))
}

labels := p.params.Labels
if labels != nil {
for _, labelPtr := range *labels {
p.metadata.SetGlobalLabels(labelPtr)
p.status.SetGlobalLabels(labelPtr)
p.status2.SetGlobalLabels(labelPtr)
}
}
p.status.SetExportOptions(matrix.DefaultExportOptions())
p.status2.SetExportOptions(matrix.DefaultExportOptions())
}

// newMemoryMetric registers a uint64 metric on status whose key is
// "<label>.<sub>" and whose display name is label, then tags the metric with
// the label metric=<sub>.
//
// If the metric cannot be created the error is logged and the function
// returns without touching the metric, instead of dereferencing a metric
// value that is not valid when an error is reported.
func newMemoryMetric(status *matrix.Matrix, label string, sub string) {
	fullLabel := label + "." + sub
	mm, err := status.NewMetricType(fullLabel, "uint64", label)
	if err != nil {
		logger.Error().Err(err).Str("metric", fullLabel).Msg("Failed to create memory metric")
		return
	}
	mm.SetLabel("metric", sub)
}

var pollerCmd = &cobra.Command{
Expand Down Expand Up @@ -1199,6 +1240,35 @@ func (p *Poller) mergeConfPath() {
p.options.SetConfPath(path)
}

// addMemoryMetadata records the memory usage of the current process on the
// "host" instance of p.status2.
//
// It writes the RSS, VMS and swap figures reported for the process as
// memory.rss, memory.vms and memory.swap, and then memory_percent as RSS
// divided by the machine's total memory, times 100. Any lookup failure is
// logged and ends the function early; values written before the failure are
// kept.
func (p *Poller) addMemoryMetadata() {
	const instanceKey = "host"

	pid := os.Getpid()

	self, err := process.NewProcess(int32(pid))
	if err != nil {
		logger.Error().Err(err).Int("pid", pid).Msg("Failed to lookup process for poller")
		return
	}

	usage, err := self.MemoryInfo()
	if err != nil {
		logger.Error().Err(err).Int("pid", pid).Msg("Failed to get memory info for poller")
		return
	}

	gauges := []struct {
		metric string
		value  uint64
	}{
		{"memory.rss", usage.RSS},
		{"memory.vms", usage.VMS},
		{"memory.swap", usage.Swap},
	}
	for _, g := range gauges {
		_ = p.status2.LazySetValueUint64(g.metric, instanceKey, g.value)
	}

	// Calculate memory percentage
	machine, err := mem.VirtualMemory()
	if err != nil {
		logger.Error().Err(err).Int("pid", pid).Msg("Failed to get memory for machine")
		return
	}

	_ = p.status2.LazySetValueFloat64("memory_percent", instanceKey, float64(usage.RSS)/float64(machine.Total)*100)
}

func startPoller(_ *cobra.Command, _ []string) {
poller := &Poller{}
poller.options = opts
Expand Down
2 changes: 0 additions & 2 deletions conf/unix/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ counters:
- start_time
- cpu
- cpu_percent
- memory
- memory_percent
- io
- net
- ctx
Expand Down