Skip to content
This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

replace statsd[aemon] metrics with built-in metrics library #384

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
8839ae0
remove metrics that are unused or are now created/used in other packages
Dieterbe Nov 15, 2016
7fe099a
remove "github.com/raintank/met" and its deps from vendor folder
Dieterbe Nov 27, 2016
8fd1ccc
WIP replace statsd[aemon] metrics with built-in metrics library
Dieterbe Nov 13, 2016
c49dfa8
new config for new metrics library
Dieterbe Nov 27, 2016
9982b0b
update install/quickstart guide for new stats library
Dieterbe Nov 27, 2016
21626d2
move instrumented_nsq to main repo
Dieterbe Dec 9, 2016
ee9ee4d
do metrics the new way
Dieterbe Nov 27, 2016
f86e60f
update metric prefix for a better namespace distinguished from statsd
Dieterbe Dec 20, 2016
4eaa1eb
rework the metrics hierarchy
Dieterbe Dec 20, 2016
5f4c8ab
do stats config setup via module
Dieterbe Dec 20, 2016
ab30abc
make each metric declare their mtype
Dieterbe Dec 21, 2016
39f1e71
histogram & meter: track count, also as rate. clarify duration
Dieterbe Dec 21, 2016
51b2e89
measure time spent calling into idx and tank (~backpressure)
Dieterbe Dec 21, 2016
dc415b9
clarify ok/fail metrics are for add operation
Dieterbe Dec 21, 2016
7deaa16
fix dashboard
Dieterbe Dec 21, 2016
fbd141f
add docker env which makes testing of new stats easier
Dieterbe Dec 22, 2016
c05e99a
remove print
Dieterbe Dec 26, 2016
9f03d59
improve metrics prefix
Dieterbe Dec 27, 2016
7d6d152
also report this type of invalid metric
Dieterbe Dec 27, 2016
2080c3c
reset registry in benches so adding new metrics works
Dieterbe Dec 27, 2016
ec47883
update metrics descriptions
Dieterbe Dec 27, 2016
767502e
end2end test should use binary in build dir
Dieterbe Dec 27, 2016
6bfae5f
set environment in docker config to docker-env
Dieterbe Dec 28, 2016
a03842a
docker-dev should use actual config file from dir
Dieterbe Dec 28, 2016
6978d4b
fix end2end metric name
Dieterbe Dec 27, 2016
0f9fc02
can now track metricsActive directly
Dieterbe Dec 28, 2016
98e9678
simplify how we instantiate new stats
Dieterbe Dec 29, 2016
e8def85
better lock
Dieterbe Dec 29, 2016
622e18f
better pressure visualization
Dieterbe Dec 29, 2016
47cca8b
fix stats buffer-size option
Dieterbe Dec 29, 2016
4c9fbc2
add metrics about stats itself
Dieterbe Dec 29, 2016
d40e06e
better default from
Dieterbe Dec 29, 2016
5eac470
support calling Start and Stop over and over
Dieterbe Dec 29, 2016
650d027
simplify outbound to graphite conn routine
Dieterbe Dec 30, 2016
7b46143
on err, can bail out earlier + simplify a bit
Dieterbe Dec 30, 2016
f2cfaae
simplify
Dieterbe Dec 30, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 16 additions & 15 deletions api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,30 @@ import (

_ "net/http/pprof"

"github.com/raintank/met"
"github.com/raintank/metrictank/idx"
"github.com/raintank/metrictank/mdata"
"github.com/raintank/metrictank/stats"
"github.com/raintank/worldping-api/pkg/log"
"gopkg.in/macaron.v1"
)

var LogLevel int

var (
getTargetDuration met.Timer
itersToPointsDuration met.Timer
// just 1 global timer of request handling time. includes mem/cassandra gets, chunk decode/iters, json building etc
// there is such a thing as too many metrics. we have this, and cassandra timings, that should be enough for realtime profiling
reqHandleDuration met.Timer
reqSpanBoth met.Meter
reqSpanMem met.Meter
// metric api.get_target is how long it takes to get a target
getTargetDuration = stats.NewLatencyHistogram15s32("api.get_target")

// metric api.iters_to_points is how long it takes to decode points from a chunk iterator
itersToPointsDuration = stats.NewLatencyHistogram15s32("api.iters_to_points")

// metric api.request_handle is how long it takes to handle a render request
reqHandleDuration = stats.NewLatencyHistogram15s32("api.request_handle")

// metric api.requests_span.mem_and_cassandra is the timerange of requests hitting both in-memory and cassandra
reqSpanBoth = stats.NewMeter32("api.requests_span.mem_and_cassandra", false)

// metric api.requests_span.mem is the timerange of requests hitting only the ringbuffer
reqSpanMem = stats.NewMeter32("api.requests_span.mem", false)
)

type Server struct {
Expand All @@ -50,13 +57,7 @@ func (s *Server) BindBackendStore(store mdata.Store) {
s.BackendStore = store
}

func NewServer(stats met.Backend) (*Server, error) {

reqSpanMem = stats.NewMeter("requests_span.mem", 0)
reqSpanBoth = stats.NewMeter("requests_span.mem_and_cassandra", 0)
getTargetDuration = stats.NewTimer("get_target_duration", 0)
itersToPointsDuration = stats.NewTimer("iters_to_points_duration", 0)
reqHandleDuration = stats.NewTimer("request_handle_duration", 0)
func NewServer() (*Server, error) {

m := macaron.New()
m.Use(macaron.Logger())
Expand Down
4 changes: 2 additions & 2 deletions api/dataprocessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ func (s *Server) getSeries(req models.Req, consolidator consolidation.Consolidat
}
}
if oldest > fromUnix {
reqSpanBoth.Value(int64(toUnix - fromUnix))
reqSpanBoth.ValueUint32(toUnix - fromUnix)
if consolidator != consolidation.None {
key = aggMetricKey(key, consolidator.Archive(), interval)
}
Expand All @@ -422,7 +422,7 @@ func (s *Server) getSeries(req models.Req, consolidator consolidation.Consolidat
}
iters = append(iters, storeIters...)
} else {
reqSpanMem.Value(int64(toUnix - fromUnix))
reqSpanMem.ValueUint32(toUnix - fromUnix)
}
pre := time.Now()
iters = append(iters, memIters...)
Expand Down
7 changes: 1 addition & 6 deletions api/dataprocessor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package api

import (
"fmt"
"github.com/raintank/met/helper"
"github.com/raintank/metrictank/api/models"
"github.com/raintank/metrictank/cluster"
"github.com/raintank/metrictank/consolidation"
Expand Down Expand Up @@ -542,17 +541,13 @@ func TestPrevBoundary(t *testing.T) {
// TestGetSeries assures that series data is returned in proper form.
func TestGetSeries(t *testing.T) {
cluster.Init("default", "test", time.Now())
stats, _ := helper.New(false, "", "standard", "metrictank", "")
store := mdata.NewDevnullStore()
metrics := mdata.NewAggMetrics(store, 600, 10, 0, 0, 0, 0, []mdata.AggSetting{})
mdata.InitMetrics(stats)
addr = "localhost:6060"
srv, _ := NewServer(stats)
srv, _ := NewServer()
srv.BindBackendStore(store)
srv.BindMemoryStore(metrics)

defer metrics.Stop()

// the tests below cycle through every possible combination of:
// * every possible data offset (against its quantized version) e.g. offset between 0 and interval-1
// * every possible `from` offset (against its quantized query results) e.g. offset between 0 and interval-1
Expand Down
80 changes: 54 additions & 26 deletions cassandra/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,45 +5,73 @@ import (
"strings"

"github.com/gocql/gocql"
"github.com/raintank/met"
"github.com/raintank/metrictank/stats"
)

type Metrics struct {
cassErrTimeout met.Count
cassErrTooManyTimeouts met.Count
cassErrConnClosed met.Count
cassErrNoConns met.Count
cassErrUnavailable met.Count
cassErrCannotAchieveConsistency met.Count
cassErrOther met.Count
type ErrMetrics struct {
cassErrTimeout *stats.Counter32
cassErrTooManyTimeouts *stats.Counter32
cassErrConnClosed *stats.Counter32
cassErrNoConns *stats.Counter32
cassErrUnavailable *stats.Counter32
cassErrCannotAchieveConsistency *stats.Counter32
cassErrOther *stats.Counter32
}

func NewMetrics(component string, stats met.Backend) Metrics {
return Metrics{
cassErrTimeout: stats.NewCount(fmt.Sprintf("%s.error.timeout", component)),
cassErrTooManyTimeouts: stats.NewCount(fmt.Sprintf("%s.error.too-many-timeouts", component)),
cassErrConnClosed: stats.NewCount(fmt.Sprintf("%s.error.conn-closed", component)),
cassErrNoConns: stats.NewCount(fmt.Sprintf("%s.error.no-connections", component)),
cassErrUnavailable: stats.NewCount(fmt.Sprintf("%s.error.unavailable", component)),
cassErrCannotAchieveConsistency: stats.NewCount(fmt.Sprintf("%s.error.cannot-achieve-consistency", component)),
cassErrOther: stats.NewCount(fmt.Sprintf("%s.error.other", component)),
func NewErrMetrics(component string) ErrMetrics {
return ErrMetrics{

// metric idx.cassandra.error.timeout is a counter of timeouts seen to the cassandra idx
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems outdated, no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we cleared this up on slack. but basically https://github.com/Dieterbe/metrics2docs is a bit simplistic, it can't parse the code very well, and as there are 2 places where we dynamically construct these metrics:

mdata/store_cassandra.go
68:    errmetrics = cassandra.NewErrMetrics("store.cassandra")

idx/cassandra/cassandra.go
48:    errmetrics           = cassandra.NewErrMetrics("idx.cassandra")

we need to add two comments for each metric here, and there needs to be an empty line between them because otherwise metrics2docs fails


// metric store.cassandra.error.timeout is a counter of timeouts seen to the cassandra store
cassErrTimeout: stats.NewCounter32(fmt.Sprintf("%s.error.timeout", component)),

// metric idx.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra idx

// metric store.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra store
cassErrTooManyTimeouts: stats.NewCounter32(fmt.Sprintf("%s.error.too-many-timeouts", component)),

// metric idx.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra idx

// metric store.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra store
cassErrConnClosed: stats.NewCounter32(fmt.Sprintf("%s.error.conn-closed", component)),

// metric idx.cassandra.error.no-connections is a counter of how many times we had no connections remaining to the cassandra idx

// metric store.cassandra.error.no-connections is a counter of how many times we had no connections remaining to the cassandra store
cassErrNoConns: stats.NewCounter32(fmt.Sprintf("%s.error.no-connections", component)),

// metric idx.cassandra.error.unavailable is a counter of how many times the cassandra idx was unavailable

// metric store.cassandra.error.unavailable is a counter of how many times the cassandra store was unavailable
cassErrUnavailable: stats.NewCounter32(fmt.Sprintf("%s.error.unavailable", component)),

// metric idx.cassandra.error.cannot-achieve-consistency is a counter of the cassandra idx not being able to achieve consistency for a given query

// metric store.cassandra.error.cannot-achieve-consistency is a counter of the cassandra store not being able to achieve consistency for a given query
cassErrCannotAchieveConsistency: stats.NewCounter32(fmt.Sprintf("%s.error.cannot-achieve-consistency", component)),

// metric idx.cassandra.error.other is a counter of other errors talking to the cassandra idx

// metric store.cassandra.error.other is a counter of other errors talking to the cassandra store
cassErrOther: stats.NewCounter32(fmt.Sprintf("%s.error.other", component)),
}
}

func (m *Metrics) Inc(err error) {
func (m *ErrMetrics) Inc(err error) {
if err == gocql.ErrTimeoutNoResponse {
m.cassErrTimeout.Inc(1)
m.cassErrTimeout.Inc()
} else if err == gocql.ErrTooManyTimeouts {
m.cassErrTooManyTimeouts.Inc(1)
m.cassErrTooManyTimeouts.Inc()
} else if err == gocql.ErrConnectionClosed {
m.cassErrConnClosed.Inc(1)
m.cassErrConnClosed.Inc()
} else if err == gocql.ErrNoConnections {
m.cassErrNoConns.Inc(1)
m.cassErrNoConns.Inc()
} else if err == gocql.ErrUnavailable {
m.cassErrUnavailable.Inc(1)
m.cassErrUnavailable.Inc()
} else if strings.HasPrefix(err.Error(), "Cannot achieve consistency level") {
m.cassErrCannotAchieveConsistency.Inc(1)
m.cassErrCannotAchieveConsistency.Inc()
} else {
m.cassErrOther.Inc(1)
m.cassErrOther.Inc()
}
}
6 changes: 6 additions & 0 deletions cluster/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@ import (
"time"

"github.com/google/go-querystring/query"
"github.com/raintank/metrictank/stats"
"github.com/raintank/worldping-api/pkg/log"
)

var clusterPrimary = stats.NewBool("cluster.primary")

//go:generate stringer -type=NodeState
type NodeState int

Expand Down Expand Up @@ -125,11 +128,14 @@ func (n *Node) SetReadyIn(t time.Duration) {
}()
}

// SetPrimary sets the primary status of the node to p and records the time of
// the change in primaryChange. Both fields are written while holding the node's
// lock; the cluster.primary metric is updated after the lock has been released.
// Note: clusterPrimary is a single package-level metric, not a per-node one, so
// this should only be called on ThisNode!
func (n *Node) SetPrimary(p bool) {
n.Lock()
n.primary = p
n.primaryChange = time.Now()
n.Unlock()
// report the new status via the package-level metric, outside of the node lock.
clusterPrimary.Set(p)
}

func (n *Node) SetState(s NodeState) {
Expand Down
Loading