diff --git a/api/api.go b/api/api.go index ec31aa9885..7906c5993d 100644 --- a/api/api.go +++ b/api/api.go @@ -9,9 +9,9 @@ import ( _ "net/http/pprof" - "github.com/raintank/met" "github.com/raintank/metrictank/idx" "github.com/raintank/metrictank/mdata" + "github.com/raintank/metrictank/stats" "github.com/raintank/worldping-api/pkg/log" "gopkg.in/macaron.v1" ) @@ -19,13 +19,20 @@ import ( var LogLevel int var ( - getTargetDuration met.Timer - itersToPointsDuration met.Timer - // just 1 global timer of request handling time. includes mem/cassandra gets, chunk decode/iters, json building etc - // there is such a thing as too many metrics. we have this, and cassandra timings, that should be enough for realtime profiling - reqHandleDuration met.Timer - reqSpanBoth met.Meter - reqSpanMem met.Meter + // metric api.get_target is how long it takes to get a target + getTargetDuration = stats.NewLatencyHistogram15s32("api.get_target") + + // metric api.iters_to_points is how long it takes to decode points from a chunk iterator + itersToPointsDuration = stats.NewLatencyHistogram15s32("api.iters_to_points") + + // metric api.request_handle is how long it takes to handle a render request + reqHandleDuration = stats.NewLatencyHistogram15s32("api.request_handle") + + // metric api.requests_span.mem_and_cassandra is the timerange of requests hitting both in-memory and cassandra + reqSpanBoth = stats.NewMeter32("api.requests_span.mem_and_cassandra", false) + + // metric api.requests_span.mem is the timerange of requests hitting only the ringbuffer + reqSpanMem = stats.NewMeter32("api.requests_span.mem", false) ) type Server struct { @@ -50,13 +57,7 @@ func (s *Server) BindBackendStore(store mdata.Store) { s.BackendStore = store } -func NewServer(stats met.Backend) (*Server, error) { - - reqSpanMem = stats.NewMeter("requests_span.mem", 0) - reqSpanBoth = stats.NewMeter("requests_span.mem_and_cassandra", 0) - getTargetDuration = stats.NewTimer("get_target_duration", 0) - itersToPointsDuration = stats.NewTimer("iters_to_points_duration", 0) - reqHandleDuration = stats.NewTimer("request_handle_duration", 0) +func NewServer() (*Server, error) { m := macaron.New() m.Use(macaron.Logger()) diff --git a/api/dataprocessor.go b/api/dataprocessor.go index 36ff9d28ac..6e6e96c5f0 100644 --- a/api/dataprocessor.go +++ b/api/dataprocessor.go @@ -408,7 +408,7 @@ func (s *Server) getSeries(req models.Req, consolidator consolidation.Consolidat } } if oldest > fromUnix { - reqSpanBoth.Value(int64(toUnix - fromUnix)) + reqSpanBoth.ValueUint32(toUnix - fromUnix) if consolidator != consolidation.None { key = aggMetricKey(key, consolidator.Archive(), interval) } @@ -422,7 +422,7 @@ func (s *Server) getSeries(req models.Req, consolidator consolidation.Consolidat } iters = append(iters, storeIters...) } else { - reqSpanMem.Value(int64(toUnix - fromUnix)) + reqSpanMem.ValueUint32(toUnix - fromUnix) } pre := time.Now() iters = append(iters, memIters...) diff --git a/api/dataprocessor_test.go b/api/dataprocessor_test.go index f378fbd3e9..9c3c03b03a 100644 --- a/api/dataprocessor_test.go +++ b/api/dataprocessor_test.go @@ -2,7 +2,6 @@ package api import ( "fmt" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/api/models" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/consolidation" @@ -542,17 +541,13 @@ func TestPrevBoundary(t *testing.T) { // TestGetSeries assures that series data is returned in proper form. 
func TestGetSeries(t *testing.T) { cluster.Init("default", "test", time.Now()) - stats, _ := helper.New(false, "", "standard", "metrictank", "") store := mdata.NewDevnullStore() metrics := mdata.NewAggMetrics(store, 600, 10, 0, 0, 0, 0, []mdata.AggSetting{}) - mdata.InitMetrics(stats) addr = "localhost:6060" - srv, _ := NewServer(stats) + srv, _ := NewServer() srv.BindBackendStore(store) srv.BindMemoryStore(metrics) - defer metrics.Stop() - // the tests below cycle through every possible combination of: // * every possible data offset (against its quantized version) e.g. offset between 0 and interval-1 // * every possible `from` offset (against its quantized query results) e.g. offset between 0 and interval-1 diff --git a/cassandra/metrics.go b/cassandra/metrics.go index 618807fbf5..7a03e3fd61 100644 --- a/cassandra/metrics.go +++ b/cassandra/metrics.go @@ -5,45 +5,73 @@ import ( "strings" "github.com/gocql/gocql" - "github.com/raintank/met" + "github.com/raintank/metrictank/stats" ) -type Metrics struct { - cassErrTimeout met.Count - cassErrTooManyTimeouts met.Count - cassErrConnClosed met.Count - cassErrNoConns met.Count - cassErrUnavailable met.Count - cassErrCannotAchieveConsistency met.Count - cassErrOther met.Count +type ErrMetrics struct { + cassErrTimeout *stats.Counter32 + cassErrTooManyTimeouts *stats.Counter32 + cassErrConnClosed *stats.Counter32 + cassErrNoConns *stats.Counter32 + cassErrUnavailable *stats.Counter32 + cassErrCannotAchieveConsistency *stats.Counter32 + cassErrOther *stats.Counter32 } -func NewMetrics(component string, stats met.Backend) Metrics { - return Metrics{ - cassErrTimeout: stats.NewCount(fmt.Sprintf("%s.error.timeout", component)), - cassErrTooManyTimeouts: stats.NewCount(fmt.Sprintf("%s.error.too-many-timeouts", component)), - cassErrConnClosed: stats.NewCount(fmt.Sprintf("%s.error.conn-closed", component)), - cassErrNoConns: stats.NewCount(fmt.Sprintf("%s.error.no-connections", component)), - cassErrUnavailable: stats.NewCount(fmt.Sprintf("%s.error.unavailable", component)), - cassErrCannotAchieveConsistency: stats.NewCount(fmt.Sprintf("%s.error.cannot-achieve-consistency", component)), - cassErrOther: stats.NewCount(fmt.Sprintf("%s.error.other", component)), +func NewErrMetrics(component string) ErrMetrics { + return ErrMetrics{ + + // metric idx.cassandra.error.timeout is a counter of timeouts seen to the cassandra idx + + // metric store.cassandra.error.timeout is a counter of timeouts seen to the cassandra store + cassErrTimeout: stats.NewCounter32(fmt.Sprintf("%s.error.timeout", component)), + + // metric idx.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra idx + + // metric store.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra store + cassErrTooManyTimeouts: stats.NewCounter32(fmt.Sprintf("%s.error.too-many-timeouts", component)), + + // metric idx.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra idx + + // metric store.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra store + cassErrConnClosed: stats.NewCounter32(fmt.Sprintf("%s.error.conn-closed", component)), + + // metric idx.cassandra.error.no-connections is a counter of how many times we had no connections remaining to the cassandra idx + + // metric store.cassandra.error.no-connections is a counter of how many times we
had no connections remaining to the cassandra store + cassErrNoConns: stats.NewCounter32(fmt.Sprintf("%s.error.no-connections", component)), + + // metric idx.cassandra.error.unavailable is a counter of how many times the cassandra idx was unavailable + + // metric store.cassandra.error.unavailable is a counter of how many times the cassandra store was unavailable + cassErrUnavailable: stats.NewCounter32(fmt.Sprintf("%s.error.unavailable", component)), + + // metric idx.cassandra.error.cannot-achieve-consistency is a counter of the cassandra idx not being able to achieve consistency for a given query + + // metric store.cassandra.error.cannot-achieve-consistency is a counter of the cassandra store not being able to achieve consistency for a given query + cassErrCannotAchieveConsistency: stats.NewCounter32(fmt.Sprintf("%s.error.cannot-achieve-consistency", component)), + + // metric idx.cassandra.error.other is a counter of other errors talking to the cassandra idx + + // metric store.cassandra.error.other is a counter of other errors talking to the cassandra store + cassErrOther: stats.NewCounter32(fmt.Sprintf("%s.error.other", component)), } } -func (m *Metrics) Inc(err error) { +func (m *ErrMetrics) Inc(err error) { if err == gocql.ErrTimeoutNoResponse { - m.cassErrTimeout.Inc(1) + m.cassErrTimeout.Inc() } else if err == gocql.ErrTooManyTimeouts { - m.cassErrTooManyTimeouts.Inc(1) + m.cassErrTooManyTimeouts.Inc() } else if err == gocql.ErrConnectionClosed { - m.cassErrConnClosed.Inc(1) + m.cassErrConnClosed.Inc() } else if err == gocql.ErrNoConnections { - m.cassErrNoConns.Inc(1) + m.cassErrNoConns.Inc() } else if err == gocql.ErrUnavailable { - m.cassErrUnavailable.Inc(1) + m.cassErrUnavailable.Inc() } else if strings.HasPrefix(err.Error(), "Cannot achieve consistency level") { - m.cassErrCannotAchieveConsistency.Inc(1) + m.cassErrCannotAchieveConsistency.Inc() } else { - m.cassErrOther.Inc(1) + m.cassErrOther.Inc() } } diff --git a/cluster/node.go b/cluster/node.go index dd26b2a8a5..44c7c68362 100644 --- a/cluster/node.go +++ b/cluster/node.go @@ -13,9 +13,12 @@ import ( "time" "github.com/google/go-querystring/query" + "github.com/raintank/metrictank/stats" "github.com/raintank/worldping-api/pkg/log" ) +var clusterPrimary = stats.NewBool("cluster.primary") + //go:generate stringer -type=NodeState type NodeState int @@ -125,11 +128,14 @@ func (n *Node) SetReadyIn(t time.Duration) { }() } +// SetPrimary sets the primary status. +// Note: since we set the primary metric here, this should only be called on ThisNode ! 
func (n *Node) SetPrimary(p bool) { n.Lock() n.primary = p n.primaryChange = time.Now() n.Unlock() + clusterPrimary.Set(p) } func (n *Node) SetState(s NodeState) { diff --git a/dashboard.json b/dashboard.json index 9e68643b54..a9e40d10f9 100644 --- a/dashboard.json +++ b/dashboard.json @@ -100,10 +100,11 @@ "color": "#890F02" }, { - "alias": "/backpressure/", - "fill": 4, + "alias": "/pressure\\./", + "fill": 0, "lines": true, - "nullPointMode": "null", + "linewidth": 1, + "nullPointMode": "connected", "points": false, "yaxis": 2 } @@ -114,88 +115,32 @@ "targets": [ { "refId": "A", - "target": "aliasSub(stats.$environment.metrictank.$instance.*.metrics_received, '.*\\.([^\\.]+)\\.metrics_received', '\\1 in')", + "target": "aliasSub(perSecond(metrictank.stats.$environment.$instance.input.*.metrics_received.counter32), '.*\\.([^\\.]+)\\.metrics_received.*', '\\1 in')", "textEditor": false }, { - "hide": true, "refId": "B", - "target": "alias(stats.$environment.gauges.metrictank.$instance.cluster.primary, 'primary')" + "target": "alias(perSecond(metrictank.stats.$environment.$instance.tank.metrics_too_old.counter32), 'too old')" }, { "refId": "C", - "target": "alias(stats.$environment.metrictank.$instance.metrics_too_old, 'too old')" + "target": "alias(perSecond(metrictank.stats.$environment.$instance.tank.add_to_saved_chunk.counter32), 'add-to-saved')" }, { "refId": "D", - "target": "alias(stats.$environment.metrictank.$instance.add_to_saved_chunk, 'add-to-saved')" + "target": "alias(perSecond(metrictank.stats.$environment.$instance.tank.add_to_saving_chunk.counter32), 'add-to-saving')" }, { "refId": "E", - "target": "alias(stats.$environment.metrictank.$instance.add_to_saving_chunk, 'add-to-saving')" + "target": "aliasSub(perSecond(metrictank.stats.$environment.$instance.input.*.metrics_decode_err.counter32), '.*\\.([^\\.]+)\\.metrics_decode_err.*', '\\1 decode err')" }, { "refId": "F", - "target": "aliasSub(stats.$environment.metrictank.$instance.*.metrics_decode_err, '.*\\.([^\\.]+)\\.metrics_decode_err', '\\1 decode err')", - "textEditor": false + "target": "aliasSub(perSecond(metrictank.stats.$environment.$instance.input.*.metric_invalid.counter32), '.*\\.([^\\.]+)\\.metric_invalid.*', '\\1 metric invalid')" }, { "refId": "G", - "target": "aliasSub(stats.$environment.metrictank.$instance.*.metric_invalid, '.*\\.([^\\.]+)\\.metric_invalid', '\\1 metric invalid')", - "textEditor": false - }, - { - "refId": "H", - "target": "alias(movingAverage(multiplySeries(stats.$environment.timers.metrictank.$instance.idx.elasticsearch.add_duration.mean, stats.$environment.timers.metrictank.$instance.idx.elasticsearch.add_duration.count_ps), 5), 'ES backpressure')", - "textEditor": true - }, - { - "hide": true, - "refId": "I", - "target": "stats.$environment.gauges.metrictank.$instance.cassandra.write_queue.size", - "textEditor": false - }, - { - "hide": true, - "refId": "J", - "target": "stats.$environment.gauges.metrictank.$instance.cassandra.num_writers", - "textEditor": false - }, - { - "hide": true, - "refId": "K", - "target": "multiplySeries(#I,#J)", - "textEditor": false - }, - { - "hide": true, - "refId": "L", - "target": "maxSeries(stats.$environment.timers.metrictank.$instance.cassandra.write_queue.*.items.upper)", - "textEditor": false - }, - { - "hide": true, - "refId": "M", - "target": "alias(diffSeries(#K, multiplySeries(#L,#J)),'remaining capacity')", - "textEditor": true - }, - { - "hide": true, - "refId": "N", - "target": 
"sumSeries(stats.$environment.metrictank.$instance.*.metrics_received)", - "textEditor": false - }, - { - "hide": true, - "refId": "O", - "target": "alias(offset(transformNull(removeAboveValue(#M,1),-1),1),'blocked')", - "textEditor": true - }, - { - "hide": false, - "refId": "P", - "target": "alias(scale(multiplySeries(#O, #N), 0.1), 'store backpressure')", - "textEditor": true + "target": "aliasByNode(scale(perSecond(metrictank.stats.$environment.$instance.input.carbon.pressure.*.counter32), 0.000000001), 6, 7)" } ], "timeFrom": null, @@ -220,7 +165,7 @@ "show": true }, { - "format": "short", + "format": "percentunit", "logBase": 1, "max": null, "min": 0, @@ -277,7 +222,7 @@ { "alias": "GCd metrics/s", "lines": false, - "pointradius": 2, + "pointradius": 1, "points": true }, { @@ -293,20 +238,20 @@ { "hide": false, "refId": "A", - "target": "aliasByNode(stats.$environment.gauges.metrictank.$instance.total_points, 5)" + "target": "aliasByNode(metrictank.stats.$environment.$instance.tank.total_points.gauge64, 5)" }, { "hide": false, "refId": "B", - "target": "aliasByNode(stats.$environment.gauges.metrictank.$instance.metrics_active, 5)" + "target": "aliasByNode(metrictank.stats.$environment.$instance.tank.metrics_active.gauge32, 5)" }, { "refId": "C", - "target": "alias(stats.$environment.metrictank.$instance.gc_metric, 'GCd metrics/s')", + "target": "alias(perSecond(metrictank.stats.$environment.$instance.tank.gc_metric.counter32), 'GCd metrics/s')", "textEditor": false }, { - "target": "alias(stats.$environment.gauges.metrictank.$instance.gc.heap_objects, 'heap objects')", + "target": "alias(metrictank.stats.$environment.$instance.memory.gc.heap_objects.gauge64, 'heap objects')", "refId": "D" } ], @@ -391,11 +336,11 @@ "targets": [ { "refId": "B", - "target": "alias(stats.$environment.gauges.metrictank.$instance.cluster.primary, 'primary')" + "target": "alias(metrictank.stats.$environment.$instance.cluster.primary.gauge1, 'primary')" }, { "refId": "A", - "target": "alias(stats.$environment.gauges.metrictank.$instance.cluster.promotion_wait, 'promotion wait')" + "target": "alias(metrictank.stats.$environment.$instance.cluster.promotion_wait.gauge32, 'promotion wait')" } ], "timeFrom": null, @@ -491,29 +436,29 @@ "steppedLine": false, "targets": [ { - "hide": true, + "hide": false, "refId": "A", - "target": "alias(stats.$environment.gauges.metrictank.$instance.bytes_alloc.incl_freed, 'alloc incl freed')" + "target": "alias(perSecond(metrictank.stats.$environment.$instance.memory.total_bytes_allocated.counter64), 'alloc rate')" }, { "hide": false, "refId": "B", - "target": "alias(stats.$environment.gauges.metrictank.$instance.bytes_alloc.not_freed, 'alloc not freed')" + "target": "alias(metrictank.stats.$environment.$instance.memory.bytes.allocated_in_heap.gauge64, 'allocated in heap')" }, { "hide": false, "refId": "C", - "target": "alias(stats.$environment.gauges.metrictank.$instance.bytes_sys, 'bytes_sys')" + "target": "alias(metrictank.stats.$environment.$instance.memory.bytes.obtained_from_sys.gauge64, 'bytes_sys')" }, { "hide": true, "refId": "D", - "target": "alias(stats.$environment.gauges.metrictank.$instance.metrics_active, 'metrics_active')" + "target": "alias(metrictank.stats.$environment.$instance.tank.metrics_active.gauge32, 'metrics_active')" }, { "hide": true, "refId": "E", - "target": "alias(stats.$environment.gauges.metrictank.$instance.total_points, 'total_points')" + "target": "alias(metrictank.stats.$environment.$instance.tank.total_points.gauge64, 'total_points')" }, { 
"hide": false, @@ -521,15 +466,10 @@ "target": "alias(divideSeries(#B, #E), 'bytes_per_point')", "textEditor": false }, - { - "hide": false, - "refId": "G", - "target": "alias(perSecond(stats.$environment.gauges.metrictank.$instance.bytes_alloc.not_freed), 'alloc speed')" - }, { "hide": false, "refId": "H", - "target": "alias(stats.$environment.gauges.metrictank.$instance.gc.cpufraction, 'GC cpu fraction - promille')" + "target": "alias(metrictank.stats.$environment.$instance.memory.gc.cpu_fraction.gauge32, 'GC cpu fraction - promille')" } ], "timeFrom": null, @@ -587,13 +527,13 @@ "total": false, "values": false }, - "lines": true, + "lines": false, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, - "pointradius": 5, - "points": false, + "pointradius": 1, + "points": true, "renderer": "flot", "seriesOverrides": [ { @@ -610,11 +550,11 @@ "targets": [ { "refId": "A", - "target": "alias(perSecond(stats.$environment.gauges.metrictank.$instance.gc.num), 'collections')" + "target": "alias(perSecond(metrictank.stats.$environment.$instance.memory.total_gc_cycles.counter64), 'collections')" }, { "refId": "B", - "target": "alias(stats.$environment.gauges.metrictank.$instance.gc.dur, 'duration')" + "target": "alias(metrictank.stats.$environment.$instance.memory.gc.last_duration.gauge64, 'duration')" } ], "timeFrom": null, @@ -708,23 +648,23 @@ "targets": [ { "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.request_handle_duration.median, 'median')" + "target": "alias(metrictank.stats.$environment.$instance.api.request_handle.latency.median.gauge32, 'median')" }, { "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.request_handle_duration.upper_90, 'p90')" + "target": "alias(metrictank.stats.$environment.$instance.api.request_handle.latency.p90.gauge32, 'p90')" }, { "refId": "C", - "target": "alias(stats.$environment.timers.metrictank.$instance.request_handle_duration.upper, 'max')" + "target": "alias(metrictank.stats.$environment.$instance.api.request_handle.latency.max.gauge32, 'max')" }, { "refId": "D", - "target": "aliasByNode(perSecond(collectd.$environment.metrictank.$instance.cpu.*.cpu.idle), 5, 7)" + "target": "aliasByNode(perSecond(collectd.$environment.$instance.cpu.*.cpu.idle), 5, 7)" }, { "refId": "E", - "target": "alias(stats.$environment.timers.metrictank.$instance.request_handle_duration.count_ps, 'reqs')" + "target": "alias(metrictank.stats.$environment.$instance.api.request_handle.values.rate32, 'reqs')" } ], "timeFrom": null, @@ -814,42 +754,42 @@ { "hide": false, "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem_and_cassandra.lower, 'mem-cass min')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem_and_cassandra.min.gauge32, 'mem-cass min')" }, { "hide": true, "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem_and_cassandra.upper_90, 'mem-cass p90')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem_and_cassandra.p90.gauge32, 'mem-cass p90')" }, { "hide": false, "refId": "C", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem_and_cassandra.median, 'mem-cass med')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem_and_cassandra.median.gauge32, 'mem-cass med')" }, { "hide": false, "refId": "D", - "target": 
"alias(stats.$environment.timers.metrictank.$instance.requests_span.mem_and_cassandra.upper, 'mem-cass max')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem_and_cassandra.max.gauge32, 'mem-cass max')" }, { "hide": false, "refId": "E", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem.lower, 'mem min')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem.min.gauge32, 'mem min')" }, { "hide": false, "refId": "F", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem.median, 'mem med')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem.median.gauge32, 'mem med')" }, { "hide": true, "refId": "G", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem.upper_90, 'mem p90')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem.p90.gauge32, 'mem p90')" }, { "hide": false, "refId": "H", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem.upper, 'mem max')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem.max.gauge32, 'mem max')" } ], "timeFrom": null, @@ -930,12 +870,12 @@ { "hide": false, "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem_and_cassandra.count_ps, 'mem-cass req/s')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem_and_cassandra.values.count32, 'mem-cass req/s')" }, { "hide": false, "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.requests_span.mem.count_ps, 'mem req/s')" + "target": "alias(metrictank.stats.$environment.$instance.api.requests_span.mem.values.count32, 'mem req/s')" } ], "timeFrom": null, @@ -1013,30 +953,30 @@ "targets": [ { "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.request_handle_duration.mean, 'total request handle')" + "target": "alias(metrictank.stats.$environment.$instance.api.request_handle.latency.mean.gauge32, 'total request handle')" }, { "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.get_chunks.mean, 'cassandra get chunks')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.get_chunks.latency.mean.gauge32, 'cassandra get chunks')" }, { "refId": "C", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.to_iter.mean, 'cassandra to iter')", + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.to_iter.latency.mean.gauge32, 'cassandra to iter')", "textEditor": false }, { "refId": "D", - "target": "alias(stats.$environment.timers.metrictank.$instance.get_target_duration.mean, 'total getTarget')", + "target": "alias(metrictank.stats.$environment.$instance.api.get_target.latency.mean.gauge32, 'total getTarget')", "textEditor": false }, { "refId": "E", - "target": "alias(stats.$environment.timers.metrictank.$instance.iters_to_points_duration.mean, 'iters to points')", + "target": "alias(metrictank.stats.$environment.$instance.api.iters_to_points.latency.mean.gauge32, 'iters to points')", "textEditor": false }, { "refId": "F", - "target": "alias(stats.$environment.timers.metrictank.$instance.mem.to_iter_duration.mean, 'mem to iter')", + "target": "alias(metrictank.stats.$environment.$instance.api.mem.to_iter.latency.mean.gauge32, 'mem to iter')", "textEditor": false } ], @@ -1131,7 +1071,7 @@ "linewidth": 1 }, { - "alias": 
"stats.production.timers.metrictank.metric-tank-3-prod.persist_duration.upper", + "alias": "stats.production.timers.metrictank.metric-tank-3-prod.persist.latency.max", "yaxis": 2 } ], @@ -1142,22 +1082,22 @@ { "hide": false, "refId": "A", - "target": "aliasByNode(stats.$environment.timers.metrictank.$instance.cassandra.write_queue.*.items.upper, 6, 7, 8)" + "target": "aliasByNode(metrictank.stats.$environment.$instance.store.cassandra.write_queue.*.items.max.gauge32, 6, 7, 8)" }, { "hide": false, "refId": "B", - "target": "alias(stats.$environment.gauges.metrictank.$instance.cassandra.write_queue.size, 'write queue items limit (each)')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.write_queue.size.gauge32, 'write queue items limit (each)')" }, { "hide": false, "refId": "C", - "target": "alias(stats.$environment.gauges.metrictank.$instance.cassandra.num_writers, 'num write-workers')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.num_writers.gauge32, 'num write-workers')" }, { "hide": false, "refId": "D", - "target": "alias(consolidateBy(stats.$environment.timers.metrictank.$instance.persist_duration.upper_75, 'sum'), 'persist-duration p75')" + "target": "alias(consolidateBy(metrictank.stats.$environment.$instance.persist.latency.max.gauge32, 'sum'), 'persist-duration max')" } ], "timeFrom": null, @@ -1193,7 +1133,7 @@ { "aliasColors": { "message_size.mean": "#2F575E", - "message_size.upper": "#EF843C" + "message_size.max": "#EF843C" }, "bars": false, "datasource": "$datasource", @@ -1239,11 +1179,11 @@ "targets": [ { "refId": "A", - "target": "aliasByNode(stats.$environment.timers.metrictank.$instance.notifier.*.message_size.{mean,upper}, 6, 7, 8)" + "target": "aliasByNode(metrictank.stats.$environment.$instance.cluster.notifier.*.message_size.{mean,max}.gauge32, 6, 7, 8)" }, { "refId": "B", - "target": "aliasByNode(stats.$environment.metrictank.$instance.notifier.*.messages-published, 5, 6)" + "target": "aliasByNode(perSecond(metrictank.stats.$environment.$instance.cluster.notifier.*.messages-published.counter32), 5, 6)" } ], "timeFrom": null, @@ -1327,11 +1267,11 @@ "targets": [ { "refId": "E", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.get.wait.upper_90, 'get p90')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.get.wait.latency.p90.gauge32, 'get p90')" }, { "refId": "F", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.put.wait.upper_90, 'put p90')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.put.wait.latency.p90.gauge32, 'put p90')" } ], "timeFrom": null, @@ -1417,27 +1357,27 @@ "targets": [ { "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.chunk_size.at_load.median, 'size at load median')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunk_size.at_load.median.gauge32, 'size at load median')" }, { "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.chunk_size.at_load.upper_90, 'size at load p90')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunk_size.at_load.p90.gauge32, 'size at load p90')" }, { "refId": "C", - "target": "alias(stats.$environment.timers.metrictank.$instance.chunk_size.at_load.upper, 'size at load max')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunk_size.at_load.max.gauge32, 'size at load max')" }, { "refId": "D", - "target": 
"alias(stats.$environment.timers.metrictank.$instance.chunk_size.at_save.median, 'size at save median')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunk_size.at_save.median.gauge32, 'size at save median')" }, { "refId": "E", - "target": "alias(stats.$environment.timers.metrictank.$instance.chunk_size.at_save.upper_90, 'size at save p90')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunk_size.at_save.p90.gauge32, 'size at save p90')" }, { "refId": "F", - "target": "alias(stats.$environment.timers.metrictank.$instance.chunk_size.at_save.upper, 'size at save max')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunk_size.at_save.max.gauge32, 'size at save max')" } ], "timeFrom": null, @@ -1516,11 +1456,11 @@ "targets": [ { "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.put.exec.count_ps, 'put/s')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.put.exec.values.rate32, 'put/s')" }, { "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.get.exec.count_ps, 'get/s')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.get.exec.values.rate32, 'get/s')" } ], "timeFrom": null, @@ -1601,19 +1541,19 @@ "targets": [ { "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.put.exec.upper_90, 'put p90')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.put.exec.latency.p90.gauge32, 'put p90')" }, { "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.put.exec.mean, 'put mean')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.put.exec.latency.mean.gauge32, 'put mean')" }, { "refId": "C", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.get.exec.upper_90, 'get p90')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.get.exec.latency.p90.gauge32, 'get p90')" }, { "refId": "D", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.get.exec.mean, 'get mean')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.get.exec.latency.mean.gauge32, 'get mean')" } ], "timeFrom": null, @@ -1710,27 +1650,27 @@ "targets": [ { "refId": "A", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.chunks_per_row.median, 'chunks per row med')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunks_per_row.median.gauge32, 'chunks per row med')" }, { "refId": "B", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.chunks_per_row.lower, 'chunks per row min')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunks_per_row.min.gauge32, 'chunks per row min')" }, { "refId": "C", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.chunks_per_row.upper, 'chunks per row max')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunks_per_row.max.gauge32, 'chunks per row max')" }, { "refId": "D", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.chunks_per_row.median, 'rows per resp med')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.chunks_per_row.median.gauge32, 'rows per resp med')" }, { "refId": "E", - "target": 
"alias(stats.$environment.timers.metrictank.$instance.cassandra.rows_per_response.lower, 'rows per resp min')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.rows_per_response.min.gauge32, 'rows per resp min')" }, { "refId": "F", - "target": "alias(stats.$environment.timers.metrictank.$instance.cassandra.rows_per_response.upper, 'rows per resp max')" + "target": "alias(metrictank.stats.$environment.$instance.store.cassandra.rows_per_response.max.gauge32, 'rows per resp max')" } ], "timeFrom": null, @@ -1774,7 +1714,7 @@ "targets": [ { "refId": "A", - "target": "aliasByNode(stats.$environment.metrictank.$instance.cassandra.error.*, 6)" + "target": "aliasByNode(perSecond(metrictank.stats.$environment.$instance.store.cassandra.error.*.counter32), 6)" } ], "datasource": "$datasource", @@ -1786,7 +1726,7 @@ "logBase": 1, "min": null, "max": null, - "format": "short" + "format": "hertz" }, { "label": null, @@ -1886,19 +1826,7 @@ "targets": [ { "refId": "A", - "target": "alias(stats.$environment.metrictank.$instance.chunks.create, 'create/s')" - }, - { - "refId": "B", - "target": "alias(stats.$environment.metrictank.$instance.chunks.clear, 'clear/s')" - }, - { - "refId": "C", - "target": "alias(stats.$environment.metrictank.$instance.chunks.save_ok, 'save_ok/s')" - }, - { - "refId": "D", - "target": "alias(stats.$environment.metrictank.$instance.chunks.save_fail, 'save_fail/s')" + "target": "aliasByNode(perSecond(metrictank.stats.$environment.$instance.*.chunk_operations.*.counter32), 6)" } ], "timeFrom": null, @@ -1916,7 +1844,7 @@ }, "yaxes": [ { - "format": "short", + "format": "hertz", "logBase": 1, "max": null, "min": null, @@ -1954,9 +1882,9 @@ "memory.ok": "#7EB26D", "metrics_to_es.fail": "#BF1B00", "metrics_to_es.ok": "#629E51", - "upper": "#DEDAF7", - "upper_75": "#0A50A1", - "upper_90": "#806EB7" + "max": "#DEDAF7", + "p75": "#0A50A1", + "p90": "#806EB7" }, "bars": false, "datasource": "$datasource", @@ -2011,12 +1939,12 @@ "targets": [ { "refId": "C", - "target": "aliasByNode(stats.$environment.metrictank.$instance.idx.*.*, 5, 6)", + "target": "aliasByNode(perSecond(metrictank.stats.$environment.$instance.idx.*.add.{ok,fail}.counter32), 5, 7)", "textEditor": false }, { "refId": "A", - "target": "alias(stats.$environment.gauges.metrictank.$instance.idx.elasticsearch.retrybuf.items, 'es-retrybuf-items')", + "target": "alias(metrictank.stats.$environment.$instance.idx.elasticsearch.retrybuf.items.gauge32, 'es-retrybuf-items')", "textEditor": false } ], @@ -2063,7 +1991,7 @@ "targets": [ { "refId": "A", - "target": "aliasByNode(stats.$environment.metrictank.$instance.idx.cassandra.error.*, 7)" + "target": "aliasByNode(perSecond(metrictank.stats.$environment.$instance.idx.cassandra.error.*.counter32), 7)" } ], "datasource": "$datasource", @@ -2075,7 +2003,7 @@ "logBase": 1, "min": null, "max": null, - "format": "short" + "format": "hertz" }, { "label": null, @@ -2137,9 +2065,9 @@ "aliasColors": { "cache hit": "#3F6833", "duration median": "#BA43A9", - "duration upper_90": "#511749", + "duration p90": "#511749", "metricdef lookup duration median": "#E0752D", - "metricdef lookup duration upper_90": "#BF1B00" + "metricdef lookup duration p90": "#BF1B00" }, "bars": false, "datasource": "$datasource", @@ -2175,7 +2103,7 @@ "renderer": "flot", "seriesOverrides": [ { - "alias": "/upper/", + "alias": "/max/", "fill": 1, "lines": true, "linewidth": 0, @@ -2188,7 +2116,7 @@ "targets": [ { "refId": "A", - "target": 
"aliasByNode(stats.$environment.timers.metrictank.$instance.idx.*.*.{mean,upper_90,upper}, 6, 7, 8)" + "target": "aliasByNode(metrictank.stats.$environment.$instance.idx.*.*.latency.{mean,p90,max}.gauge32, 5, 6, 8)" } ], "timeFrom": null, @@ -2223,10 +2151,265 @@ } ], "title": "New row" + }, + { + "title": "stats", + "height": "250px", + "editable": true, + "collapse": false, + "panels": [ + { + "title": "message size", + "error": false, + "span": 4, + "editable": true, + "type": "graph", + "isNew": true, + "id": 24, + "targets": [ + { + "target": "aliasByNode(metrictank.stats.$environment.$instance.stats.message_size.gauge32, 5)", + "refId": "A" + } + ], + "datasource": "$datasource", + "renderer": "flot", + "yaxes": [ + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "bytes" + }, + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "short" + } + ], + "xaxis": { + "show": true + }, + "grid": { + "threshold1": null, + "threshold2": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "lines": true, + "fill": 1, + "linewidth": 2, + "points": false, + "pointradius": 5, + "bars": false, + "stack": false, + "percentage": false, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "nullPointMode": "connected", + "steppedLine": false, + "tooltip": { + "value_type": "cumulative", + "shared": true, + "sort": 0, + "msResolution": false + }, + "timeFrom": null, + "timeShift": null, + "aliasColors": {}, + "seriesOverrides": [], + "links": [] + }, + { + "title": "conn to graphite", + "error": false, + "span": 4, + "editable": true, + "type": "graph", + "isNew": true, + "id": 25, + "targets": [ + { + "target": "alias(metrictank.stats.$environment.$instance.stats.graphite.connected.gauge1, 'connected')", + "refId": "A" + }, + { + "target": "alias(metrictank.stats.$environment.$instance.stats.graphite.flush.latency.p90.gauge32, 'flush latency')", + "refId": "B" + }, + { + "refId": "C", + "target": "alias(metrictank.stats.$environment.$instance.stats.graphite.flush.values.rate32, 'flush rate')" + } + ], + "datasource": "$datasource", + "renderer": "flot", + "yaxes": [ + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "short" + }, + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "short" + } + ], + "xaxis": { + "show": true + }, + "grid": { + "threshold1": null, + "threshold2": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "lines": true, + "fill": 1, + "linewidth": 2, + "points": false, + "pointradius": 5, + "bars": false, + "stack": false, + "percentage": false, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "nullPointMode": "null", + "steppedLine": false, + "tooltip": { + "value_type": "cumulative", + "shared": true, + "sort": 0, + "msResolution": false + }, + "timeFrom": null, + "timeShift": null, + "aliasColors": { + "queue-limit": "#BF1B00", + "queue-items": "#7EB26D" + }, + "seriesOverrides": [ + { + "alias": "connected", + "yaxis": 2, + "fill": 4, + "linewidth": 0 + } + ], + "links": [] + }, + { + "title": "queue to graphite", + "error": false, + "span": 4, + "editable": true, + "type": "graph", + "isNew": true, + "id": 26, + 
"targets": [ + { + "target": "alias(metrictank.stats.$environment.$instance.stats.graphite.write_queue.size.gauge32, 'queue-limit')", + "refId": "A" + }, + { + "target": "alias(metrictank.stats.$environment.$instance.stats.graphite.write_queue.items.max.gauge32, 'queue-items')", + "refId": "B" + } + ], + "datasource": "$datasource", + "renderer": "flot", + "yaxes": [ + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "short" + }, + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "short" + } + ], + "xaxis": { + "show": true + }, + "grid": { + "threshold1": null, + "threshold2": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "lines": false, + "fill": 1, + "linewidth": 2, + "points": true, + "pointradius": 1, + "bars": false, + "stack": false, + "percentage": false, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "nullPointMode": "connected", + "steppedLine": false, + "tooltip": { + "value_type": "cumulative", + "shared": true, + "sort": 0, + "msResolution": false + }, + "timeFrom": null, + "timeShift": null, + "aliasColors": { + "queue-limit": "#BF1B00", + "queue-items": "#7EB26D" + }, + "seriesOverrides": [], + "links": [] + } + ] } ], "time": { - "from": "now-1h", + "from": "now-5m", "to": "now" }, "timepicker": { @@ -2285,7 +2468,7 @@ "multiFormat": "glob", "name": "environment", "options": [], - "query": "stats.*", + "query": "metrictank.stats.*", "refresh": 1, "refresh_on_load": false, "regex": "", @@ -2301,7 +2484,7 @@ "multiFormat": "glob", "name": "instance", "options": [], - "query": "stats.$environment.metrictank.*", + "query": "metrictank.stats.$environment.*", "refresh": 1, "refresh_on_load": false, "regex": "", diff --git a/docker/docker-cluster/docker-compose.yml b/docker/docker-cluster/docker-compose.yml index 4121543e3d..936e8e9b2e 100644 --- a/docker/docker-cluster/docker-compose.yml +++ b/docker/docker-cluster/docker-compose.yml @@ -20,7 +20,6 @@ services: MT_CLUSTER_PRIMARY_NODE: "true" links: - cassandra - - statsdaemon metrictank1: hostname: metrictank1 @@ -40,7 +39,6 @@ services: MT_CLUSTER_PEERS: metrictank0:6060,metrictank2:6060,metrictank3:6060 links: - cassandra - - statsdaemon - metrictank0 metrictank2: @@ -62,7 +60,6 @@ services: MT_CLUSTER_PRIMARY_NODE: "true" links: - cassandra - - statsdaemon - metrictank0 metrictank3: @@ -83,7 +80,6 @@ services: MT_CLUSTER_PEERS: metrictank0:6060,metrictank1:6060,metrictank2:6060 links: - cassandra - - statsdaemon - metrictank0 cassandra: diff --git a/docker/docker-dev-custom-cfg-kafka/README.md b/docker/docker-dev-custom-cfg-kafka/README.md new file mode 100644 index 0000000000..1566b939ff --- /dev/null +++ b/docker/docker-dev-custom-cfg-kafka/README.md @@ -0,0 +1,6 @@ +this docker environment features: +* your build binary +* a custom config +* kafka input and clustering backend +* short chunkspan & numchunks + (benefits: easy to trigger mem and mem_and_cass requests, frequent cass saves, notifier messages etc) diff --git a/docker/docker-dev-custom-cfg-kafka/datasources/metrictank b/docker/docker-dev-custom-cfg-kafka/datasources/metrictank new file mode 100644 index 0000000000..25ebf9433e --- /dev/null +++ b/docker/docker-dev-custom-cfg-kafka/datasources/metrictank @@ -0,0 +1,7 @@ +{ + "name":"metrictank", + "type":"graphite", + "url":"http://localhost:8080", + "access":"direct", + 
"isDefault":true +} diff --git a/docker/docker-dev-custom-cfg-kafka/docker-compose.yml b/docker/docker-dev-custom-cfg-kafka/docker-compose.yml new file mode 100644 index 0000000000..79332f2c59 --- /dev/null +++ b/docker/docker-dev-custom-cfg-kafka/docker-compose.yml @@ -0,0 +1,64 @@ +version: '2' + +services: + metrictank: + hostname: metrictank + image: raintank/metrictank + ports: + - "6060:6060" + - "2003:2003" + volumes: + - ../../build/metrictank:/usr/bin/metrictank + - ./metrictank.ini:/etc/raintank/metrictank.ini + environment: + WAIT_HOSTS: cassandra:9042,kafka:9092 + WAIT_TIMEOUT: 60 + links: + - cassandra + + kafka: + hostname: kafka + image: raintank/kafka + environment: + ADVERTISED_HOST: kafka + NUM_PARTITIONS: 8 + ports: + - "2181:2181" + - "9092:9092" + - "9999:9999" + volumes: + - /tmp/kafka-logs + + cassandra: + hostname: cassandra + image: cassandra:3.0.8 + environment: + MAX_HEAP_SIZE: 1G + HEAP_NEWSIZE: 256M + ports: + - "9042:9042" + + graphite-api: + hostname: graphite-api + image: raintank/graphite-metrictank + ports: + - "8080:8080" + links: + - metrictank + - statsdaemon + volumes: + - "../graphite-metrictank.yaml:/etc/graphite-metrictank/graphite-metrictank.yaml" + + grafana: + hostname: grafana + image: grafana/grafana + ports: + - "3000:3000" + + statsdaemon: + hostname: statsdaemon + image: raintank/statsdaemon + ports: + - "8125:8125/udp" + volumes: + - "../statsdaemon.ini:/etc/statsdaemon.ini" diff --git a/docker/docker-dev-custom-cfg-kafka/metrictank.ini b/docker/docker-dev-custom-cfg-kafka/metrictank.ini new file mode 100644 index 0000000000..fe85a1b2c7 --- /dev/null +++ b/docker/docker-dev-custom-cfg-kafka/metrictank.ini @@ -0,0 +1,288 @@ +## misc ## + +# instance identifier. must be unique. used in clustering messages, for naming queue consumers and emitted metrics. +instance = default + +# accounting period to track per-org usage metrics +accounting-period = 5min + +## data ## + +# see https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md for more details +# duration of raw chunks. e.g. 10min, 30min, 1h, 90min... +chunkspan = 2min +# number of raw chunks to keep in memory. should be at least 1 more than what's needed to satisfy aggregation rules +numchunks = 1 +# minimum wait before raw metrics are removed from storage +ttl = 35d + +# max age for a chunk before to be considered stale and to be persisted to Cassandra +chunk-max-stale = 1h +# max age for a metric before to be considered stale and to be purged from memory +metric-max-stale = 6h +# Interval to run garbage collection job +gc-interval = 1h + +# duration before secondary nodes start serving requests +# shorter warmup means metrictank will need to query cassandra more if it doesn't have requested data yet. +# in clusters, best to assure the primary has saved all the data that a newly warmup instance will need to query, to prevent gaps in charts +warm-up-period = 1h + +# settings for rollups (aggregation for archives) +# comma-separated list of archive specifications. +# archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true] +# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false +# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in memory, keep for 3months in cassandra +# 1hr worth of data, in chunks of 6 hours, 2 chunks in mem, keep for 1 year, but this series is not ready yet for querying. +# When running a cluster of metrictank instances, all instances should have the same agg-settings. 
+# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md +agg-settings = + +## metric data storage in cassandra ## + +# see https://github.com/raintank/metrictank/blob/master/docs/cassandra.md for more details +# comma-separated list of hostnames to connect to +cassandra-addrs = cassandra:9042 +# keyspace to use for storing the metric data table +cassandra-keyspace = raintank +# desired write consistency (any|one|two|three|quorum|all|local_quorum|each_quorum|local_one) +cassandra-consistency = one +# how to select which hosts to query +# roundrobin : iterate all hosts, spreading queries evenly. +# hostpool-simple : basic pool that tracks which hosts are up and which are not. +# hostpool-epsilon-greedy : prefer best hosts, but regularly try other hosts to stay on top of all hosts. +# tokenaware,roundrobin : prefer host that has the needed data, fallback to roundrobin. +# tokenaware,hostpool-simple : prefer host that has the needed data, fallback to hostpool-simple. +# tokenaware,hostpool-epsilon-greedy : prefer host that has the needed data, fallback to hostpool-epsilon-greedy. +cassandra-host-selection-policy = tokenaware,hostpool-epsilon-greedy +# cassandra timeout in milliseconds +cassandra-timeout = 1000 +# max number of concurrent reads to cassandra +cassandra-read-concurrency = 20 +# max number of concurrent writes to cassandra +cassandra-write-concurrency = 10 +# max number of outstanding reads before blocking. value doesn't matter much +cassandra-read-queue-size = 100 +# write queue size per cassandra worker. should be large enough to hold at least the total number of series expected, divided by how many workers you have +cassandra-write-queue-size = 100000 +# how many times to retry a query before failing it +cassandra-retries = 0 +# CQL protocol version. cassandra 3.x needs v3 or 4. +cql-protocol-version = 4 + +# enable SSL connection to cassandra +cassandra-ssl = false +# cassandra CA certificate path when using SSL +cassandra-ca-path = /etc/raintank/ca.pem +# host (hostname and server cert) verification when using SSL +cassandra-host-verification = true +# enable cassandra user authentication +cassandra-auth = false +# username for authentication +cassandra-username = cassandra +# password for authentication +cassandra-password = cassandra + +## Profiling and logging ## + +# see https://golang.org/pkg/runtime/#SetBlockProfileRate +block-profile-rate = 0 +# 0 to disable. 1 for max precision (expensive!) see https://golang.org/pkg/runtime/#pkg-variables +mem-profile-rate = 524288 # 512*1024 + +# inspect status frequency. set to 0 to disable +proftrigger-freq = 60s +# path to store triggered profiles +proftrigger-path = /tmp +# minimum time between triggered profiles +proftrigger-min-diff = 1h +# if process consumes this many bytes (see bytes_sys in dashboard), trigger a heap profile for developer diagnosis +# set it higher than your typical memory usage, but lower than how much RAM the process can take before it gets killed +proftrigger-heap-thresh = 25000000000 + +# only log log-level and higher. 0=TRACE|1=DEBUG|2=INFO|3=WARN|4=ERROR|5=CRITICAL|6=FATAL +log-level = 2 + +# instrumentation stats +[stats] +# enable sending graphite messages for instrumentation +enabled = true +# stats prefix (will add trailing dot automatically if needed) +# The default matches what the Grafana dashboard expects +# $instance will be replaced with the `instance` setting.
+# note, the 3rd word describes the environment you deployed in. +prefix = metrictank.stats.default.$instance +# graphite address +addr = localhost:2003 +# interval at which to send statistics +interval = 1 +# how many messages (holding all measurements from one interval. rule of thumb: a message is ~25kB) to buffer up in case the graphite endpoint is unavailable. +# With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed +buffer-size = 20000 + +## http api ## +[http] +# tcp address for metrictank to bind to for its HTTP interface +listen = :6060 +# use HTTPS +ssl = false +# SSL certificate file +cert-file = /etc/ssl/certs/ssl-cert-snakeoil.pem +# SSL key file +key-file = /etc/ssl/private/ssl-cert-snakeoil.key +# limit on how many points could be requested in one request. 1M allows 500 series at a MaxDataPoints of 2000. (0 disables limit) +max-points-per-req = 1000000 +# limit on what kind of time range can be requested in one request. the default allows 500 series of 2 years. (0 disables limit) +max-days-per-req = 365000 +# only log incoming requests if their timerange is at least this duration. Use 0 to disable +log-min-dur = 5min + +## metric data inputs ## + +### carbon input (optional) +[carbon-in] +enabled = true +# tcp address +addr = :2003 +# represents the "partition" of your data if you decide to partition your data. +partition = 1 +# needed to know your raw resolution for your metrics. see http://graphite.readthedocs.io/en/latest/config-carbon.html#storage-schemas-conf +# NOTE: does NOT use aggregation and retention settings from this file. We use agg-settings and ttl for that. +schemas-file = /etc/raintank/storage-schemas.conf + +### kafka-mdm input (optional, recommended) +[kafka-mdm-in] +enabled = true +# tcp address (may be given multiple times as a comma-separated list) +brokers = kafka:9092 +# kafka topic (may be given multiple times as a comma-separated list) +topics = mdm +# offset to start consuming from. Can be one of newest, oldest, last or a time duration +# the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data +offset = last +# kafka partitions to consume. use '*' or a comma separated list of id's +partitions = * +# save interval for offsets +offset-commit-interval = 5s +# directory to store partition offsets index. supports relative or absolute paths. empty means working dir. +# it will be created (incl parent dirs) if not existing. +data-dir = /var/lib/metrictank +# The number of metrics to buffer in internal and external channels +channel-buffer-size = 1000000 +# The minimum number of message bytes to fetch in a request +consumer-fetch-min = 1 +# The default number of message bytes to fetch in a request +consumer-fetch-default = 32768 +# The maximum amount of time the broker will wait for Consumer.Fetch.Min bytes to become available before it returns fewer bytes +consumer-max-wait-time = 1s +# The maximum amount of time the consumer expects a message to take to process +consumer-max-processing-time = 1s +# How many outstanding requests a connection is allowed to have before sending on it blocks +net-max-open-requests = 100 + +## basic clustering settings ## +[cluster] +# The primary node writes data to cassandra. There should only be 1 primary node per shardGroup. +primary-node = true +# http/s addresses of other nodes, comma separated.
use this if you shard your data and want to query other instances +peers = +# Interval to probe peer nodes +probe-interval = 2s +# Operating mode of cluster. (single|multi) +mode = single + +## clustering transports for tracking chunk saves between replicated instances ## +### kafka as transport for clustering messages (recommended) +[kafka-cluster] +enabled = true +# tcp address (may be given multiple times as a comma-separated list) +brokers = kafka:9092 +# kafka topic (only one) +topic = metricpersist +# offset to start consuming from. Can be one of newest, oldest, last or a time duration +offset = last +# save interval for offsets +offset-commit-interval = 5s +# directory to store partition offsets index. supports relative or absolute paths. empty means working dir. +# it will be created (incl parent dirs) if not existing. +data-dir = /var/lib/metrictank + +### nsq as transport for clustering messages +[nsq-cluster] +enabled = false +# nsqd TCP address (may be given multiple times as comma-separated list) +nsqd-tcp-address = +# lookupd HTTP address (may be given multiple times as comma-separated list) +lookupd-http-address = +topic = metricpersist +channel = tank +# passthrough to nsq.Producer (may be given multiple times as comma-separated list, see http://godoc.org/github.com/nsqio/go-nsq#Config) +producer-opt = +# passthrough to nsq.Consumer (may be given multiple times as comma-separated list, http://godoc.org/github.com/nsqio/go-nsq#Config) +consumer-opt = +# max number of messages to allow in flight +max-in-flight = 200 + + +## metric metadata index ## + +### in-memory +[memory-idx] +enabled = false + +### in memory, elasticsearch-backed +[elasticsearch-idx] +enabled = false +# elasticsearch index name to use +index = metric +# Elasticsearch host addresses (multiple hosts can be specified as comma-separated list) +hosts = elasticsearch:9200 +# http basic auth +user = +pass = +# how often the retry buffer should be flushed to ES. Valid units are "s", "m", "h" +retry-interval = 10m +# max number of concurrent connections to ES +max-conns = 20 +# max number of docs to keep in the BulkIndexer buffer +max-buffer-docs = 1000 +# max delay before the BulkIndexer flushes its buffer +buffer-delay-max = 10s + +### in memory, cassandra-backed +[cassandra-idx] +enabled = true +# Cassandra keyspace to store metricDefinitions in. +keyspace = raintank +# comma separated list of cassandra addresses in host:port form +hosts = cassandra:9042 +# cql protocol version to use +protocol-version = 4 +# write consistency (any|one|two|three|quorum|all|local_quorum|each_quorum|local_one) +consistency = one +# cassandra request timeout +timeout = 1s +# number of concurrent connections to cassandra +num-conns = 10 +# Max number of metricDefs allowed to be unwritten to cassandra +write-queue-size = 100000 +# automatically clear series from the index if they have not been seen for this much time. +max-stale = 0 +# Interval at which the index should be checked for stale series. +prune-interval = 3h +# frequency at which we should update the metricDef lastUpdate field. +update-interval = 4h +# fuzzyness factor for update-interval. should be in the range 0 < fuzzyness <= 1. With an updateInterval of 4hours and fuzzyness of 0.5, metricDefs will be updated every 4-6hours.
+update-fuzzyness = 0.5 +# enable SSL connection to cassandra +ssl = false +# cassandra CA certificate path when using SSL +ca-path = /etc/raintank/ca.pem +# host (hostname and server cert) verification when using SSL +host-verification = true +# enable cassandra user authentication +auth = false +# username for authentication +username = cassandra +# password for authentication +password = cassandra diff --git a/docker/docker-dev/docker-compose.yml b/docker/docker-dev/docker-compose.yml index 99a315c5b4..8734269467 100644 --- a/docker/docker-dev/docker-compose.yml +++ b/docker/docker-dev/docker-compose.yml @@ -9,12 +9,12 @@ services: - "2003:2003" volumes: - ../../build/metrictank:/usr/bin/metrictank + - ../../scripts/config/metrictank-docker.ini:/etc/raintank/metrictank.ini environment: WAIT_HOSTS: cassandra:9042 WAIT_TIMEOUT: 60 links: - cassandra - - statsdaemon cassandra: hostname: cassandra diff --git a/docker/docker-standard/docker-compose.yml b/docker/docker-standard/docker-compose.yml index e6d8129c64..c26d7c8f44 100644 --- a/docker/docker-standard/docker-compose.yml +++ b/docker/docker-standard/docker-compose.yml @@ -12,7 +12,6 @@ services: WAIT_TIMEOUT: 60 links: - cassandra - - statsdaemon cassandra: hostname: cassandra diff --git a/docs/config.md b/docs/config.md index b1b2234265..ad7bd5d5b5 100644 --- a/docs/config.md +++ b/docs/config.md @@ -108,19 +108,13 @@ cassandra-username = cassandra cassandra-password = cassandra ``` -## Profiling, instrumentation and logging ## +## Profiling and logging ## ``` # see https://golang.org/pkg/runtime/#SetBlockProfileRate block-profile-rate = 0 # 0 to disable. 1 for max precision (expensive!) see https://golang.org/pkg/runtime/#pkg-variables mem-profile-rate = 524288 # 512*1024 -# enable sending statsd messages for instrumentation -statsd-enabled = true -# statsd address -statsd-addr = localhost:8125 -# standard or datadog -statsd-type = standard # inspect status frequency. set to 0 to disable proftrigger-freq = 60s # path to store triggered profiles @@ -132,6 +126,22 @@ proftrigger-min-diff = 1h proftrigger-heap-thresh = 25000000000 # only log log-level and higher. 0=TRACE|1=DEBUG|2=INFO|3=WARN|4=ERROR|5=CRITICAL|6=FATAL log-level = 2 +# instrumentation stats +[stats] +# enable sending graphite messages for instrumentation +enabled = true +# stats prefix (will add trailing dot automatically if needed) +# The default matches what the Grafana dashboard expects +# $instance will be replaced with the `instance` setting. +# note, the 3rd word describes the environment you deployed in. +prefix = metrictank.stats.default.$instance +# graphite address +addr = localhost:2003 +# interval at which to send statistics +interval = 1 +# how many messages (holding all measurements from one interval. rule of thumb: a message is ~25kB) to buffer up in case the graphite endpoint is unavailable. +# With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed +buffer-size = 20000 ``` ## http api ## diff --git a/docs/installation-deb.md b/docs/installation-deb.md index 97bf25ba22..c32b2c61ac 100644 --- a/docs/installation-deb.md +++ b/docs/installation-deb.md @@ -9,7 +9,7 @@ We'll go over these in more detail below. * Our [graphite-raintank finder plugin](https://github.com/raintank/graphite-metrictank) and our [graphite-api fork](https://github.com/raintank/graphite-api/) (installed as 1 component) We're working toward simplifying this much more.
-* Optional: [statsd](https://github.com/etsy/statsd) or something compatible with it. For instrumentation
+* Optional: [statsd](https://github.com/etsy/statsd) or something compatible with it. For instrumentation of graphite-api.
 * Optional: Kafka, if you want to buffer data in case metrictank goes down. Kafka 0.10.0.1 is highly recommended. [more info](https://github.com/raintank/metrictank/blob/master/docs/kafka.md)
 * (you can optionally use Elasticsearch for persistence of metrics metadata. We recommend Cassandra instead, much better and easier. So that's what we'll use here)
@@ -138,23 +138,16 @@ The log - should you need it - is at /var/log/elasticsearch/elasticsearch.log
 
 ## Set up statsd
 
-While optional, we highly recommend installing statsd or a statsd-compatible agent for instrumentation, so you can get insights into what's going on.
-To disable, you will have to set `statsd-enabled` to false in `/etc/raintank/metrictank.ini`.
-Metrictank will refuse to start if `statsd-enabled` is true and nothing listens on the configured `statsd-addr`.
+You can optionally install statsd or a statsd-compatible agent for instrumentation of graphite-api. You can use the official [statsd](https://github.com/etsy/statsd) (see its installation instructions) or an alternative. We recommend [raintank/statsdaemon](https://github.com/raintank/statsdaemon).
 
-For the [metrictank dashboard](https://grafana.net/dashboards/279) to work properly, you need the right statsd/statsdaemon settings.
-
 Below are instructions for statsd and statsdaemon. Note:
 
 * `` is however you choose to call your environment. (test, production, dev, ...).
- * we recommend installing statsd/statsdaemon on the same host as metrictank.
- * Note, statsd/statsdaemon will write to metrictank's carbon port on localhost:2003, while metrictank will send its own performance metrics to statsd/statsdaemon on localhost:8125.
-   This is a circular dependency, we typically just bring up statsdaemon first, and metrictank a bit later. This means you will see some "unable to flush" errors from statsdaemon
-   or statsd during the timeframe where metrictank is not up yet.
+ * Note, statsd/statsdaemon will write to metrictank's carbon port on localhost:2003.
 
 ### Statsdaemon
diff --git a/docs/installation-rpm.md b/docs/installation-rpm.md
index 85e4def68b..0a790f19f5 100644
--- a/docs/installation-rpm.md
+++ b/docs/installation-rpm.md
@@ -9,7 +9,7 @@ We'll go over these in more detail below.
 * Our [graphite-raintank finder plugin](https://github.com/raintank/graphite-metrictank) and our [graphite-api fork](https://github.com/raintank/graphite-api/) (installed as 1 component)
   We're working toward simplifying this much more.
-* Optional: [statsd](https://github.com/etsy/statsd) or something compatible with it. For instrumentation
+* Optional: [statsd](https://github.com/etsy/statsd) or something compatible with it. For instrumentation of graphite-api.
 * Optional: Kafka, if you want to buffer data in case metrictank goes down. Kafka 0.10.0.1 is highly recommended. [more info](https://github.com/raintank/metrictank/blob/master/docs/kafka.md)
 * (you can optionally use Elasticsearch for persistence of metrics metadata. We recommend Cassandra instead, much better and easier. So that's what we'll use here)
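The `buffer-size` comment in the docs/config.md hunk above is easier to sanity-check with the arithmetic spelled out. This is a rough estimate; the ~25kB rule of thumb varies with how many series an instance tracks:

```
20000 messages x ~25kB per message           ≈ 500MB of buffer memory, worst case
20000 messages x 1 message per 1s interval   = 20000s ≈ 5.5 hours of graphite downtime bridged
```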
@@ -154,23 +154,16 @@ The log - should you need it - is at /var/log/elasticsearch/elasticsearch.log
 
 ## Set up statsd
 
-While optional, we highly recommend installing statsd or a statsd-compatible agent for instrumentation, so you can get insights into what's going on.
-To disable, you will have to set `statsd-enabled` to false in `/etc/raintank/metrictank.ini`.
-Metrictank will refuse to start if `statsd-enabled` is true and nothing listens on the configured `statsd-addr`.
+You can optionally install statsd or a statsd-compatible agent for instrumentation of graphite-api. You can use the official [statsd](https://github.com/etsy/statsd) (see its installation instructions) or an alternative. We recommend [raintank/statsdaemon](https://github.com/raintank/statsdaemon).
 
-For the [metrictank dashboard](https://grafana.net/dashboards/279) to work properly, you need the right statsd/statsdaemon settings.
-
 Below are instructions for statsd and statsdaemon. Note:
 
 * `` is however you choose to call your environment. (test, production, dev, ...).
- * we recommend installing statsd/statsdaemon on the same host as metrictank.
- * Note, statsd/statsdaemon will write to metrictank's carbon port on localhost:2003, while metrictank will send its own performance metrics to statsd/statsdaemon on localhost:8125.
-   This is a circular dependency, we typically just bring up statsdaemon first, and metrictank a bit later. This means you will see some "unable to flush" errors from statsdaemon
-   or statsd during the timeframe where metrictank is not up yet.
+ * Note, statsd/statsdaemon will write to metrictank's carbon port on localhost:2003.
 
 ### Statsdaemon
diff --git a/docs/metrics.md b/docs/metrics.md
index edc3359077..f78ffdfb0a 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -1,38 +1,156 @@
 # Overview of metrics
 (only shows metrics that are documented. generated with [metrics2docs](github.com/Dieterbe/metrics2docs))
-* `add_to_saved_chunk`:
+* `api.get_target`:
+how long it takes to get a target
+* `api.iters_to_points`:
+how long it takes to decode points from a chunk iterator
+* `api.request_handle`:
+how long it takes to handle a render request
+* `api.requests_span.mem`:
+the timerange of requests hitting only the ringbuffer
+* `api.requests_span.mem_and_cassandra`:
+the timerange of requests hitting both in-memory and cassandra
+* `cluster.notifier.kafka.message_size`:
+the sizes seen of messages through the kafka cluster notifier
+* `cluster.notifier.kafka.messages-published`:
+a counter of messages published to the kafka cluster notifier
+* `cluster.notifier.nsq.message_size`:
+the sizes seen of messages through the nsq cluster notifier
+* `cluster.notifier.nsq.messages-published`:
+a counter of messages published to the nsq cluster notifier
+* `cluster.promotion_wait`:
+how long a candidate (secondary node) has to wait until it can become a primary
+When the timer becomes 0 it means the in-memory buffer has been able to fully populate so that if you stop a primary
+and it was able to save its complete chunks, this node will be able to take over without data loss.
+You can upgrade a candidate to primary while the timer is not 0 yet, it just means it may have missing data in the chunks that it will save.
+* `idx.cassandra.add`:
+the duration of additions to the cassandra idx
+* `idx.cassandra.add.fail`:
+how many failures were encountered while trying to index metrics
+* `idx.cassandra.add.ok`:
+how many metrics are successfully being indexed
+* `idx.cassandra.delete`:
+the duration of deletions from the cassandra idx
+* `idx.cassandra.error.cannot-achieve-consistency`:
+a counter of the cassandra idx not being able to achieve consistency for a given query
+* `idx.cassandra.error.conn-closed`:
+a counter of how many times we saw a connection closed to the cassandra idx
+* `idx.cassandra.error.no-connections`:
+a counter of how many times we had no connections remaining to the cassandra idx
+* `idx.cassandra.error.other`:
+a counter of other errors talking to the cassandra idx
+* `idx.cassandra.error.timeout`:
+a counter of timeouts seen to the cassandra idx
+* `idx.cassandra.error.too-many-timeouts`:
+a counter of how many times we saw too many timeouts and closed the connection to the cassandra idx
+* `idx.cassandra.error.unavailable`:
+a counter of how many times the cassandra idx was unavailable
+* `idx.elasticsearch.add`:
+the duration of additions to the ES idx
+* `idx.elasticsearch.add.fail`:
+the number of failed additions to the ES idx
+* `idx.elasticsearch.add.ok`:
+the number of successful additions to the ES idx
+* `idx.elasticsearch.delete`:
+the duration of deletes from the ES idx
+* `idx.elasticsearch.retrybuf.items`:
+the amount of items currently in the retry buffer
+* `idx.memory.add`:
+the duration of (successful) memory idx additions
+* `idx.memory.add.fail`:
+the number of failed additions to the memory idx
+* `idx.memory.add.ok`:
+the number of successful additions to the memory idx
+* `idx.memory.delete`:
+the duration of memory idx deletes
+* `idx.memory.find`:
+the duration of memory idx finds
+* `idx.memory.get`:
+the duration of memory idx gets
+* `idx.memory.list`:
+the duration of memory idx listings
+* `mem.to_iter`:
+how long it takes to transform in-memory chunks to iterators
+* `memory.bytes.obtained_from_sys`:
+the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at.
+* `memory.bytes_allocated_on_heap`:
+a gauge of currently allocated (within the runtime) memory.
+* `memory.gc.cpu_fraction`:
+how much cpu is consumed by the GC across process lifetime, in pro-mille
+* `memory.gc.heap_objects`:
+how many objects are allocated on the heap, it's a key indicator for GC workload
+* `memory.gc.last_duration`:
+the duration of the last GC STW pause in nanoseconds
+* `memory.total_bytes_allocated`:
+a counter of total amount of bytes allocated during process lifetime
+* `memory.total_gc_cycles`:
+a counter of the number of GC cycles since process start
+* `metric_invalid`:
+a count of times a metric did not validate
+* `metrics_decode_err`:
+a count of times an input message (MetricData, MetricDataArray or carbon line) failed to parse
+* `store.cassandra.chunk_operations.save_fail`:
+counter of failed saves
+* `store.cassandra.chunk_operations.save_ok`:
+counter of successful saves
+* `store.cassandra.chunk_size.at_load`:
+the sizes of chunks seen when loading them
+* `store.cassandra.chunk_size.at_save`:
+the sizes of chunks seen when saving them
+* `store.cassandra.chunks_per_row`:
+how many chunks are retrieved per row in get queries
+* `store.cassandra.error.cannot-achieve-consistency`:
+a counter of the cassandra store not being able to achieve consistency for a given query
+* `store.cassandra.error.conn-closed`:
+a counter of how many times we saw a connection closed to the cassandra store
+* `store.cassandra.error.no-connections`:
+a counter of how many times we had no connections remaining to the cassandra store
+* `store.cassandra.error.other`:
+a counter of other errors talking to the cassandra store
+* `store.cassandra.error.timeout`:
+a counter of timeouts seen to the cassandra store
+* `store.cassandra.error.too-many-timeouts`:
+a counter of how many times we saw too many timeouts and closed the connection to the cassandra store
+* `store.cassandra.error.unavailable`:
+a counter of how many times the cassandra store was unavailable
+* `store.cassandra.get.exec`:
+the duration of getting from cassandra store
+* `store.cassandra.get.wait`:
+the duration of the get spent in the queue
+* `store.cassandra.get_chunks`:
+the duration of how long it takes to get chunks
+* `store.cassandra.put.exec`:
+the duration of putting in cassandra store
+* `store.cassandra.put.wait`:
+the duration of a put in the wait queue
+* `store.cassandra.rows_per_response`:
+how many rows come per get response
+* `store.cassandra.to_iter`:
+the duration of converting chunks to iterators
+* `tank.add_to_saved_chunk`:
 points received - by a secondary node - for the most recent chunk when that chunk
 has already been saved by a primary. A secondary can add this data to its chunks.
-* `add_to_saving_chunk`:
+* `tank.add_to_saving_chunk`:
 points received - by the primary node - for the most recent chunk when that chunk is already being saved (or has been saved).
 this indicates that your GC is actively sealing chunks and saving them before you have the chance to send
 your (infrequent) updates. The primary won't add them to its in-memory chunks, but secondaries will
 (because they are never in "saving" state for them), see below.
-* `bytes_alloc.incl_freed`:
-a counter of total amount of bytes allocated during process lifetime. (incl freed data)
-* `bytes_alloc.not_freed`:
-a gauge of currently allocated (within the runtime) memory.
-it does not include freed data so it drops at every GC run.
-* `bytes_sys`:
-the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at.
-* `cluster.promotion_wait`:
-how long a candidate (secondary node) has to wait until it can become a primary
-When the timer becomes 0 it means the in-memory buffer has been able to fully populate so that if you stop a primary
-and it was able to save its complete chunks, this node will be able to take over without dataloss.
-You can upgrade a candidate to primary while the timer is not 0 yet, it just means it may have missing data in the chunks that it will save.
-* `gc.heap_objects`:
-how many objects are allocated on the heap, it's a key indicator for GC workload
-* `gc_metric`:
+* `tank.chunk_operations.clear`:
+a counter of how many chunks are cleared (replaced by new chunks)
+* `tank.chunk_operations.create`:
+a counter of how many chunks are created
+* `tank.gc_metric`:
 the amount of times the metrics GC is about to inspect a metric (series)
-* `idx.cassadra.ok`:
-how many metrics are successfully being indexed
-* `idx.cassandra.fail`:
-how failures encountered while trying to index metrics
-* `metrics_active`:
+* `tank.metrics_active`:
 the amount of currently known metrics (excl rollup series), measured every second
-* `metrics_too_old`:
+* `tank.metrics_too_old`:
 points that go back in time. E.g. for any given series, when a point has a timestamp
 that is not higher than the timestamp of the last written timestamp for that series.
+* `tank.persist`:
+how long it takes to persist a chunk (and chunks preceding it)
+this is subject to backpressure from the store when the store's queue runs full
+* `tank.total_points`:
+the number of points currently held in the in-memory ringbuffer
diff --git a/docs/quick-start-docker.md b/docs/quick-start-docker.md
index 1c3ef39a12..d3dc36588a 100644
--- a/docs/quick-start-docker.md
+++ b/docs/quick-start-docker.md
@@ -97,13 +97,12 @@ Note: it also works with `proxy` mode but then you have to enter `http://graphit
 Now let's see some data. If you go to `Dashboards`, `New` and add a new graph panel.
 In the metrics tab you should see a bunch of metrics already in the root hierarchy:
 
-* `stats`: these are metrics coming from metrictank and graphite-api.
-  i.e. they send their own instrumentation into statsd (statsdaemon actually is the version we use here),
-  and statsdaemon sends aggregated metrics into metrictank's carbon port. Statsdaemon flushes every second.
 * `service_is_statsdaemon`: statsdaemon's own internal metrics which it sends to metrictank's carbon port.
-* `metrictank`: usage metrics reported by metrictank. See
+* `metrictank.stats`: internal stats reported by metrictank
+* `metrictank.usage`: usage metrics reported by metrictank. See
   [Usage reporting](https://github.com/raintank/metrictank/blob/master/docs/usage-reporting.md)
   It may take a few minutes for the usage metrics to show up.
+* `stats`: these are metrics coming from graphite-api, aggregated by statsdaemon and sent back to metrictank every second.
 
 Note that metrictank is setup to track every metric on a 1-second granularity.
 If you wish to use it for less frequent metrics,
@@ -130,7 +129,7 @@ There is an extensive [dashboard on grafana.net](https://grafana.net/dashboards/
 So go to the dashboard dropdown -> import -> and paste in `https://grafana.net/dashboards/279` into the Grafana.net url field.
 It will show a dialog with a choice of which graphite datasource to use, for which you can enter `metrictank`.
-You should now have a functioning dashboard showing all metrictank's internal metrics, which it reports via statsdaemon, back into itself.
+You should now have a functioning dashboard showing all metrictank's internal metrics, which it reports into itself.
 
 Another dashboard you can import for instant gratification is the [statsdaemon](https://grafana.net/dashboards/297) dashboard,
 which shows you metrics about the metrics. Very meta.
diff --git a/idx/cassandra/cassandra.go b/idx/cassandra/cassandra.go
index db013e5084..209749d055 100644
--- a/idx/cassandra/cassandra.go
+++ b/idx/cassandra/cassandra.go
@@ -9,11 +9,11 @@ import (
 	"time"
 
 	"github.com/gocql/gocql"
-	"github.com/raintank/met"
 	"github.com/raintank/metrictank/cassandra"
 	"github.com/raintank/metrictank/cluster"
 	"github.com/raintank/metrictank/idx"
 	"github.com/raintank/metrictank/idx/memory"
+	"github.com/raintank/metrictank/stats"
 	"github.com/raintank/worldping-api/pkg/log"
 	"github.com/rakyll/globalconf"
 	"gopkg.in/raintank/schema.v1"
@@ -37,11 +37,15 @@ const TableSchema = `CREATE TABLE IF NOT EXISTS %s.metric_idx (
 const MetricIdxPartitionIndex = `CREATE INDEX IF NOT EXISTS ON %s.metric_idx(partition)`
 
 var (
-	idxCasOk             met.Count // metric idx.cassadra.ok is how many metrics are successfully being indexed
-	idxCasFail           met.Count // metric idx.cassandra.fail is how failures encountered while trying to index metrics
-	idxCasAddDuration    met.Timer
-	idxCasDeleteDuration met.Timer
-	metrics              cassandra.Metrics
+	// metric idx.cassandra.add.ok is how many metrics are successfully being indexed
+	idxCasOk = stats.NewCounter32("idx.cassandra.add.ok")
+	// metric idx.cassandra.add.fail is how many failures were encountered while trying to index metrics
+	idxCasFail = stats.NewCounter32("idx.cassandra.add.fail")
+	// metric idx.cassandra.add is the duration of additions to the cassandra idx
+	idxCasAddDuration = stats.NewLatencyHistogram15s32("idx.cassandra.add")
+	// metric idx.cassandra.delete is the duration of deletions from the cassandra idx
+	idxCasDeleteDuration = stats.NewLatencyHistogram15s32("idx.cassandra.delete")
+	errmetrics = cassandra.NewErrMetrics("idx.cassandra")
 
 	Enabled bool
 	ssl     bool
@@ -173,18 +177,12 @@ func (c *CasIdx) InitBare() error {
 
 // Init makes sure the needed keyspace, table, index in cassandra exists, creates the session,
 // rebuilds the in-memory index, sets up write queues, metrics and pruning routines
-func (c *CasIdx) Init(stats met.Backend) error {
+func (c *CasIdx) Init() error {
 	log.Info("initializing cassandra-idx. Hosts=%s", hosts)
-	if err := c.MemoryIdx.Init(stats); err != nil {
+	if err := c.MemoryIdx.Init(); err != nil {
 		return err
 	}
-	idxCasOk = stats.NewCount("idx.cassandra.ok")
-	idxCasFail = stats.NewCount("idx.cassandra.fail")
-	idxCasAddDuration = stats.NewTimer("idx.cassandra.add_duration", 0)
-	idxCasDeleteDuration = stats.NewTimer("idx.cassandra.delete_duration", 0)
-	metrics = cassandra.NewMetrics("idx.cassandra", stats)
-
 	if err := c.InitBare(); err != nil {
 		return err
 	}
@@ -322,8 +320,8 @@ func (c *CasIdx) processWriteQueue() {
 				req.def.Tags,
 				req.def.LastUpdate).Exec(); err != nil {
 
-				idxCasFail.Inc(1)
-				metrics.Inc(err)
+				idxCasFail.Inc()
+				errmetrics.Inc(err)
 				if (attempts % 20) == 0 {
 					log.Warn("cassandra-idx Failed to write def to cassandra. it will be retried. %s", err)
 				}
@@ -336,7 +334,7 @@ func (c *CasIdx) processWriteQueue() {
 			} else {
 				success = true
 				idxCasAddDuration.Value(time.Since(req.recvTime))
-				idxCasOk.Inc(1)
+				idxCasOk.Inc()
 				log.Debug("cassandra-idx metricDef saved to cassandra. %s", req.def.Id)
 			}
 		}
@@ -357,7 +355,7 @@ func (c *CasIdx) Delete(orgId int, pattern string) ([]schema.MetricDefinition, e
 		attempts++
 		cErr := c.session.Query("DELETE FROM metric_idx where id=?", def.Id).Exec()
 		if cErr != nil {
-			metrics.Inc(err)
+			errmetrics.Inc(err)
 			log.Error(3, "cassandra-idx Failed to delete metricDef %s from cassandra. %s", def.Id, err)
 			time.Sleep(time.Second)
 		} else {
@@ -380,7 +378,7 @@ func (c *CasIdx) Prune(orgId int, oldest time.Time) ([]schema.MetricDefinition,
 		attempts++
 		cErr := c.session.Query("DELETE FROM metric_idx where id=?", def.Id).Exec()
 		if cErr != nil {
-			metrics.Inc(err)
+			errmetrics.Inc(err)
 			log.Error(3, "cassandra-idx Failed to delete metricDef %s from cassandra. %s", def.Id, err)
 			time.Sleep(time.Second)
 		} else {
diff --git a/idx/cassandra/cassandra_test.go b/idx/cassandra/cassandra_test.go
index 6be985d9c8..24453bb1dc 100644
--- a/idx/cassandra/cassandra_test.go
+++ b/idx/cassandra/cassandra_test.go
@@ -8,7 +8,6 @@ import (
 	"testing"
 	"time"
 
-	"github.com/raintank/met/helper"
 	"github.com/raintank/metrictank/cluster"
 	"github.com/raintank/metrictank/idx"
 	. "github.com/smartystreets/goconvey/convey"
@@ -72,8 +71,7 @@ func getMetricData(orgId, depth, count, interval int, prefix string) []*schema.M
 
 func TestGetAddKey(t *testing.T) {
 	ix := New()
-	stats, _ := helper.New(false, "", "standard", "metrictank", "")
-	ix.Init(stats)
+	ix.Init()
 
 	publicSeries := getMetricData(-1, 2, 5, 10, "metric.public")
 	org1Series := getMetricData(1, 2, 5, 10, "metric.org1")
@@ -112,8 +110,7 @@ func TestGetAddKey(t *testing.T) {
 
 func TestFind(t *testing.T) {
 	ix := New()
-	stats, _ := helper.New(false, "", "standard", "metrictank", "")
-	ix.Init(stats)
+	ix.Init()
 	for _, s := range getMetricData(-1, 2, 5, 10, "metric.demo") {
 		ix.Add(s, 1)
 	}
@@ -228,11 +225,10 @@ func BenchmarkIndexing(b *testing.B) {
 	}
 	tmpSession.Query("TRUNCATE raintank.metric_idx").Exec()
 	tmpSession.Close()
-	stats, err := helper.New(false, "", "standard", "metrictank", "")
 	if err != nil {
 		b.Skipf("can't connect to cassandra: %s", err)
 	}
-	ix.Init(stats)
+	ix.Init()
 
 	b.ReportAllocs()
 	b.ResetTimer()
@@ -269,14 +265,13 @@ func BenchmarkLoad(b *testing.B) {
 	updateFuzzyness = 1.0
 	ix := New()
 
-	stats, _ := helper.New(false, "", "standard", "metrictank", "")
 	tmpSession, err := ix.cluster.CreateSession()
 	if err != nil {
 		b.Skipf("can't connect to cassandra: %s", err)
 	}
 	tmpSession.Query("TRUNCATE raintank.metric_idx").Exec()
 	tmpSession.Close()
-	err = ix.Init(stats)
+	err = ix.Init()
 	if err != nil {
 		b.Skipf("can't initialize cassandra: %s", err)
 	}
@@ -287,6 +282,6 @@ func BenchmarkLoad(b *testing.B) {
 	b.ReportAllocs()
 	b.ResetTimer()
 	ix = New()
-	ix.Init(stats)
+	ix.Init()
 	ix.Stop()
 }
diff --git a/idx/elasticsearch/elasticsearch.go b/idx/elasticsearch/elasticsearch.go
index 8ccdf7de0b..8b10faa27b 100644
--- a/idx/elasticsearch/elasticsearch.go
+++ b/idx/elasticsearch/elasticsearch.go
@@ -11,21 +11,26 @@ import (
 	"time"
 
 	"github.com/mattbaird/elastigo/lib"
-	"github.com/raintank/met"
 	"github.com/raintank/metrictank/cluster"
 	"github.com/raintank/metrictank/idx"
 	"github.com/raintank/metrictank/idx/memory"
+	"github.com/raintank/metrictank/stats"
 	"github.com/raintank/worldping-api/pkg/log"
 	"github.com/rakyll/globalconf"
 	"gopkg.in/raintank/schema.v1"
 )
 
 var (
-	idxEsOk             met.Count
-	idxEsFail           met.Count
-	idxEsAddDuration    met.Timer
-	idxEsDeleteDuration met.Timer
-	retryBufItems       met.Gauge
+	// metric idx.elasticsearch.add.ok is the number of successful additions to the ES idx
+	idxEsOk = 
stats.NewCounter32("idx.elasticsearch.add.ok") + // metric idx.elasticsearch.add.fail is the number of failed additions to the ES idx + idxEsFail = stats.NewCounter32("idx.elasticsearch.add.fail") + // metric idx.elasticsearch.add is the duration of additions to the ES idx + idxEsAddDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.add") + // metric idx.elasticsearch.delete is the duration of deletes from the ES idx + idxEsDeleteDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.delete") + // metric idx.elasticsearch.retrybuf.items is the amount of items currently in the retry buffer + retryBufItems = stats.NewGauge32("idx.elasticsearch.retrybuf.items") Enabled bool esIndex string @@ -92,7 +97,7 @@ func (r *RetryBuffer) Queue(id string) { } r.Lock() r.Defs = append(r.Defs, def) - retryBufItems.Value(int64(len(r.Defs))) + retryBufItems.Set(len(r.Defs)) r.Unlock() } @@ -100,7 +105,7 @@ func (r *RetryBuffer) retry() { r.Lock() defs := r.Defs r.Defs = make([]schema.MetricDefinition, 0, len(defs)) - retryBufItems.Value(0) + retryBufItems.Set(0) r.Unlock() if len(defs) == 0 { log.Debug("retry buffer is empty") @@ -110,7 +115,7 @@ func (r *RetryBuffer) retry() { if err := r.Index.BulkIndexer.Index(esIndex, "metric_index", d.Id, "", "", nil, d); err != nil { log.Error(3, "Failed to add metricDef to BulkIndexer queue. %s", err) r.Defs = append(r.Defs, d) - retryBufItems.Value(int64(len(r.Defs))) + retryBufItems.Set(len(r.Defs)) return } } @@ -158,23 +163,16 @@ func New() *EsIdx { } } -func (e *EsIdx) Init(stats met.Backend) error { +func (e *EsIdx) Init() error { if esRetryInterval < time.Second { return errors.New("Invalid retry-interval. Valid units are 's', 'm', 'h'. Must be at least 1 second") } log.Info("initializing EsIdx. Hosts=%s", esHosts) - if err := e.MemoryIdx.Init(stats); err != nil { + if err := e.MemoryIdx.Init(); err != nil { return err } - - idxEsOk = stats.NewCount("idx.elasticsearch.ok") - idxEsFail = stats.NewCount("idx.elasticsearch.fail") - idxEsAddDuration = stats.NewTimer("idx.elasticsearch.add_duration", 0) - idxEsDeleteDuration = stats.NewTimer("idx.elasticsearch.delete_duration", 0) - retryBufItems = stats.NewGauge("idx.elasticsearch.retrybuf.items", 0) - log.Info("Checking if index %s exists in ES", esIndex) if exists, err := e.Conn.ExistsIndex(esIndex, "", nil); err != nil && err.Error() != "record not found" { return err @@ -287,7 +285,7 @@ func (e *EsIdx) processEsResponse(body []byte) error { for _, m := range response.Items { docCount += len(m) } - idxEsOk.Inc(int64(docCount)) + idxEsOk.Add(docCount) return nil } @@ -298,14 +296,14 @@ func (e *EsIdx) processEsResponse(body []byte) error { if errStr, ok := v["error"].(string); ok { log.Warn("ES: %s failed: %s", id, errStr) e.retryBuf.Queue(id) - idxEsFail.Inc(1) + idxEsFail.Inc() } else if errMap, ok := v["error"].(map[string]interface{}); ok { log.Warn("ES: %s failed: %s: %q", id, errMap["type"].(string), errMap["reason"].(string)) e.retryBuf.Queue(id) - idxEsFail.Inc(1) + idxEsFail.Inc() } else { log.Debug("ES: completed %s successfully.", id) - idxEsOk.Inc(1) + idxEsOk.Inc() } } } diff --git a/idx/elasticsearch/elasticsearch_test.go b/idx/elasticsearch/elasticsearch_test.go index b947e1f598..e10cb53e90 100644 --- a/idx/elasticsearch/elasticsearch_test.go +++ b/idx/elasticsearch/elasticsearch_test.go @@ -8,7 +8,6 @@ import ( "testing" "time" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/idx" 
//"github.com/raintank/worldping-api/pkg/log" @@ -49,8 +48,7 @@ func TestES(t *testing.T) { reqChan <- requestTrace{method: method, url: urlStr, body: body} } - stats, _ := helper.New(false, "", "standard", "metrictank", "") - err := ix.Init(stats) + err := ix.Init() So(err, ShouldBeNil) // we should see a PUT request to create the index mapping @@ -76,8 +74,7 @@ func TestES(t *testing.T) { ix.Conn.RequestTracer = func(method, urlStr, body string) { reqChan <- requestTrace{method: method, url: urlStr, body: body} } - stats, _ := helper.New(false, "", "standard", "metrictank", "") - err := ix.Init(stats) + err := ix.Init() So(err, ShouldBeNil) defs := ix.List(1) @@ -149,8 +146,7 @@ func TestGetAddKey(t *testing.T) { rt.Response["POST http://localhost:9200/_bulk?refresh=true"] = handleBulkOk ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() defer ix.Stop() publicSeries := getMetricData(-1, 2, 5, 10, "metric.public") @@ -206,8 +202,7 @@ func TestFind(t *testing.T) { rt.Response["POST http://localhost:9200/_bulk?refresh=true"] = handleBulkOk ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() defer ix.Stop() for _, s := range getMetricData(-1, 2, 5, 10, "metric.demo") { @@ -324,8 +319,7 @@ func TestDelete(t *testing.T) { rt.Response["POST http://localhost:9200/_bulk?refresh=true"] = handleBulkOk ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() defer ix.Stop() publicSeries := getMetricData(-1, 2, 5, 10, "metric.public") @@ -395,8 +389,7 @@ func BenchmarkIndexing(b *testing.B) { ix := New() ix.Conn.DeleteIndex(esIndex) - stats, _ := helper.New(false, "", "standard", "metrictank", "") - err := ix.Init(stats) + err := ix.Init() if err != nil { b.Skipf("can't connect to ES: %s", err) } diff --git a/idx/idx.go b/idx/idx.go index ef40642e89..b58672c20c 100644 --- a/idx/idx.go +++ b/idx/idx.go @@ -6,7 +6,6 @@ import ( "errors" "time" - "github.com/raintank/met" "gopkg.in/raintank/schema.v1" ) @@ -42,7 +41,7 @@ Note: Interface -* Init(met.Backend) +* Init() This is the initialization step performed at startup. This method should block until the index is ready to handle searches. @@ -85,7 +84,7 @@ Interface error encountered. 
 */
 type MetricIndex interface {
-	Init(met.Backend) error
+	Init() error
 	Stop()
 	Add(*schema.MetricData, int32) error
 	Get(string) (schema.MetricDefinition, error)
diff --git a/idx/memory/memory.go b/idx/memory/memory.go
index 5272e47cf0..cdcb2ac2fb 100644
--- a/idx/memory/memory.go
+++ b/idx/memory/memory.go
@@ -8,21 +8,28 @@ import (
 	"sync"
 	"time"
 
-	"github.com/raintank/met"
 	"github.com/raintank/metrictank/idx"
+	"github.com/raintank/metrictank/stats"
 	"github.com/raintank/worldping-api/pkg/log"
 	"github.com/rakyll/globalconf"
 	"gopkg.in/raintank/schema.v1"
)
 
 var (
-	idxOk             met.Count
-	idxFail           met.Count
-	idxAddDuration    met.Timer
-	idxGetDuration    met.Timer
-	idxListDuration   met.Timer
-	idxFindDuration   met.Timer
-	idxDeleteDuration met.Timer
+	// metric idx.memory.add.ok is the number of successful additions to the memory idx
+	idxOk = stats.NewCounter32("idx.memory.add.ok")
+	// metric idx.memory.add.fail is the number of failed additions to the memory idx
+	idxFail = stats.NewCounter32("idx.memory.add.fail")
+	// metric idx.memory.add is the duration of (successful) memory idx additions
+	idxAddDuration = stats.NewLatencyHistogram15s32("idx.memory.add")
+	// metric idx.memory.get is the duration of memory idx gets
+	idxGetDuration = stats.NewLatencyHistogram15s32("idx.memory.get")
+	// metric idx.memory.list is the duration of memory idx listings
+	idxListDuration = stats.NewLatencyHistogram15s32("idx.memory.list")
+	// metric idx.memory.find is the duration of memory idx finds
+	idxFindDuration = stats.NewLatencyHistogram15s32("idx.memory.find")
+	// metric idx.memory.delete is the duration of memory idx deletes
+	idxDeleteDuration = stats.NewLatencyHistogram15s32("idx.memory.delete")
 
 	Enabled bool
 )
@@ -67,14 +74,7 @@ func New() *MemoryIdx {
 	}
 }
 
-func (m *MemoryIdx) Init(stats met.Backend) error {
-	idxOk = stats.NewCount("idx.memory.ok")
-	idxFail = stats.NewCount("idx.memory.fail")
-	idxAddDuration = stats.NewTimer("idx.memory.add_duration", 0)
-	idxGetDuration = stats.NewTimer("idx.memory.get_duration", 0)
-	idxListDuration = stats.NewTimer("idx.memory.list_duration", 0)
-	idxFindDuration = stats.NewTimer("idx.memory.find_duration", 0)
-	idxDeleteDuration = stats.NewTimer("idx.memory.delete_duration", 0)
+func (m *MemoryIdx) Init() error {
 	return nil
 }
 
@@ -92,14 +92,14 @@ func (m *MemoryIdx) Add(data *schema.MetricData, partition int32) error {
 		// there's not much point in doing the work of trying over
 		// and over again, and flooding the logs with the same failure.
 		// so just trigger the stats metric as if we tried again
-		idxFail.Inc(1)
+		idxFail.Inc()
 		return err
 	}
 	existing, ok := m.DefById[data.Id]
 	if ok {
 		log.Debug("metricDef with id %s already in index.", data.Id)
 		existing.LastUpdate = data.Time
-		idxOk.Inc(1)
+		idxOk.Inc()
 		idxAddDuration.Value(time.Since(pre))
 		return nil
 	}
@@ -133,7 +133,7 @@ func (m *MemoryIdx) AddDef(def *schema.MetricDefinition) error {
 	if _, ok := m.DefById[def.Id]; ok {
 		log.Debug("memory-idx: metricDef with id %s already in index.", def.Id)
 		m.DefById[def.Id] = def
-		idxOk.Inc(1)
+		idxOk.Inc()
 		idxAddDuration.Value(time.Since(pre))
 		return nil
 	}
@@ -166,13 +166,13 @@ func (m *MemoryIdx) add(def *schema.MetricDefinition) error {
 			//bad data. A path cant be both a leaf and a branch.
 			log.Info("memory-idx: Bad data, a path can not be both a leaf and a branch. %d - %s", def.OrgId, path)
 			m.FailedDefs[def.Id] = idx.BothBranchAndLeaf
-			idxFail.Inc(1)
+			idxFail.Inc()
 			return idx.BothBranchAndLeaf
 		}
 		log.Debug("memory-idx: existing index entry for %s.
Adding %s as child", path, def.Id) node.Children = append(node.Children, def.Id) m.DefById[def.Id] = def - idxOk.Inc(1) + idxOk.Inc() return nil } } @@ -193,7 +193,7 @@ func (m *MemoryIdx) add(def *schema.MetricDefinition) error { if n.Leaf { log.Info("memory-idx: Branches cant be added to a leaf node. %d - %s", def.OrgId, path) m.FailedDefs[def.Id] = idx.BranchUnderLeaf - idxFail.Inc(1) + idxFail.Inc() return idx.BranchUnderLeaf } log.Debug("memory-idx: Found branch %s which metricDef %s is a descendant of", branch, path) @@ -232,7 +232,7 @@ func (m *MemoryIdx) add(def *schema.MetricDefinition) error { Children: []string{def.Id}, } m.DefById[def.Id] = def - idxOk.Inc(1) + idxOk.Inc() return nil } diff --git a/idx/memory/memory_find_test.go b/idx/memory/memory_find_test.go index 8bb10ea9aa..396863b905 100644 --- a/idx/memory/memory_find_test.go +++ b/idx/memory/memory_find_test.go @@ -5,7 +5,6 @@ import ( "strconv" "testing" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/idx" "gopkg.in/raintank/schema.v1" ) @@ -52,8 +51,7 @@ func diskMetrics(dcCount, hostCount, hostOffset, diskCount int, prefix string) [ func Init() { ix = New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() var data *schema.MetricData diff --git a/idx/memory/memory_test.go b/idx/memory/memory_test.go index fc23389e55..9ae7ae1fd3 100644 --- a/idx/memory/memory_test.go +++ b/idx/memory/memory_test.go @@ -8,7 +8,6 @@ import ( "testing" "time" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/idx" . "github.com/smartystreets/goconvey/convey" "gopkg.in/raintank/schema.v1" @@ -59,8 +58,7 @@ func getMetricData(orgId, depth, count, interval int, prefix string) []*schema.M func TestGetAddKey(t *testing.T) { ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() publicSeries := getMetricData(-1, 2, 5, 10, "metric.public") org1Series := getMetricData(1, 2, 5, 10, "metric.org1") @@ -99,8 +97,7 @@ func TestGetAddKey(t *testing.T) { func TestFind(t *testing.T) { ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() for _, s := range getMetricData(-1, 2, 5, 10, "metric.demo") { s.Time = 10 * 86400 ix.Add(s, 1) @@ -223,8 +220,7 @@ func TestFind(t *testing.T) { func TestDelete(t *testing.T) { ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() publicSeries := getMetricData(-1, 2, 5, 10, "metric.public") org1Series := getMetricData(1, 2, 5, 10, "metric.org1") @@ -285,8 +281,7 @@ func TestDelete(t *testing.T) { func TestBadAdd(t *testing.T) { ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() first := &schema.MetricData{ Name: "foo.bar", @@ -340,8 +335,7 @@ func TestBadAdd(t *testing.T) { // verify that if a leaf blocks a new branch, we can add the branch after deleting the leaf func TestDeleteLeafAddBranch(t *testing.T) { ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() first := &schema.MetricData{ Name: "foo.bar", @@ -412,8 +406,7 @@ func TestDeleteLeafAddBranch(t *testing.T) { func TestPrune(t *testing.T) { ix := New() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() // add old series for _, s := range getSeriesNames(2, 5, "metric.bah") { @@ -477,8 +470,7 @@ func TestPrune(t *testing.T) { func BenchmarkIndexing(b *testing.B) { ix := New() - 
stats, _ := helper.New(false, "", "standard", "metrictank", "") - ix.Init(stats) + ix.Init() var series string var data *schema.MetricData diff --git a/input/carbon/carbon.go b/input/carbon/carbon.go index 0b807c4010..1717cac29f 100644 --- a/input/carbon/carbon.go +++ b/input/carbon/carbon.go @@ -11,7 +11,6 @@ import ( "github.com/lomik/go-carbon/persister" "github.com/metrics20/go-metrics20/carbon20" - "github.com/raintank/met" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/idx" "github.com/raintank/metrictank/input" @@ -27,7 +26,6 @@ type Carbon struct { addrStr string addr *net.TCPAddr schemas persister.WhisperSchemas - stats met.Backend listener *net.TCPListener handlerWaitGroup sync.WaitGroup quit chan struct{} @@ -110,7 +108,7 @@ func ConfigProcess() { cluster.ThisNode.SetPartitions([]int32{int32(partitionId)}) } -func New(stats met.Backend) *Carbon { +func New() *Carbon { addrT, err := net.ResolveTCPAddr("tcp", addr) if err != nil { log.Fatal(4, "carbon-in: %s", err.Error()) @@ -119,20 +117,21 @@ func New(stats met.Backend) *Carbon { addrStr: addr, addr: addrT, schemas: schemas, - stats: stats, - quit: make(chan struct{}), connTrack: NewConnTrack(), } } func (c *Carbon) Start(metrics mdata.Metrics, metricIndex idx.MetricIndex, usg *usage.Usage) { - c.Input = input.New(metrics, metricIndex, usg, "carbon", c.stats) + if c.Input.MsgsAge == nil { + c.Input = input.New(metrics, metricIndex, usg, "carbon") + } l, err := net.ListenTCP("tcp", c.addr) if nil != err { log.Fatal(4, "carbon-in: %s", err.Error()) } c.listener = l log.Info("carbon-in: listening on %v/tcp", c.addr) + c.quit = make(chan struct{}) go c.accept() } @@ -189,7 +188,7 @@ func (c *Carbon) handle(conn net.Conn) { key, val, ts, err := carbon20.ValidatePacket(buf, carbon20.Medium) if err != nil { - c.Input.MetricsDecodeErr.Inc(1) + c.Input.MetricsDecodeErr.Inc() log.Error(4, "carbon-in: invalid metric: %s", err.Error()) continue } @@ -211,7 +210,7 @@ func (c *Carbon) handle(conn net.Conn) { OrgId: 1, // admin org } md.SetId() - c.Input.MetricsPerMessage.Value(int64(1)) + c.Input.MetricsPerMessage.ValueUint32(1) c.Input.Process(md, int32(partitionId)) } c.handlerWaitGroup.Done() diff --git a/input/carbon/carbon_test.go b/input/carbon/carbon_test.go index 637d39150b..59ee8da024 100644 --- a/input/carbon/carbon_test.go +++ b/input/carbon/carbon_test.go @@ -10,7 +10,6 @@ import ( "github.com/benbjohnson/clock" "github.com/lomik/go-carbon/persister" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/idx/memory" "github.com/raintank/metrictank/mdata" @@ -19,13 +18,11 @@ import ( ) func Test_HandleMessage(t *testing.T) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") cluster.Init("default", "test", time.Now()) - mdata.InitMetrics(stats) store := mdata.NewDevnullStore() aggmetrics := mdata.NewAggMetrics(store, 600, 10, 800, 8000, 10000, 0, make([]mdata.AggSetting, 0)) metricIndex := memory.New() - metricIndex.Init(stats) + metricIndex.Init() usage := usage.New(300, aggmetrics, metricIndex, clock.New()) Enabled = true addr = "localhost:2003" @@ -41,7 +38,7 @@ func Test_HandleMessage(t *testing.T) { } schemas = persister.WhisperSchemas{s} - c := New(stats) + c := New() c.Start(aggmetrics, metricIndex, usage) allMetrics := make(map[string]int) diff --git a/input/input.go b/input/input.go index 1ed0ef357d..5aaeb6a5f4 100644 --- a/input/input.go +++ b/input/input.go @@ -4,35 +4,42 @@ package input import ( "fmt" + "time" - 
"github.com/raintank/met" "github.com/raintank/metrictank/idx" "github.com/raintank/metrictank/mdata" + "github.com/raintank/metrictank/stats" "github.com/raintank/metrictank/usage" "github.com/raintank/worldping-api/pkg/log" "gopkg.in/raintank/schema.v1" ) +// TODO: clever way to document all metrics for all different inputs + // In is a base handler for a metrics packet, aimed to be embedded by concrete implementations type Input struct { - MetricsPerMessage met.Meter - metricsReceived met.Count - MetricsDecodeErr met.Count // metric metrics_decode_err is a count of times an input message (MetricData, MetricDataArray or carbon line) failed to parse - MetricInvalid met.Count // metric metric_invalid is a count of times a metric did not validate - MsgsAge met.Meter // in ms + MetricsPerMessage *stats.Meter32 + metricsReceived *stats.Counter32 + MetricsDecodeErr *stats.Counter32 // metric metrics_decode_err is a count of times an input message (MetricData, MetricDataArray or carbon line) failed to parse + MetricInvalid *stats.Counter32 // metric metric_invalid is a count of times a metric did not validate + MsgsAge *stats.Meter32 // in ms + pressureIdx *stats.Counter32 + pressureTank *stats.Counter32 metrics mdata.Metrics metricIndex idx.MetricIndex usage *usage.Usage } -func New(metrics mdata.Metrics, metricIndex idx.MetricIndex, usage *usage.Usage, input string, stats met.Backend) Input { +func New(metrics mdata.Metrics, metricIndex idx.MetricIndex, usage *usage.Usage, input string) Input { return Input{ - MetricsPerMessage: stats.NewMeter(fmt.Sprintf("%s.metrics_per_message", input), 0), - metricsReceived: stats.NewCount(fmt.Sprintf("%s.metrics_received", input)), - MetricsDecodeErr: stats.NewCount(fmt.Sprintf("%s.metrics_decode_err", input)), - MetricInvalid: stats.NewCount(fmt.Sprintf("%s.metric_invalid", input)), - MsgsAge: stats.NewMeter(fmt.Sprintf("%s.message_age", input), 0), + MetricsPerMessage: stats.NewMeter32(fmt.Sprintf("input.%s.metrics_per_message", input), false), + metricsReceived: stats.NewCounter32(fmt.Sprintf("input.%s.metrics_received", input)), + MetricsDecodeErr: stats.NewCounter32(fmt.Sprintf("input.%s.metrics_decode_err", input)), + MetricInvalid: stats.NewCounter32(fmt.Sprintf("input.%s.metric_invalid", input)), + MsgsAge: stats.NewMeter32(fmt.Sprintf("input.%s.message_age", input), false), + pressureIdx: stats.NewCounter32(fmt.Sprintf("input.%s.pressure.idx", input)), + pressureTank: stats.NewCounter32(fmt.Sprintf("input.%s.pressure.tank", input)), metrics: metrics, metricIndex: metricIndex, @@ -47,21 +54,27 @@ func (in Input) Process(metric *schema.MetricData, partition int32) { if metric == nil { return } - in.metricsReceived.Inc(1) + in.metricsReceived.Inc() err := metric.Validate() if err != nil { - in.MetricInvalid.Inc(1) + in.MetricInvalid.Inc() log.Debug("in: Invalid metric %s %v", err, metric) return } if metric.Time == 0 { + in.MetricInvalid.Inc() log.Warn("in: invalid metric. metric.Time is 0. 
%s", metric.Id) } else { + pre := time.Now() in.metricIndex.Add(metric, partition) + in.pressureIdx.Add(int(time.Since(pre).Nanoseconds())) + + pre = time.Now() m := in.metrics.GetOrCreate(metric.Id) m.Add(uint32(metric.Time), metric.Value) if in.usage != nil { in.usage.Add(metric.OrgId, metric.Id) } + in.pressureTank.Add(int(time.Since(pre).Nanoseconds())) } } diff --git a/input/input_test.go b/input/input_test.go index 9ca524f6bf..2a4a362c6d 100644 --- a/input/input_test.go +++ b/input/input_test.go @@ -6,24 +6,22 @@ import ( "time" "github.com/benbjohnson/clock" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/idx/memory" "github.com/raintank/metrictank/mdata" + "github.com/raintank/metrictank/stats" "github.com/raintank/metrictank/usage" "gopkg.in/raintank/schema.v1" ) func Test_Process(t *testing.T) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") cluster.Init("default", "test", time.Now()) - mdata.InitMetrics(stats) store := mdata.NewDevnullStore() aggmetrics := mdata.NewAggMetrics(store, 600, 10, 800, 8000, 10000, 0, make([]mdata.AggSetting, 0)) metricIndex := memory.New() - metricIndex.Init(stats) + metricIndex.Init() usage := usage.New(300, aggmetrics, metricIndex, clock.New()) - in := New(aggmetrics, metricIndex, usage, "test", stats) + in := New(aggmetrics, metricIndex, usage, "TestProcess") allMetrics := make(map[string]int) for i := 0; i < 5; i++ { @@ -87,16 +85,15 @@ func test_Process(worker int, in *Input, t *testing.T) map[string]int { } func BenchmarkProcess(b *testing.B) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") + stats.Clear() cluster.Init("default", "test", time.Now()) - mdata.InitMetrics(stats) store := mdata.NewDevnullStore() aggmetrics := mdata.NewAggMetrics(store, 600, 10, 800, 8000, 10000, 0, make([]mdata.AggSetting, 0)) metricIndex := memory.New() - metricIndex.Init(stats) + metricIndex.Init() usage := usage.New(300, aggmetrics, metricIndex, clock.New()) - in := New(aggmetrics, metricIndex, usage, "test", stats) + in := New(aggmetrics, metricIndex, usage, "BenchmarkProcess") // timestamps start at 10 and go up from there. (we can't use 0, see AggMetric.Add()) datas := make([]*schema.MetricData, b.N) diff --git a/input/kafkamdm/kafkamdm.go b/input/kafkamdm/kafkamdm.go index 8a406e3c90..ec096951ca 100644 --- a/input/kafkamdm/kafkamdm.go +++ b/input/kafkamdm/kafkamdm.go @@ -11,7 +11,6 @@ import ( "github.com/raintank/worldping-api/pkg/log" "github.com/rakyll/globalconf" - "github.com/raintank/met" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/idx" "github.com/raintank/metrictank/input" @@ -25,7 +24,6 @@ type KafkaMdm struct { input.Input consumer sarama.Consumer client sarama.Client - stats met.Backend wg sync.WaitGroup @@ -186,7 +184,7 @@ Iter: return diff } -func New(stats met.Backend) *KafkaMdm { +func New() *KafkaMdm { client, err := sarama.NewClient(brokers, config) if err != nil { log.Fatal(4, "kafka-mdm failed to create client. 
%s", err) @@ -199,7 +197,6 @@ func New(stats met.Backend) *KafkaMdm { k := KafkaMdm{ consumer: consumer, client: client, - stats: stats, stopConsuming: make(chan struct{}), } @@ -207,7 +204,7 @@ func New(stats met.Backend) *KafkaMdm { } func (k *KafkaMdm) Start(metrics mdata.Metrics, metricIndex idx.MetricIndex, usg *usage.Usage) { - k.Input = input.New(metrics, metricIndex, usg, "kafka-mdm", k.stats) + k.Input = input.New(metrics, metricIndex, usg, "kafka-mdm") var err error for _, topic := range topics { for _, partition := range partitions { @@ -270,11 +267,11 @@ func (k *KafkaMdm) handleMsg(data []byte, partition int32) { md := schema.MetricData{} _, err := md.UnmarshalMsg(data) if err != nil { - k.Input.MetricsDecodeErr.Inc(1) + k.Input.MetricsDecodeErr.Inc() log.Error(3, "kafka-mdm decode error, skipping message. %s", err) return } - k.Input.MetricsPerMessage.Value(int64(1)) + k.Input.MetricsPerMessage.ValueUint32(1) k.Input.Process(&md, partition) } diff --git a/input/kafkamdm/kafkamdm_test.go b/input/kafkamdm/kafkamdm_test.go index c0a41c691c..6ffdc48a21 100644 --- a/input/kafkamdm/kafkamdm_test.go +++ b/input/kafkamdm/kafkamdm_test.go @@ -6,7 +6,6 @@ import ( "time" "github.com/benbjohnson/clock" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/idx/memory" "github.com/raintank/metrictank/input" @@ -17,16 +16,14 @@ import ( ) func Test_HandleMessage(t *testing.T) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") cluster.Init("default", "test", time.Now()) - mdata.InitMetrics(stats) store := mdata.NewDevnullStore() aggmetrics := mdata.NewAggMetrics(store, 600, 10, 800, 8000, 10000, 0, make([]mdata.AggSetting, 0)) metricIndex := memory.New() - metricIndex.Init(stats) + metricIndex.Init() usage := usage.New(300, aggmetrics, metricIndex, clock.New()) k := KafkaMdm{ - Input: input.New(aggmetrics, metricIndex, usage, "test", stats), + Input: input.New(aggmetrics, metricIndex, usage, "test"), } allMetrics := make(map[string]int) diff --git a/mdata/aggmetric.go b/mdata/aggmetric.go index 1630e11927..3d08c08457 100644 --- a/mdata/aggmetric.go +++ b/mdata/aggmetric.go @@ -419,7 +419,7 @@ func (a *AggMetric) Add(ts uint32, val float64) { t0 := ts - (ts % a.ChunkSpan) if len(a.Chunks) == 0 { - chunkCreate.Inc(1) + chunkCreate.Inc() // no data has been added to this metric at all. a.Chunks = append(a.Chunks, chunk.New(t0)) @@ -443,7 +443,7 @@ func (a *AggMetric) Add(ts uint32, val float64) { if currentChunk.Saving { // if we're already saving the chunk, it means it has the end-of-stream marker and any new points behind it wouldn't be read by an iterator // you should monitor this metric closely, it indicates that maybe your GC settings don't match how you actually send data (too late) - addToSavingChunk.Inc(1) + addToSavingChunk.Inc() return } @@ -453,17 +453,17 @@ func (a *AggMetric) Add(ts uint32, val float64) { // typically this happens when non-primaries receive metrics that the primary already saved (maybe cause their metrics consumer is laggy) // we allow adding data to such chunks in that case, though this open the possibility for data to be rejected by the primary, to be // visible on secondaries. - addToSavedChunk.Inc(1) + addToSavedChunk.Inc() } } else { log.Debug("AM failed to add metric to chunk for %s. 
%s", a.Key, err) - metricsTooOld.Inc(1) + metricsTooOld.Inc() return } log.Debug("AM %s Add(): pushed new value to last chunk: %v", a.Key, a.Chunks[0]) } else if t0 < currentChunk.T0 { log.Debug("AM Point at %d has t0 %d, goes back into previous chunk. CurrentChunk t0: %d, LastTs: %d", ts, t0, currentChunk.T0, currentChunk.LastTs) - metricsTooOld.Inc(1) + metricsTooOld.Inc() return } else { // persist the chunk. If the writeQueue is full, then this will block. @@ -474,7 +474,7 @@ func (a *AggMetric) Add(ts uint32, val float64) { a.CurrentChunkPos = 0 } - chunkCreate.Inc(1) + chunkCreate.Inc() if len(a.Chunks) < int(a.NumChunks) { a.Chunks = append(a.Chunks, chunk.New(t0)) if err := a.Chunks[a.CurrentChunkPos].Push(ts, val); err != nil { @@ -482,7 +482,7 @@ func (a *AggMetric) Add(ts uint32, val float64) { } log.Debug("AM %s Add(): added new chunk to buffer. now %d chunks. and added the new point: %s", a.Key, a.CurrentChunkPos+1, a.Chunks[a.CurrentChunkPos]) } else { - chunkClear.Inc(1) + chunkClear.Inc() a.Chunks[a.CurrentChunkPos].Clear() a.Chunks[a.CurrentChunkPos] = chunk.New(t0) if err := a.Chunks[a.CurrentChunkPos].Push(ts, val); err != nil { diff --git a/mdata/aggmetric_test.go b/mdata/aggmetric_test.go index 4997a77dbb..6a2ac27b5e 100644 --- a/mdata/aggmetric_test.go +++ b/mdata/aggmetric_test.go @@ -5,7 +5,6 @@ import ( "testing" "time" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/cluster" ) @@ -72,9 +71,7 @@ func (c *Checker) Verify(primary bool, from, to, first, last uint32) { } func TestAggMetric(t *testing.T) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") cluster.Init("default", "test", time.Now()) - InitMetrics(stats) c := NewChecker(t, NewAggMetric(dnstore, "foo", 100, 5, 1, []AggSetting{}...)) @@ -153,8 +150,6 @@ func TestAggMetric(t *testing.T) { // TODO update once we clean old data, then we should look at numChunks func BenchmarkAggMetrics1000Metrics1Day(b *testing.B) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") - InitMetrics(stats) cluster.Init("default", "test", time.Now()) // we will store 10s metrics in 5 chunks of 2 hours // aggragate them in 5min buckets, stored in 1 chunk of 24hours @@ -177,7 +172,6 @@ func BenchmarkAggMetrics1000Metrics1Day(b *testing.B) { } metrics := NewAggMetrics(dnstore, chunkSpan, numChunks, chunkMaxStale, metricMaxStale, ttl, 0, aggSettings) - defer metrics.Stop() maxT := 3600 * 24 * uint32(b.N) // b.N in days for t := uint32(1); t < maxT; t += 10 { @@ -189,8 +183,6 @@ func BenchmarkAggMetrics1000Metrics1Day(b *testing.B) { } func BenchmarkAggMetrics1kSeries2Chunks1kQueueSize(b *testing.B) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") - InitMetrics(stats) chunkSpan := uint32(600) numChunks := uint32(5) @@ -214,7 +206,6 @@ func BenchmarkAggMetrics1kSeries2Chunks1kQueueSize(b *testing.B) { } metrics := NewAggMetrics(dnstore, chunkSpan, numChunks, chunkMaxStale, metricMaxStale, ttl, 0, aggSettings) - defer metrics.Stop() maxT := uint32(1200) for t := uint32(1); t < maxT; t += 10 { @@ -226,9 +217,6 @@ func BenchmarkAggMetrics1kSeries2Chunks1kQueueSize(b *testing.B) { } func BenchmarkAggMetrics10kSeries2Chunks10kQueueSize(b *testing.B) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") - InitMetrics(stats) - chunkSpan := uint32(600) numChunks := uint32(5) chunkMaxStale := uint32(3600) @@ -251,7 +239,6 @@ func BenchmarkAggMetrics10kSeries2Chunks10kQueueSize(b *testing.B) { } metrics := NewAggMetrics(dnstore, chunkSpan, numChunks, chunkMaxStale, 
metricMaxStale, ttl, 0, aggSettings) - defer metrics.Stop() maxT := uint32(1200) for t := uint32(1); t < maxT; t += 10 { @@ -263,9 +250,6 @@ func BenchmarkAggMetrics10kSeries2Chunks10kQueueSize(b *testing.B) { } func BenchmarkAggMetrics100kSeries2Chunks100kQueueSize(b *testing.B) { - stats, _ := helper.New(false, "", "standard", "metrictank", "") - InitMetrics(stats) - chunkSpan := uint32(600) numChunks := uint32(5) chunkMaxStale := uint32(3600) @@ -288,7 +272,6 @@ func BenchmarkAggMetrics100kSeries2Chunks100kQueueSize(b *testing.B) { } metrics := NewAggMetrics(dnstore, chunkSpan, numChunks, chunkMaxStale, metricMaxStale, ttl, 0, aggSettings) - defer metrics.Stop() maxT := uint32(1200) for t := uint32(1); t < maxT; t += 10 { diff --git a/mdata/aggmetrics.go b/mdata/aggmetrics.go index 78acb6e3a4..3e21df87de 100644 --- a/mdata/aggmetrics.go +++ b/mdata/aggmetrics.go @@ -19,7 +19,6 @@ type AggMetrics struct { metricMaxStale uint32 ttl uint32 gcInterval time.Duration - tickStats *time.Ticker } func NewAggMetrics(store Store, chunkSpan, numChunks, chunkMaxStale, metricMaxStale uint32, ttl uint32, gcInterval time.Duration, aggSettings []AggSetting) *AggMetrics { @@ -33,10 +32,8 @@ func NewAggMetrics(store Store, chunkSpan, numChunks, chunkMaxStale, metricMaxSt metricMaxStale: metricMaxStale, ttl: ttl, gcInterval: gcInterval, - tickStats: time.NewTicker(time.Duration(1) * time.Second), } - go ms.stats() // gcInterval = 0 can be useful in tests if gcInterval > 0 { go ms.GC() @@ -68,7 +65,7 @@ func (ms *AggMetrics) GC() { } ms.RUnlock() for _, key := range keys { - gcMetric.Inc(1) + gcMetric.Inc() ms.RLock() a := ms.Metrics[key] ms.RUnlock() @@ -76,6 +73,7 @@ func (ms *AggMetrics) GC() { log.Info("metric %s is stale. Purging data from memory.", key) ms.Lock() delete(ms.Metrics, key) + metricsActive.Set(len(ms.Metrics)) ms.Unlock() } } @@ -83,14 +81,6 @@ func (ms *AggMetrics) GC() { } } -func (ms *AggMetrics) stats() { - for range ms.tickStats.C { - ms.RLock() - metricsActive.Value(int64(len(ms.Metrics))) - ms.RUnlock() - } -} - func (ms *AggMetrics) Get(key string) (Metric, bool) { ms.RLock() m, ok := ms.Metrics[key] @@ -98,19 +88,13 @@ func (ms *AggMetrics) Get(key string) (Metric, bool) { return m, ok } -// closes the stats reporting. Not the GC. -// this is meant to be used by unit tests, which don't even start the GC -// we can adjust this later as needed -func (ms *AggMetrics) Stop() { - ms.tickStats.Stop() -} - func (ms *AggMetrics) GetOrCreate(key string) Metric { ms.Lock() m, ok := ms.Metrics[key] if !ok { m = NewAggMetric(ms.store, key, ms.chunkSpan, ms.numChunks, ms.ttl, ms.aggSettings...) ms.Metrics[key] = m + metricsActive.Set(len(ms.Metrics)) } ms.Unlock() return m diff --git a/mdata/chunk/chunk.go b/mdata/chunk/chunk.go index 41e9365b4d..19130619f2 100644 --- a/mdata/chunk/chunk.go +++ b/mdata/chunk/chunk.go @@ -2,13 +2,14 @@ package chunk import ( "fmt" - "sync/atomic" "time" "github.com/dgryski/go-tsz" + "github.com/raintank/metrictank/stats" ) -var totalPoints uint64 +// metric tank.total_points is the number of points currently held in the in-memory ringbuffer +var totalPoints = stats.NewGauge64("tank.total_points") // Chunk is a chunk of data. not concurrency safe. 
type Chunk struct { @@ -38,13 +39,10 @@ func (c *Chunk) Push(t uint32, v float64) error { c.NumPoints += 1 c.LastTs = t c.LastWrite = uint32(time.Now().Unix()) - atomic.AddUint64(&totalPoints, 1) + totalPoints.Inc() return nil } -func (c *Chunk) Clear() { - atomic.AddUint64(&totalPoints, ^uint64(c.NumPoints-1)) -} -func TotalPoints() uint64 { - return atomic.LoadUint64(&totalPoints) +func (c *Chunk) Clear() { + totalPoints.DecUint64(uint64(c.NumPoints)) } diff --git a/mdata/init.go b/mdata/init.go index 20450b0432..b1354349f0 100644 --- a/mdata/init.go +++ b/mdata/init.go @@ -3,48 +3,43 @@ // save states over the network package mdata -import "github.com/raintank/met" +import "github.com/raintank/metrictank/stats" var ( LogLevel int - chunkCreate met.Count - chunkClear met.Count + // metric tank.chunk_operations.create is a counter of how many chunks are created + chunkCreate = stats.NewCounter32("tank.chunk_operations.create") - // metric metrics_too_old is points that go back in time. + // metric tank.chunk_operations.clear is a counter of how many chunks are cleared (replaced by new chunks) + chunkClear = stats.NewCounter32("tank.chunk_operations.clear") + + // metric tank.metrics_too_old is points that go back in time. // E.g. for any given series, when a point has a timestamp // that is not higher than the timestamp of the last written timestamp for that series. - metricsTooOld met.Count + metricsTooOld = stats.NewCounter32("tank.metrics_too_old") - // metric add_to_saving_chunk is points received - by the primary node - for the most recent chunk + // metric tank.add_to_saving_chunk is points received - by the primary node - for the most recent chunk // when that chunk is already being saved (or has been saved). // this indicates that your GC is actively sealing chunks and saving them before you have the chance to send // your (infrequent) updates. The primary won't add them to its in-memory chunks, but secondaries will // (because they are never in "saving" state for them), see below. - addToSavingChunk met.Count + addToSavingChunk = stats.NewCounter32("tank.add_to_saving_chunk") - // metric add_to_saved_chunk is points received - by a secondary node - for the most recent chunk when that chunk + // metric tank.add_to_saved_chunk is points received - by a secondary node - for the most recent chunk when that chunk // has already been saved by a primary. A secondary can add this data to its chunks. 
- addToSavedChunk met.Count - - memToIterDuration met.Timer - persistDuration met.Timer + addToSavedChunk = stats.NewCounter32("tank.add_to_saved_chunk") - metricsActive met.Gauge // metric metrics_active is the amount of currently known metrics (excl rollup series), measured every second - gcMetric met.Count // metric gc_metric is the amount of times the metrics GC is about to inspect a metric (series) -) + // metric mem.to_iter is how long it takes to transform in-memory chunks to iterators + memToIterDuration = stats.NewLatencyHistogram15s32("mem.to_iter") -func InitMetrics(stats met.Backend) { - chunkCreate = stats.NewCount("chunks.create") - chunkClear = stats.NewCount("chunks.clear") + // metric tank.persist is how long it takes to persist a chunk (and chunks preceding it) + // this is subject to backpressure from the store when the store's queue runs full + persistDuration = stats.NewLatencyHistogram15s32("tank.persist") - metricsTooOld = stats.NewCount("metrics_too_old") - addToSavingChunk = stats.NewCount("add_to_saving_chunk") - addToSavedChunk = stats.NewCount("add_to_saved_chunk") + // metric tank.metrics_active is the number of currently known metrics (excl rollup series), measured every second + metricsActive = stats.NewGauge32("tank.metrics_active") - memToIterDuration = stats.NewTimer("mem.to_iter_duration", 0) - persistDuration = stats.NewTimer("persist_duration", 0) - - gcMetric = stats.NewCount("gc_metric") - metricsActive = stats.NewGauge("metrics_active", 0) -} + // metric tank.gc_metric is the number of times the metrics GC is about to inspect a metric (series) + gcMetric = stats.NewCounter32("tank.gc_metric") +) diff --git a/mdata/notifier.go b/mdata/notifier.go index ebd3308e01..cb7552dd1e 100644 --- a/mdata/notifier.go +++ b/mdata/notifier.go @@ -3,7 +3,6 @@ package mdata import ( "encoding/json" - "github.com/raintank/met" "github.com/raintank/worldping-api/pkg/log" ) @@ -42,7 +41,7 @@ func SendPersistMessage(key string, t0 uint32) { } } -func InitPersistNotifier(stats met.Backend, handlers ...NotifierHandler) { +func InitPersistNotifier(handlers ...NotifierHandler) { notifierHandlers = handlers } diff --git a/mdata/notifierKafka/cfg.go b/mdata/notifierKafka/cfg.go index b000572a64..c2ea0b1be8 100644 --- a/mdata/notifierKafka/cfg.go +++ b/mdata/notifierKafka/cfg.go @@ -7,7 +7,7 @@ import ( "time" "github.com/Shopify/sarama" - "github.com/raintank/met" + "github.com/raintank/metrictank/stats" "github.com/rakyll/globalconf" ) @@ -21,8 +21,8 @@ var config *sarama.Config var offsetDuration time.Duration var offsetCommitInterval time.Duration -var messagesPublished met.Count -var messagesSize met.Meter +var messagesPublished *stats.Counter32 +var messagesSize *stats.Meter32 func ConfigSetup() { fs := flag.NewFlagSet("kafka-cluster", flag.ExitOnError) diff --git a/mdata/notifierKafka/notifierKafka.go b/mdata/notifierKafka/notifierKafka.go index 18ccf119c8..e36b82b6da 100644 --- a/mdata/notifierKafka/notifierKafka.go +++ b/mdata/notifierKafka/notifierKafka.go @@ -8,9 +8,9 @@ import ( "time" "github.com/Shopify/sarama" - "github.com/raintank/met" "github.com/raintank/metrictank/kafka" "github.com/raintank/metrictank/mdata" + "github.com/raintank/metrictank/stats" "github.com/raintank/worldping-api/pkg/log" ) @@ -29,9 +29,11 @@ type NotifierKafka struct { mdata.Notifier } -func New(instance string, metrics mdata.Metrics, stats met.Backend) *NotifierKafka { - messagesPublished = stats.NewCount("notifier.kafka.messages-published") - messagesSize = 
stats.NewMeter("notifier.kafka.message_size", 0) +func New(instance string, metrics mdata.Metrics) *NotifierKafka { + // metric cluster.notifier.kafka.messages-published is a counter of messages published to the kafka cluster notifier + messagesPublished = stats.NewCounter32("cluster.notifier.kafka.messages-published") + // metric cluster.notifier.kafka.message_size is the sizes seen of messages through the kafka cluster notifier + messagesSize = stats.NewMeter32("cluster.notifier.kafka.message_size", false) client, err := sarama.NewClient(brokers, config) if err != nil { @@ -187,7 +189,7 @@ func (c *NotifierKafka) flush() { buf := new(bytes.Buffer) binary.Write(buf, binary.LittleEndian, uint8(mdata.PersistMessageBatchV1)) buf.Write(data) - messagesSize.Value(int64(buf.Len())) + messagesSize.Value(buf.Len()) payload := &sarama.ProducerMessage{ Topic: topic, Value: sarama.ByteEncoder(buf.Bytes()), @@ -204,6 +206,6 @@ func (c *NotifierKafka) flush() { } time.Sleep(time.Second) } - messagesPublished.Inc(1) + messagesPublished.Inc() }() } diff --git a/mdata/notifierNsq/cfg.go b/mdata/notifierNsq/cfg.go index f52ea27e62..c003e78308 100644 --- a/mdata/notifierNsq/cfg.go +++ b/mdata/notifierNsq/cfg.go @@ -12,7 +12,7 @@ import ( "strings" "github.com/nsqio/go-nsq" - "github.com/raintank/met" + "github.com/raintank/metrictank/stats" "github.com/raintank/misc/app" "github.com/rakyll/globalconf" ) @@ -30,8 +30,8 @@ var ( consumerOpts string pCfg *nsq.Config cCfg *nsq.Config - messagesPublished met.Count - messagesSize met.Meter + messagesPublished *stats.Counter32 + messagesSize *stats.Meter32 ) func ConfigSetup() { diff --git a/mdata/notifierNsq/instrumented_nsq/consumer.go b/mdata/notifierNsq/instrumented_nsq/consumer.go new file mode 100644 index 0000000000..e06c9018ea --- /dev/null +++ b/mdata/notifierNsq/instrumented_nsq/consumer.go @@ -0,0 +1,48 @@ +package insq + +import ( + "fmt" + "time" + + "github.com/nsqio/go-nsq" + "github.com/raintank/metrictank/stats" +) + +type Consumer struct { + *nsq.Consumer + msgsReceived *stats.Counter64 + msgsFinished *stats.Counter64 + msgsRequeued *stats.Counter64 + msgsConnections *stats.Gauge32 + numHandlers *stats.Gauge32 +} + +func NewConsumer(topic, channel string, config *nsq.Config, metricsPatt string) (*Consumer, error) { + consumer, err := nsq.NewConsumer(topic, channel, config) + if err != nil { + return nil, err + } + c := Consumer{ + consumer, + stats.NewCounter64(fmt.Sprintf(metricsPatt, "received")), + stats.NewCounter64(fmt.Sprintf(metricsPatt, "finished")), + stats.NewCounter64(fmt.Sprintf(metricsPatt, "requeued")), + stats.NewGauge32(fmt.Sprintf(metricsPatt, "connections")), + stats.NewGauge32(fmt.Sprintf(metricsPatt, "num_handlers")), + } + go func() { + for range time.Tick(time.Second) { + s := consumer.Stats() + c.msgsReceived.SetUint64(s.MessagesReceived) + c.msgsFinished.SetUint64(s.MessagesFinished) + c.msgsRequeued.SetUint64(s.MessagesRequeued) + c.msgsConnections.Set(s.Connections) + } + }() + return &c, nil +} + +func (r *Consumer) AddConcurrentHandlers(handler nsq.Handler, concurrency int) { + r.numHandlers.Add(concurrency) + r.Consumer.AddConcurrentHandlers(handler, concurrency) +} diff --git a/mdata/notifierNsq/notifierNsq.go b/mdata/notifierNsq/notifierNsq.go index 1b48680d55..edf98dc521 100644 --- a/mdata/notifierNsq/notifierNsq.go +++ b/mdata/notifierNsq/notifierNsq.go @@ -8,9 +8,9 @@ import ( "github.com/bitly/go-hostpool" "github.com/nsqio/go-nsq" - "github.com/raintank/met" "github.com/raintank/metrictank/mdata" - 
"github.com/raintank/misc/instrumented_nsq" + "github.com/raintank/metrictank/mdata/notifierNsq/instrumented_nsq" + "github.com/raintank/metrictank/stats" "github.com/raintank/worldping-api/pkg/log" ) @@ -26,9 +26,11 @@ type NotifierNSQ struct { mdata.Notifier } -func New(instance string, metrics mdata.Metrics, stats met.Backend) *NotifierNSQ { - messagesPublished = stats.NewCount("notifier.nsq.messages-published") - messagesSize = stats.NewMeter("notifier.nsq.message_size", 0) +func New(instance string, metrics mdata.Metrics) *NotifierNSQ { + // metric cluster.notifier.nsq.messages-published is a counter of messages published to the nsq cluster notifier + messagesPublished = stats.NewCounter32("cluster.notifier.nsq.messages-published") + // metric cluster.notifier.nsq.message_size is the sizes seen of messages through the nsq cluster notifier + messagesSize = stats.NewMeter32("cluster.notifier.nsq.message_size", false) // producers hostPool = hostpool.NewEpsilonGreedy(nsqdAdds, 0, &hostpool.LinearEpsilonValueCalculator{}) producers = make(map[string]*nsq.Producer) @@ -42,7 +44,7 @@ func New(instance string, metrics mdata.Metrics, stats met.Backend) *NotifierNSQ } // consumers - consumer, err := insq.NewConsumer(topic, channel, cCfg, "metric_persist.%s", stats) + consumer, err := insq.NewConsumer(topic, channel, cCfg, "cluster.notifier.nsq.metric_persist.%s") if err != nil { log.Fatal(4, "nsq-cluster failed to create NSQ consumer. %s", err) } @@ -114,7 +116,7 @@ func (c *NotifierNSQ) flush() { buf := new(bytes.Buffer) binary.Write(buf, binary.LittleEndian, uint8(mdata.PersistMessageBatchV1)) buf.Write(data) - messagesSize.Value(int64(buf.Len())) + messagesSize.Value(buf.Len()) sent := false for !sent { @@ -134,6 +136,6 @@ func (c *NotifierNSQ) flush() { } time.Sleep(time.Second) } - messagesPublished.Inc(1) + messagesPublished.Inc() }() } diff --git a/mdata/store_cassandra.go b/mdata/store_cassandra.go index d9b01dfbba..07ef9d1c35 100644 --- a/mdata/store_cassandra.go +++ b/mdata/store_cassandra.go @@ -12,10 +12,10 @@ import ( "github.com/dgryski/go-tsz" "github.com/gocql/gocql" "github.com/hailocab/go-hostpool" - "github.com/raintank/met" "github.com/raintank/metrictank/cassandra" "github.com/raintank/metrictank/iter" "github.com/raintank/metrictank/mdata/chunk" + "github.com/raintank/metrictank/stats" "github.com/raintank/worldping-api/pkg/log" ) @@ -38,21 +38,34 @@ var ( errUnknownChunkFormat = errors.New("unrecognized chunk format in cassandra") errStartBeforeEnd = errors.New("start must be before end.") - cassGetExecDuration met.Timer - cassGetWaitDuration met.Timer - cassPutExecDuration met.Timer - cassPutWaitDuration met.Timer - - cassChunksPerRow met.Meter - cassRowsPerResponse met.Meter - cassGetChunksDuration met.Timer - cassToIterDuration met.Timer - - chunkSaveOk met.Count - chunkSaveFail met.Count - // it's pretty expensive/impossible to do chunk size in mem vs in cassandra etc, but we can more easily measure chunk sizes when we operate on them - chunkSizeAtSave met.Meter - chunkSizeAtLoad met.Meter + // metric store.cassandra.get.exec is the duration of getting from cassandra store + cassGetExecDuration = stats.NewLatencyHistogram15s32("store.cassandra.get.exec") + // metric store.cassandra.get.wait is the duration of the get spent in the queue + cassGetWaitDuration = stats.NewLatencyHistogram12h32("store.cassandra.get.wait") + // metric store.cassandra.put.exec is the duration of putting in cassandra store + cassPutExecDuration = 
stats.NewLatencyHistogram15s32("store.cassandra.put.exec") + // metric store.cassandra.put.wait is the duration of a put in the wait queue + cassPutWaitDuration = stats.NewLatencyHistogram12h32("store.cassandra.put.wait") + + // metric store.cassandra.chunks_per_row is how many chunks are retrieved per row in get queries + cassChunksPerRow = stats.NewMeter32("store.cassandra.chunks_per_row", false) + // metric store.cassandra.rows_per_response is how many rows come per get response + cassRowsPerResponse = stats.NewMeter32("store.cassandra.rows_per_response", false) + // metric store.cassandra.get_chunks is the duration of how long it takes to get chunks + cassGetChunksDuration = stats.NewLatencyHistogram15s32("store.cassandra.get_chunks") + // metric store.cassandra.to_iter is the duration of converting chunks to iterators + cassToIterDuration = stats.NewLatencyHistogram15s32("store.cassandra.to_iter") + + // metric store.cassandra.chunk_operations.save_ok is counter of successfull saves + chunkSaveOk = stats.NewCounter32("store.cassandra.chunk_operations.save_ok") + // metric store.cassandra.chunk_operations.save_fail is counter of failed saves + chunkSaveFail = stats.NewCounter32("store.cassandra.chunk_operations.save_fail") + // metric store.cassandra.chunk_size.at_save is the sizes of chunks seen when saving them + chunkSizeAtSave = stats.NewMeter32("store.cassandra.chunk_size.at_save", true) + // metric store.cassandra.chunk_size.at_load is the sizes of chunks seen when loading them + chunkSizeAtLoad = stats.NewMeter32("store.cassandra.chunk_size.at_load", true) + + errmetrics = cassandra.NewErrMetrics("store.cassandra") ) /* @@ -63,14 +76,16 @@ object to interact with the whole Cassandra cluster. */ type cassandraStore struct { - session *gocql.Session - writeQueues []chan *ChunkWriteRequest - readQueue chan *ChunkReadRequest - writeQueueMeters []met.Meter - metrics cassandra.Metrics + session *gocql.Session + writeQueues []chan *ChunkWriteRequest + readQueue chan *ChunkReadRequest } -func NewCassandraStore(stats met.Backend, addrs, keyspace, consistency, CaPath, Username, Password, hostSelectionPolicy string, timeout, readers, writers, readqsize, writeqsize, retries, protoVer int, ssl, auth, hostVerification bool) (*cassandraStore, error) { +func NewCassandraStore(addrs, keyspace, consistency, CaPath, Username, Password, hostSelectionPolicy string, timeout, readers, writers, readqsize, writeqsize, retries, protoVer int, ssl, auth, hostVerification bool) (*cassandraStore, error) { + + stats.NewGauge32("store.cassandra.write_queue.size").Set(writeqsize) + stats.NewGauge32("store.cassandra.num_writers").Set(writers) + cluster := gocql.NewCluster(strings.Split(addrs, ",")...) 
if ssl { cluster.SslOpts = &gocql.SslOptions{ @@ -139,16 +154,15 @@ func NewCassandraStore(stats met.Backend, addrs, keyspace, consistency, CaPath, } log.Debug("CS: created session to %s keysp %s cons %v with policy %s timeout %d readers %d writers %d readq %d writeq %d retries %d proto %d ssl %t auth %t hostverif %t", addrs, keyspace, consistency, hostSelectionPolicy, timeout, readers, writers, readqsize, writeqsize, retries, protoVer, ssl, auth, hostVerification) c := &cassandraStore{ - session: session, - writeQueues: make([]chan *ChunkWriteRequest, writers), - readQueue: make(chan *ChunkReadRequest, readqsize), - writeQueueMeters: make([]met.Meter, writers), + session: session, + writeQueues: make([]chan *ChunkWriteRequest, writers), + readQueue: make(chan *ChunkReadRequest, readqsize), } for i := 0; i < writers; i++ { c.writeQueues[i] = make(chan *ChunkWriteRequest, writeqsize) - c.writeQueueMeters[i] = stats.NewMeter(fmt.Sprintf("cassandra.write_queue.%d.items", i+1), 0) - go c.processWriteQueue(c.writeQueues[i], c.writeQueueMeters[i]) + queueMeter := stats.NewRange32(fmt.Sprintf("store.cassandra.write_queue.%d.items", i+1)) + go c.processWriteQueue(c.writeQueues[i], queueMeter) } for i := 0; i < readers; i++ { @@ -157,53 +171,31 @@ func NewCassandraStore(stats met.Backend, addrs, keyspace, consistency, CaPath, return c, err } - -func (c *cassandraStore) InitMetrics(stats met.Backend) { - cassGetExecDuration = stats.NewTimer("cassandra.get.exec", 0) - cassGetWaitDuration = stats.NewTimer("cassandra.get.wait", 0) - cassPutExecDuration = stats.NewTimer("cassandra.put.exec", 0) - cassPutWaitDuration = stats.NewTimer("cassandra.put.wait", 0) - - cassChunksPerRow = stats.NewMeter("cassandra.chunks_per_row", 0) - cassRowsPerResponse = stats.NewMeter("cassandra.rows_per_response", 0) - cassGetChunksDuration = stats.NewTimer("cassandra.get_chunks", 0) - cassToIterDuration = stats.NewTimer("cassandra.to_iter", 0) - - chunkSaveOk = stats.NewCount("chunks.save_ok") - chunkSaveFail = stats.NewCount("chunks.save_fail") - chunkSizeAtSave = stats.NewMeter("chunk_size.at_save", 0) - chunkSizeAtLoad = stats.NewMeter("chunk_size.at_load", 0) - - c.metrics = cassandra.NewMetrics("cassandra", stats) -} - func (c *cassandraStore) Add(cwr *ChunkWriteRequest) { sum := 0 for _, char := range cwr.key { sum += int(char) } which := sum % len(c.writeQueues) - c.writeQueueMeters[which].Value(int64(len(c.writeQueues[which]))) c.writeQueues[which] <- cwr - c.writeQueueMeters[which].Value(int64(len(c.writeQueues[which]))) } /* process writeQueue. 
*/ -func (c *cassandraStore) processWriteQueue(queue chan *ChunkWriteRequest, meter met.Meter) { +func (c *cassandraStore) processWriteQueue(queue chan *ChunkWriteRequest, meter *stats.Range32) { tick := time.Tick(time.Duration(1) * time.Second) for { select { case <-tick: - meter.Value(int64(len(queue))) + meter.Value(len(queue)) case cwr := <-queue: - meter.Value(int64(len(queue))) + meter.Value(len(queue)) log.Debug("CS: starting to save %s:%d %v", cwr.key, cwr.chunk.T0, cwr.chunk) //log how long the chunk waited in the queue before we attempted to save to cassandra cassPutWaitDuration.Value(time.Now().Sub(cwr.timestamp)) data := cwr.chunk.Series.Bytes() - chunkSizeAtSave.Value(int64(len(data))) + chunkSizeAtSave.Value(len(data)) version := chunk.FormatStandardGoTszWithSpan buf := new(bytes.Buffer) binary.Write(buf, binary.LittleEndian, version) @@ -224,13 +216,13 @@ func (c *cassandraStore) processWriteQueue(queue chan *ChunkWriteRequest, meter cwr.chunk.Saved = true SendPersistMessage(cwr.key, cwr.chunk.T0) log.Debug("CS: save complete. %s:%d %v", cwr.key, cwr.chunk.T0, cwr.chunk) - chunkSaveOk.Inc(1) + chunkSaveOk.Inc() } else { - c.metrics.Inc(err) + errmetrics.Inc(err) if (attempts % 20) == 0 { log.Warn("CS: failed to save chunk to cassandra after %d attempts. %v, %s", attempts+1, cwr.chunk, err) } - chunkSaveFail.Inc(1) + chunkSaveFail.Inc() sleepTime := 100 * attempts if sleepTime > 2000 { sleepTime = 2000 @@ -361,7 +353,7 @@ func (c *cassandraStore) Search(key string, start, end uint32) ([]iter.Iter, err chunks := int64(0) for outcome.i.Scan(&ts, &b) { chunks += 1 - chunkSizeAtLoad.Value(int64(len(b))) + chunkSizeAtLoad.Value(len(b)) if len(b) < 2 { log.Error(3, errChunkTooSmall.Error()) return iters, errChunkTooSmall @@ -389,13 +381,13 @@ func (c *cassandraStore) Search(key string, start, end uint32) ([]iter.Iter, err err := outcome.i.Close() if err != nil { log.Error(3, "cassandra query error. %s", err) - c.metrics.Inc(err) + errmetrics.Inc(err) } else { - cassChunksPerRow.Value(chunks) + cassChunksPerRow.Value(int(chunks)) } } cassToIterDuration.Value(time.Now().Sub(pre)) - cassRowsPerResponse.Value(int64(len(outcomes))) + cassRowsPerResponse.Value(len(outcomes)) log.Debug("CS: searchCassandra(): %d outcomes (queries), %d total iters", len(outcomes), len(iters)) return iters, nil } diff --git a/metrictank-sample.ini b/metrictank-sample.ini index 7fd477adc4..5a2bfb96a7 100644 --- a/metrictank-sample.ini +++ b/metrictank-sample.ini @@ -86,20 +86,13 @@ cassandra-username = cassandra # password for authentication cassandra-password = cassandra -## Profiling, instrumentation and logging ## +## Profiling and logging ## # see https://golang.org/pkg/runtime/#SetBlockProfileRate block-profile-rate = 0 # 0 to disable. 1 for max precision (expensive!) see https://golang.org/pkg/runtime/#pkg-variables") mem-profile-rate = 524288 # 512*1024 -# enable sending statsd messages for instrumentation -statsd-enabled = true -# statsd address -statsd-addr = localhost:8125 -# standard or datadog -statsd-type = standard - # inspect status frequency. set to 0 to disable proftrigger-freq = 60s # path to store triggered profiles @@ -113,6 +106,22 @@ proftrigger-heap-thresh = 25000000000 # only log log-level and higher. 
0=TRACE|1=DEBUG|2=INFO|3=WARN|4=ERROR|5=CRITICAL|6=FATAL log-level = 2 +# instrumentation stats +[stats] +# enable sending graphite messages for instrumentation +enabled = true +# stats prefix (will add trailing dot automatically if needed) +# The default matches what the Grafana dashboard expects +# $instance will be replaced with the `instance` setting. +# note, the 3rd word describes the environment you deployed in. +prefix = metrictank.stats.default.$instance +# graphite address +addr = localhost:2003 +# interval at which to send statistics +interval = 1 +# how many messages (holding all measurements from one interval. rule of thumb: a message is ~25kB) to buffer up in case graphite endpoint is unavailable. +# With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed +buffer-size = 20000 ## http api ## [http] diff --git a/metrictank.go b/metrictank.go index e551af337c..6847a526d1 100644 --- a/metrictank.go +++ b/metrictank.go @@ -19,8 +19,6 @@ import ( "github.com/Shopify/sarama" "github.com/benbjohnson/clock" "github.com/raintank/dur" - "github.com/raintank/met" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/api" "github.com/raintank/metrictank/cluster" "github.com/raintank/metrictank/idx" @@ -34,6 +32,8 @@ import ( "github.com/raintank/metrictank/mdata/chunk" "github.com/raintank/metrictank/mdata/notifierKafka" "github.com/raintank/metrictank/mdata/notifierNsq" + "github.com/raintank/metrictank/stats" + statsConfig "github.com/raintank/metrictank/stats/config" "github.com/raintank/metrictank/usage" "github.com/raintank/metrictank/util" "github.com/raintank/worldping-api/pkg/log" @@ -93,45 +93,10 @@ var ( blockProfileRate = flag.Int("block-profile-rate", 0, "see https://golang.org/pkg/runtime/#SetBlockProfileRate") memProfileRate = flag.Int("mem-profile-rate", 512*1024, "0 to disable. 1 for max precision (expensive!) see https://golang.org/pkg/runtime/#pkg-variables") - statsdEnabled = flag.Bool("statsd-enabled", true, "enable sending statsd messages for instrumentation") - statsdAddr = flag.String("statsd-addr", "localhost:8125", "statsd address") - statsdType = flag.String("statsd-type", "standard", "statsd type: standard or datadog") - proftrigPath = flag.String("proftrigger-path", "/tmp", "path to store triggered profiles") proftrigFreqStr = flag.String("proftrigger-freq", "60s", "inspect status frequency. set to 0 to disable") proftrigMinDiffStr = flag.String("proftrigger-min-diff", "1h", "minimum time between triggered profiles") proftrigHeapThresh = flag.Int("proftrigger-heap-thresh", 25000000000, "if this many bytes allocated, trigger a profile") - - cassWriteQueueSize met.Gauge - cassWriters met.Gauge - getTargetDuration met.Timer - itersToPointsDuration met.Timer - messagesSize met.Meter - inItems met.Meter - points met.Gauge - - // metric bytes_alloc.not_freed is a gauge of currently allocated (within the runtime) memory. - // it does not include freed data so it drops at every GC run. - alloc met.Gauge - // metric bytes_alloc.incl_freed is a counter of total amount of bytes allocated during process lifetime. (incl freed data) - totalAlloc met.Gauge - // metric bytes_sys is the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at. 
- sysBytes met.Gauge - clusterPrimary met.Gauge - - // metric cluster.promotion_wait is how long a candidate (secondary node) has to wait until it can become a primary - // When the timer becomes 0 it means the in-memory buffer has been able to fully populate so that if you stop a primary - // and it was able to save its complete chunks, this node will be able to take over without dataloss. - // You can upgrade a candidate to primary while the timer is not 0 yet, it just means it may have missing data in the chunks that it will save. - clusterPromoWait met.Gauge - gcNum met.Gauge // go GC - gcDur met.Gauge // go GC - gcCpuFraction met.Gauge // go GC - - // metric gc.heap_objects is how many objects are allocated on the heap, it's a key indicator for GC workload - heapObjects met.Gauge - - promotionReadyAtChan chan uint32 ) func init() { @@ -184,6 +149,9 @@ func main() { // load config for cluster cluster.ConfigSetup() + // stats + statsConfig.ConfigSetup() + conf.ParseAll() /*********************************** @@ -234,6 +202,7 @@ func main() { notifierKafka.ConfigProcess(*instance) api.ConfigProcess() cluster.ConfigProcess() + statsConfig.ConfigProcess(*instance) if !inCarbon.Enabled && !inKafkaMdm.Enabled { log.Fatal(4, "you should enable at least 1 input plugin") @@ -242,8 +211,6 @@ func main() { sec := dur.MustParseUNsec("warm-up-period", *warmUpPeriodStr) warmupPeriod = time.Duration(sec) * time.Second - promotionReadyAtChan = make(chan uint32) - chunkSpan := dur.MustParseUNsec("chunkspan", *chunkSpanStr) numChunks := uint32(*numChunksInt) chunkMaxStale := dur.MustParseUNsec("chunk-max-stale", *chunkMaxStaleStr) @@ -318,16 +285,9 @@ func main() { signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) /*********************************** - configure StatsD + collect stats ***********************************/ - if !*statsdEnabled { - log.Warn("running metrictank without statsd instrumentation.") - } - stats, err := helper.New(*statsdEnabled, *statsdAddr, *statsdType, "metrictank", *instance) - if err != nil { - log.Fatal(4, "failed to initialize statsd. %s", err) - } - initMetrics(stats) + statsConfig.Start() /************************************* Start polling our Cluster Peers @@ -337,17 +297,15 @@ func main() { /*********************************** Initialize our backendStore ***********************************/ - store, err := mdata.NewCassandraStore(stats, *cassandraAddrs, *cassandraKeyspace, *cassandraConsistency, *cassandraCaPath, *cassandraUsername, *cassandraPassword, *cassandraHostSelectionPolicy, *cassandraTimeout, *cassandraReadConcurrency, *cassandraWriteConcurrency, *cassandraReadQueueSize, *cassandraWriteQueueSize, *cassandraRetries, *cqlProtocolVersion, *cassandraSSL, *cassandraAuth, *cassandraHostVerification) + store, err := mdata.NewCassandraStore(*cassandraAddrs, *cassandraKeyspace, *cassandraConsistency, *cassandraCaPath, *cassandraUsername, *cassandraPassword, *cassandraHostSelectionPolicy, *cassandraTimeout, *cassandraReadConcurrency, *cassandraWriteConcurrency, *cassandraReadQueueSize, *cassandraWriteQueueSize, *cassandraRetries, *cqlProtocolVersion, *cassandraSSL, *cassandraAuth, *cassandraHostVerification) if err != nil { log.Fatal(4, "failed to initialize cassandra. 
%s", err) } - store.InitMetrics(stats) /*********************************** Initialize our MemoryStore ***********************************/ metrics = mdata.NewAggMetrics(store, chunkSpan, numChunks, chunkMaxStale, metricMaxStale, ttl, gcInterval, finalSettings) - mdata.InitMetrics(stats) /*********************************** Initialize our Inputs @@ -355,12 +313,12 @@ func main() { inputs := make([]input.Plugin, 0) // note. all these New functions must either return a valid instance or call log.Fatal if inCarbon.Enabled { - inputs = append(inputs, inCarbon.New(stats)) + inputs = append(inputs, inCarbon.New()) } if inKafkaMdm.Enabled { sarama.Logger = l.New(os.Stdout, "[Sarama] ", l.LstdFlags) - inputs = append(inputs, inKafkaMdm.New(stats)) + inputs = append(inputs, inKafkaMdm.New()) } if cluster.Mode == cluster.ModeMulti && len(inputs) > 1 { @@ -372,14 +330,14 @@ func main() { ***********************************/ handlers := make([]mdata.NotifierHandler, 0) if notifierKafka.Enabled { - handlers = append(handlers, notifierKafka.New(*instance, metrics, stats)) + handlers = append(handlers, notifierKafka.New(*instance, metrics)) } if notifierNsq.Enabled { - handlers = append(handlers, notifierNsq.New(*instance, metrics, stats)) + handlers = append(handlers, notifierNsq.New(*instance, metrics)) } - mdata.InitPersistNotifier(stats, handlers...) + mdata.InitPersistNotifier(handlers...) /*********************************** Initialize our MetricIdx @@ -409,7 +367,7 @@ func main() { log.Fatal(4, "No metricIndex handlers enabled.") } - err = metricIndex.Init(stats) + err = metricIndex.Init() if err != nil { log.Fatal(4, "failed to initialize metricIndex: %s", err) } @@ -427,12 +385,16 @@ func main() { plugin.Start(metrics, metricIndex, usg) } - promotionReadyAtChan <- (uint32(time.Now().Unix())/highestChunkSpan + 1) * highestChunkSpan + // metric cluster.promotion_wait is how long a candidate (secondary node) has to wait until it can become a primary + // When the timer becomes 0 it means the in-memory buffer has been able to fully populate so that if you stop a primary + // and it was able to save its complete chunks, this node will be able to take over without dataloss. + // You can upgrade a candidate to primary while the timer is not 0 yet, it just means it may have missing data in the chunks that it will save. + stats.NewTimeDiffReporter32("cluster.promotion_wait", (uint32(time.Now().Unix())/highestChunkSpan+1)*highestChunkSpan) /*********************************** Initialize our API server ***********************************/ - apiServer, err := api.NewServer(stats) + apiServer, err := api.NewServer() if err != nil { log.Fatal(4, "Failed to start API. 
%s", err.Error()) } @@ -494,65 +456,3 @@ func main() { log.Close() } - -func initMetrics(stats met.Backend) { - cassWriteQueueSize = stats.NewGauge("cassandra.write_queue.size", int64(*cassandraWriteQueueSize)) - cassWriters = stats.NewGauge("cassandra.num_writers", int64(*cassandraWriteConcurrency)) - getTargetDuration = stats.NewTimer("get_target_duration", 0) - itersToPointsDuration = stats.NewTimer("iters_to_points_duration", 0) - messagesSize = stats.NewMeter("message_size", 0) - inItems = stats.NewMeter("in.items", 0) - points = stats.NewGauge("total_points", 0) - alloc = stats.NewGauge("bytes_alloc.not_freed", 0) - totalAlloc = stats.NewGauge("bytes_alloc.incl_freed", 0) - sysBytes = stats.NewGauge("bytes_sys", 0) - clusterPrimary = stats.NewGauge("cluster.primary", 0) - clusterPromoWait = stats.NewGauge("cluster.promotion_wait", 1) - gcNum = stats.NewGauge("gc.num", 0) - gcDur = stats.NewGauge("gc.dur", 0) // in nanoseconds. last known duration. - gcCpuFraction = stats.NewGauge("gc.cpufraction", 0) // reported as pro-mille - heapObjects = stats.NewGauge("gc.heap_objects", 0) - - // run a collector for some global stats - go func() { - var m runtime.MemStats - var promotionReadyAtTs uint32 - - ticker := time.Tick(time.Duration(1) * time.Second) - for { - select { - case now := <-ticker: - points.Value(int64(chunk.TotalPoints())) - runtime.ReadMemStats(&m) - alloc.Value(int64(m.Alloc)) - totalAlloc.Value(int64(m.TotalAlloc)) - sysBytes.Value(int64(m.Sys)) - gcNum.Value(int64(m.NumGC)) - gcDur.Value(int64(m.PauseNs[(m.NumGC+255)%256])) - gcCpuFraction.Value(int64(1000 * m.GCCPUFraction)) - heapObjects.Value(int64(m.HeapObjects)) - var px int64 - if cluster.ThisNode.IsPrimary() { - px = 1 - } else { - px = 0 - } - clusterPrimary.Value(px) - cassWriters.Value(int64(*cassandraWriteConcurrency)) - cassWriteQueueSize.Value(int64(*cassandraWriteQueueSize)) - unix := uint32(now.Unix()) - if unix >= promotionReadyAtTs { - if promotionReadyAtTs == 0 { - // not set yet. operator should hold off - clusterPromoWait.Value(1) - } else { - clusterPromoWait.Value(0) - } - } else { - clusterPromoWait.Value(int64(promotionReadyAtTs - unix)) - } - case promotionReadyAtTs = <-promotionReadyAtChan: - } - } - }() -} diff --git a/scripts/config/metrictank-docker.ini b/scripts/config/metrictank-docker.ini index ff14ed7742..3a1f7846bd 100644 --- a/scripts/config/metrictank-docker.ini +++ b/scripts/config/metrictank-docker.ini @@ -83,20 +83,13 @@ cassandra-username = cassandra # password for authentication cassandra-password = cassandra -## Profiling, instrumentation and logging ## +## Profiling and logging ## # see https://golang.org/pkg/runtime/#SetBlockProfileRate block-profile-rate = 0 # 0 to disable. 1 for max precision (expensive!) see https://golang.org/pkg/runtime/#pkg-variables") mem-profile-rate = 524288 # 512*1024 -# enable sending statsd messages for instrumentation -statsd-enabled = true -# statsd address -statsd-addr = statsdaemon:8125 -# standard or datadog -statsd-type = standard - # inspect status frequency. set to 0 to disable proftrigger-freq = 60s # path to store triggered profiles @@ -110,6 +103,23 @@ proftrigger-heap-thresh = 25000000000 # only log log-level and higher. 
0=TRACE|1=DEBUG|2=INFO|3=WARN|4=ERROR|5=CRITICAL|6=FATAL log-level = 2 +# instrumentation stats +[stats] +# enable sending graphite messages for instrumentation +enabled = true +# stats prefix (will add trailing dot automatically if needed) +# The default matches what the Grafana dashboard expects +# $instance will be replaced with the `instance` setting. +# note, the 3rd word describes the environment you deployed in. +prefix = metrictank.stats.docker-env.$instance +# graphite address +addr = localhost:2003 +# interval at which to send statistics +interval = 1 +# how many messages (holding all measurements from one interval. rule of thumb: a message is ~25kB) to buffer up in case graphite endpoint is unavailable. +# With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed +buffer-size = 20000 + ## http api ## [http] # tcp address for metrictank to bind to for its HTTP interface diff --git a/scripts/config/metrictank-package.ini b/scripts/config/metrictank-package.ini index 3342e34f22..e9d9314018 100644 --- a/scripts/config/metrictank-package.ini +++ b/scripts/config/metrictank-package.ini @@ -70,20 +70,13 @@ cassandra-retries = 0 # CQL protocol version. cassandra 3.x needs v3 or 4. cql-protocol-version = 4 -## Profiling, instrumentation and logging ## +## Profiling and logging ## # see https://golang.org/pkg/runtime/#SetBlockProfileRate block-profile-rate = 0 # 0 to disable. 1 for max precision (expensive!) see https://golang.org/pkg/runtime/#pkg-variables") mem-profile-rate = 524288 # 512*1024 -# enable sending statsd messages for instrumentation -statsd-enabled = true -# statsd address -statsd-addr = localhost:8125 -# standard or datadog -statsd-type = standard - # inspect status frequency. set to 0 to disable proftrigger-freq = 60s # path to store triggered profiles @@ -97,6 +90,22 @@ proftrigger-heap-thresh = 25000000000 # only log log-level and higher. 0=TRACE|1=DEBUG|2=INFO|3=WARN|4=ERROR|5=CRITICAL|6=FATAL log-level = 2 +# instrumentation stats +[stats] +# enable sending graphite messages for instrumentation +enabled = true +# stats prefix (will add trailing dot automatically if needed) +# The default matches what the Grafana dashboard expects +# $instance will be replaced with the `instance` setting. +# note, the 3rd word describes the environment you deployed in. +prefix = metrictank.stats.default.$instance +# graphite address +addr = localhost:2003 +# interval at which to send statistics +interval = 1 +# how many messages (holding all measurements from one interval. rule of thumb: a message is ~25kB) to buffer up in case graphite endpoint is unavailable. 
+# With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed +buffer-size = 20000 ## http api ## [http] diff --git a/scripts/end2end_test.sh b/scripts/end2end_test.sh index 7888663a5b..4187759a52 100755 --- a/scripts/end2end_test.sh +++ b/scripts/end2end_test.sh @@ -4,7 +4,7 @@ set -x # debugging DOCKER_COMPOSE_VERSION="1.8.1" DOCKER_COMPOSE_EXEC="/tmp/docker-compose" -DOCKER_COMPOSE_FILE="docker/docker-standard/docker-compose.yml" +DOCKER_COMPOSE_FILE="docker/docker-dev/docker-compose.yml" # this is only necessary until Circle CI updates their images to provide a recent version curl -L "https://github.com/docker/compose/releases/download/$DOCKER_COMPOSE_VERSION/docker-compose-Linux-x86_64" > $DOCKER_COMPOSE_EXEC diff --git a/scripts/verify_metrics_received.py b/scripts/verify_metrics_received.py index 276e95e072..c0a69f1151 100755 --- a/scripts/verify_metrics_received.py +++ b/scripts/verify_metrics_received.py @@ -29,7 +29,7 @@ def error(msg): }, 'data': { 'target': - 'stats.docker-env.metrictank.default.carbon.metrics_received', + 'perSecond(metrictank.stats.docker-env.default.input.carbon.metrics_received.counter32)', }, } @@ -53,13 +53,14 @@ def error(msg): ) # verify the format and content of the response is as we expect it +# note : since we got a perSecond(), the first value is always null, we only use points 2 and onwards if ( len(parsed_result) < 1 or 'datapoints' not in parsed_result[0] or not all([len(x) >= 2 for x in parsed_result[0]['datapoints']]) or not all([ isinstance(x[0], (int, float)) - for x in parsed_result[0]['datapoints'] + for x in parsed_result[0]['datapoints'][1:] ]) ): error( @@ -67,7 +68,7 @@ def error(msg): .format(response=result.text) ) -datapoints = [float(x[0]) for x in parsed_result[0]['datapoints']] +datapoints = [float(x[0]) for x in parsed_result[0]['datapoints'][1:]] datapoints_avg = sum(datapoints)/len(datapoints) expected = float(sys.argv[4]) diff --git a/stats/bool.go b/stats/bool.go new file mode 100644 index 0000000000..bea2c154bb --- /dev/null +++ b/stats/bool.go @@ -0,0 +1,36 @@ +package stats + +import ( + "sync/atomic" + "time" +) + +type Bool struct { + val uint32 +} + +func NewBool(name string) *Bool { + return registry.add(name, &Bool{}).(*Bool) +} + +func (b *Bool) SetTrue() { + atomic.StoreUint32(&b.val, 1) +} + +func (b *Bool) SetFalse() { + atomic.StoreUint32(&b.val, 0) +} + +func (b *Bool) Set(val bool) { + if val { + b.SetTrue() + } else { + b.SetFalse() + } +} + +func (b *Bool) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + val := atomic.LoadUint32(&b.val) + buf = WriteUint32(buf, prefix, []byte("gauge1"), val, now) + return buf +} diff --git a/stats/config/init.go b/stats/config/init.go new file mode 100644 index 0000000000..db7c97ed73 --- /dev/null +++ b/stats/config/init.go @@ -0,0 +1,44 @@ +package config + +import ( + "flag" + "strings" + + "github.com/raintank/metrictank/stats" + "github.com/raintank/worldping-api/pkg/log" + "github.com/rakyll/globalconf" +) + +var enabled bool +var prefix string +var addr string +var interval int +var bufferSize int + +func ConfigSetup() { + inStats := flag.NewFlagSet("stats", flag.ExitOnError) + inStats.BoolVar(&enabled, "enabled", true, "enable sending graphite messages for instrumentation") + inStats.StringVar(&prefix, "prefix", "metrictank.stats.default.$instance", "stats prefix (will add trailing dot automatically if needed)") + inStats.StringVar(&addr, "addr", "localhost:2003", "graphite address") + inStats.IntVar(&interval, 
"interval", 1, "interval at which to send statistics") + inStats.IntVar(&bufferSize, "buffer-size", 20000, "how many messages (holding all measurements from one interval. rule of thumb: a message is ~25kB) to buffer up in case graphite endpoint is unavailable. With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed") + globalconf.Register("stats", inStats) +} + +func ConfigProcess(instance string) { + if !enabled { + return + } + // TODO validate tcp addr + prefix = strings.Replace(prefix, "$instance", instance, -1) +} + +func Start() { + if enabled { + stats.NewMemoryReporter() + stats.NewGraphite(prefix, addr, interval, bufferSize) + } else { + stats.NewDevnull() + log.Warn("running metrictank without instrumentation.") + } +} diff --git a/stats/counter32.go b/stats/counter32.go new file mode 100644 index 0000000000..304c25a412 --- /dev/null +++ b/stats/counter32.go @@ -0,0 +1,36 @@ +package stats + +import ( + "sync/atomic" + "time" +) + +type Counter32 struct { + val uint32 +} + +func NewCounter32(name string) *Counter32 { + return registry.add(name, &Counter32{}).(*Counter32) +} + +func (c *Counter32) SetUint32(val uint32) { + atomic.StoreUint32(&c.val, val) +} + +func (c *Counter32) Inc() { + atomic.AddUint32(&c.val, 1) +} + +func (c *Counter32) Add(val int) { + c.AddUint32(uint32(val)) +} + +func (c *Counter32) AddUint32(val uint32) { + atomic.AddUint32(&c.val, val) +} + +func (c *Counter32) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + val := atomic.LoadUint32(&c.val) + buf = WriteUint32(buf, prefix, []byte("counter32"), val, now) + return buf +} diff --git a/stats/counter64.go b/stats/counter64.go new file mode 100644 index 0000000000..7dd7a5f21f --- /dev/null +++ b/stats/counter64.go @@ -0,0 +1,32 @@ +package stats + +import ( + "sync/atomic" + "time" +) + +type Counter64 struct { + val uint64 +} + +func NewCounter64(name string) *Counter64 { + return registry.add(name, &Counter64{}).(*Counter64) +} + +func (c *Counter64) SetUint64(val uint64) { + atomic.StoreUint64(&c.val, val) +} + +func (c *Counter64) Inc() { + atomic.AddUint64(&c.val, 1) +} + +func (c *Counter64) AddUint64(val uint64) { + atomic.AddUint64(&c.val, val) +} + +func (c *Counter64) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + val := atomic.LoadUint64(&c.val) + buf = WriteUint64(buf, prefix, []byte("counter64"), val, now) + return buf +} diff --git a/stats/gauge32.go b/stats/gauge32.go new file mode 100644 index 0000000000..9399e06dc8 --- /dev/null +++ b/stats/gauge32.go @@ -0,0 +1,56 @@ +package stats + +import ( + "sync/atomic" + "time" +) + +type Gauge32 struct { + val uint32 +} + +func NewGauge32(name string) *Gauge32 { + return registry.add(name, &Gauge32{}).(*Gauge32) +} + +func (g *Gauge32) Inc() { + atomic.AddUint32(&g.val, 1) +} + +func (g *Gauge32) Dec() { + atomic.AddUint32(&g.val, ^uint32(0)) +} + +func (g *Gauge32) AddUint32(val uint32) { + atomic.AddUint32(&g.val, val) +} + +func (g *Gauge32) DecUint32(val uint32) { + atomic.AddUint32(&g.val, ^uint32(val-1)) +} + +func (g *Gauge32) Add(val int) { + if val == 0 { + return + } + if val > 0 { + g.AddUint32(uint32(val)) + return + } + // < 0 + g.DecUint32(uint32(-1 * val)) +} + +func (g *Gauge32) Set(val int) { + atomic.StoreUint32(&g.val, uint32(val)) +} + +func (g *Gauge32) SetUint32(val uint32) { + atomic.StoreUint32(&g.val, val) +} + +func (g *Gauge32) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + val := atomic.LoadUint32(&g.val) + buf = WriteUint32(buf, prefix, 
[]byte("gauge32"), val, now) + return buf +} diff --git a/stats/gauge64.go b/stats/gauge64.go new file mode 100644 index 0000000000..789eae085e --- /dev/null +++ b/stats/gauge64.go @@ -0,0 +1,56 @@ +package stats + +import ( + "sync/atomic" + "time" +) + +type Gauge64 struct { + val uint64 +} + +func NewGauge64(name string) *Gauge64 { + return registry.add(name, &Gauge64{}).(*Gauge64) +} + +func (g *Gauge64) Inc() { + atomic.AddUint64(&g.val, 1) +} + +func (g *Gauge64) Dec() { + atomic.AddUint64(&g.val, ^uint64(0)) +} + +func (g *Gauge64) AddUint64(val uint64) { + atomic.AddUint64(&g.val, val) +} + +func (g *Gauge64) DecUint64(val uint64) { + atomic.AddUint64(&g.val, ^uint64(val-1)) +} + +func (g *Gauge64) Add(val int) { + if val == 0 { + return + } + if val > 0 { + g.AddUint64(uint64(val)) + return + } + // < 0 + g.DecUint64(uint64(-1 * val)) +} + +func (g *Gauge64) Set(val int) { + atomic.StoreUint64(&g.val, uint64(val)) +} + +func (g *Gauge64) SetUint64(val uint64) { + atomic.StoreUint64(&g.val, val) +} + +func (g *Gauge64) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + val := atomic.LoadUint64(&g.val) + buf = WriteUint64(buf, prefix, []byte("gauge64"), val, now) + return buf +} diff --git a/stats/init.go b/stats/init.go new file mode 100644 index 0000000000..43ee66d26d --- /dev/null +++ b/stats/init.go @@ -0,0 +1,20 @@ +// Package stats provides functionality for instrumenting metrics and reporting them +// +// The metrics can be user specified, or sourced from the runtime (reporters) +// To use this package correctly, you must instantiate exactly 1 output. +// If you use 0 outputs, certain metrics type will accumulate data unboundedly +// (e.g. histograms and meters) resulting in unreasonable memory usage. +// (though you can ignore this for shortlived processes, unit tests, etc) +// If you use >1 outputs, then each will only see a partial view of the stats. +// Currently supported outputs are DevNull and Graphite +package stats + +var registry *Registry + +func init() { + registry = NewRegistry() +} + +func Clear() { + registry.Clear() +} diff --git a/stats/latencyhistogram12h32.go b/stats/latencyhistogram12h32.go new file mode 100644 index 0000000000..9185a2a39f --- /dev/null +++ b/stats/latencyhistogram12h32.go @@ -0,0 +1,44 @@ +package stats + +import ( + "time" + + "github.com/Dieterbe/artisanalhistogram/hist12h" +) + +// tracks latency measurements in a given range as 32 bit counters +type LatencyHistogram12h32 struct { + hist hist12h.Hist12h + since time.Time +} + +func NewLatencyHistogram12h32(name string) *LatencyHistogram12h32 { + return registry.add(name, &LatencyHistogram12h32{ + hist: hist12h.New(), + since: time.Now(), + }, + ).(*LatencyHistogram12h32) +} + +func (l *LatencyHistogram12h32) Value(t time.Duration) { + l.hist.AddDuration(t) +} + +func (l *LatencyHistogram12h32) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + snap := l.hist.Snapshot() + // TODO: once we can actually do cool stuff (e.g. 
visualize) histogram bucket data, report it + // for now, only report the summaries :( + r, ok := l.hist.Report(snap) + if ok { + buf = WriteUint32(buf, prefix, []byte("latency.min.gauge32"), r.Min/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.mean.gauge32"), r.Mean/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.median.gauge32"), r.Median/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.p75.gauge32"), r.P75/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.p90.gauge32"), r.P90/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.max.gauge32"), r.Max/1000, now) + } + buf = WriteUint32(buf, prefix, []byte("values.count32"), r.Count, now) + buf = WriteFloat64(buf, prefix, []byte("values.rate32"), float64(r.Count)/now.Sub(l.since).Seconds(), now) + l.since = now + return buf +} diff --git a/stats/latencyhistogram15s32.go b/stats/latencyhistogram15s32.go new file mode 100644 index 0000000000..aa44490a91 --- /dev/null +++ b/stats/latencyhistogram15s32.go @@ -0,0 +1,45 @@ +package stats + +import ( + "time" + + "github.com/Dieterbe/artisanalhistogram/hist15s" +) + +// tracks latency measurements in a given range as 32 bit counters +type LatencyHistogram15s32 struct { + hist hist15s.Hist15s + since time.Time +} + +func NewLatencyHistogram15s32(name string) *LatencyHistogram15s32 { + return registry.add(name, &LatencyHistogram15s32{ + hist: hist15s.New(), + since: time.Now(), + }, + ).(*LatencyHistogram15s32) +} + +func (l *LatencyHistogram15s32) Value(t time.Duration) { + l.hist.AddDuration(t) +} + +func (l *LatencyHistogram15s32) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + snap := l.hist.Snapshot() + // TODO: once we can actually do cool stuff (e.g. visualize) histogram bucket data, report it + // for now, only report the summaries :( + r, ok := l.hist.Report(snap) + if ok { + buf = WriteUint32(buf, prefix, []byte("latency.min.gauge32"), r.Min/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.mean.gauge32"), r.Mean/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.median.gauge32"), r.Median/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.p75.gauge32"), r.P75/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.p90.gauge32"), r.P90/1000, now) + buf = WriteUint32(buf, prefix, []byte("latency.max.gauge32"), r.Max/1000, now) + } + buf = WriteUint32(buf, prefix, []byte("values.count32"), r.Count, now) + buf = WriteFloat64(buf, prefix, []byte("values.rate32"), float64(r.Count)/now.Sub(l.since).Seconds(), now) + + l.since = now + return buf +} diff --git a/stats/memory_reporter.go b/stats/memory_reporter.go new file mode 100644 index 0000000000..04eeeb5722 --- /dev/null +++ b/stats/memory_reporter.go @@ -0,0 +1,47 @@ +package stats + +import ( + "runtime" + "time" +) + +// MemoryReporter sources memory stats from the runtime and reports them +type MemoryReporter struct { + mem runtime.MemStats + gcCyclesTotal uint32 +} + +func NewMemoryReporter() *MemoryReporter { + return registry.add("memory", &MemoryReporter{}).(*MemoryReporter) +} + +func (m *MemoryReporter) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + runtime.ReadMemStats(&m.mem) + + // metric memory.total_bytes_allocated is a counter of total amount of bytes allocated during process lifetime + buf = WriteUint64(buf, prefix, []byte("total_bytes_allocated.counter64"), m.mem.TotalAlloc, now) + + // metric memory.bytes.allocated_in_heap is a gauge of currently allocated (within the runtime) memory. 
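+ // it does not include freed data so it drops at every GC run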
+ buf = WriteUint64(buf, prefix, []byte("bytes.allocated_in_heap.gauge64"), m.mem.Alloc, now) + + // metric memory.bytes.obtained_from_sys is the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at. + buf = WriteUint64(buf, prefix, []byte("bytes.obtained_from_sys.gauge64"), m.mem.Sys, now) + + // metric memory.total_gc_cycles is a counter of the number of GC cycles since process start + buf = WriteUint32(buf, prefix, []byte("total_gc_cycles.counter64"), m.mem.NumGC, now) + + // metric memory.gc.cpu_fraction is how much cpu is consumed by the GC across process lifetime, in pro-mille + buf = WriteUint32(buf, prefix, []byte("gc.cpu_fraction.gauge32"), uint32(1000*m.mem.GCCPUFraction), now) + + // metric memory.gc.heap_objects is how many objects are allocated on the heap, it's a key indicator for GC workload + buf = WriteUint64(buf, prefix, []byte("gc.heap_objects.gauge64"), m.mem.HeapObjects, now) + + // there was no new GC run, we should only report points to represent actual runs + if m.gcCyclesTotal != m.mem.NumGC { + // metric memory.gc.last_duration is the duration of the last GC STW pause in nanoseconds + buf = WriteUint64(buf, prefix, []byte("gc.last_duration.gauge64"), m.mem.PauseNs[(m.mem.NumGC+255)%256], now) + m.gcCyclesTotal = m.mem.NumGC + } + + return buf +} diff --git a/stats/meter32.go b/stats/meter32.go new file mode 100644 index 0000000000..02ca7b9775 --- /dev/null +++ b/stats/meter32.go @@ -0,0 +1,117 @@ +package stats + +import ( + "math" + "sort" + "sync" + "time" + + "github.com/dgryski/go-linlog" +) + +// meter maintains a histogram, from which it reports summary statistics such as quantiles. +// you can choose to approximate, in which case it uses linear-log bucketed class boundaries +// (we could also report the bins but right now we wouldn't be able to do anything with them +// and also the boundaries are powers of two which is a bit weird) + +type Meter32 struct { + approx bool + + sync.Mutex + hist map[uint32]uint32 + min uint32 + max uint32 + count uint32 + since time.Time +} + +func NewMeter32(name string, approx bool) *Meter32 { + return registry.add(name, &Meter32{ + approx: approx, + hist: make(map[uint32]uint32), + min: math.MaxUint32, + since: time.Now(), + }, + ).(*Meter32) +} + +func (m *Meter32) clear() { + m.hist = make(map[uint32]uint32) + m.min = math.MaxUint32 + m.max = 0 + m.count = 0 +} + +func (m *Meter32) Value(val int) { + m.ValueUint32(uint32(val)) +} + +func (m *Meter32) ValueUint32(val uint32) { + bin := val + if m.approx { + // subbin log2(16)= 4 -> up to 100/16 = 6.25% error I think + // in practice it's max about 12% but anyway. 
+ tmp, _ := linlog.BinOf(uint64(val), 4, 2) + bin = uint32(tmp) + } + m.Lock() + if val < m.min { + m.min = val + } + + if val > m.max { + m.max = val + } + m.hist[bin]++ + m.count += 1 + m.Unlock() +} + +func (m *Meter32) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + m.Lock() + if m.count == 0 { + m.Unlock() + return buf + } + keys := make([]int, 0, len(m.hist)) + for k := range m.hist { + keys = append(keys, int(k)) + } + sort.Ints(keys) + + quantiles := []struct { + p float64 + str string + }{ + {0.50, "median.gauge32"}, + {0.75, "p75.gauge32"}, + {0.90, "p90.gauge32"}, + } + + pidx := 0 + runningcount := uint32(0) + runningsum := uint64(0) + + for _, k := range keys { + key := uint32(k) + runningcount += m.hist[key] + runningsum += uint64(m.hist[key]) * uint64(key) + p := float64(runningcount) / float64(m.count) + for pidx < len(quantiles) && quantiles[pidx].p <= p { + buf = WriteUint32(buf, prefix, []byte(quantiles[pidx].str), key, now) + pidx++ + } + } + + buf = WriteUint32(buf, prefix, []byte("min.gauge32"), m.min, now) + buf = WriteUint32(buf, prefix, []byte("mean.gauge32"), uint32(runningsum/uint64(m.count)), now) + buf = WriteUint32(buf, prefix, []byte("max.gauge32"), m.max, now) + buf = WriteUint32(buf, prefix, []byte("values.count32"), m.count, now) + buf = WriteFloat64(buf, prefix, []byte("values.rate32"), float64(m.count)/now.Sub(m.since).Seconds(), now) + m.since = now + + m.clear() + m.Unlock() + + return buf +} diff --git a/stats/out_devnull.go b/stats/out_devnull.go new file mode 100644 index 0000000000..6dfc135bd7 --- /dev/null +++ b/stats/out_devnull.go @@ -0,0 +1,17 @@ +package stats + +import ( + "time" +) + +func NewDevnull() { + go func() { + ticker := tick(time.Second) + buf := make([]byte, 0) + for now := range ticker { + for _, metric := range registry.list() { + metric.ReportGraphite(nil, buf[:], now) + } + } + }() +} diff --git a/stats/out_graphite.go b/stats/out_graphite.go new file mode 100644 index 0000000000..5a3eecd9b5 --- /dev/null +++ b/stats/out_graphite.go @@ -0,0 +1,125 @@ +package stats + +import ( + "bytes" + "net" + "time" + + "github.com/raintank/worldping-api/pkg/log" +) + +var ( + queueItems *Range32 + genDataDuration *Gauge32 + flushDuration *LatencyHistogram15s32 + messageSize *Gauge32 + connected *Bool +) + +type GraphiteMetric interface { + // Report the measurements in graphite format and reset measurements for the next interval if needed + ReportGraphite(prefix []byte, buf []byte, now time.Time) []byte +} + +type Graphite struct { + prefix []byte + addr string + prefixCounter32 string + prefixGauge32 string + prefixGauge64 string + prefixRange32 string + prefixLatencyHistogram32 string + + toGraphite chan []byte +} + +func NewGraphite(prefix, addr string, interval int, bufferSize int) { + if len(prefix) != 0 && prefix[len(prefix)-1] != '.' { + prefix = prefix + "." 
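+ // e.g. the default prefix "metrictank.stats.default.$instance" from the [stats] config section gains its trailing dot here, separating it from the metric names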
+ } + NewGauge32("stats.graphite.write_queue.size").Set(bufferSize) + queueItems = NewRange32("stats.graphite.write_queue.items") + // metric stats.generate_message.duration is how long it takes to generate the stats + genDataDuration = NewGauge32("stats.generate_message.duration") + flushDuration = NewLatencyHistogram15s32("stats.graphite.flush") + messageSize = NewGauge32("stats.message_size") + connected = NewBool("stats.graphite.connected") + + g := &Graphite{ + prefix: []byte(prefix), + addr: addr, + toGraphite: make(chan []byte, bufferSize), + } + go g.writer() + go g.reporter(interval) +} + +func (g *Graphite) reporter(interval int) { + ticker := tick(time.Duration(interval) * time.Second) + for now := range ticker { + log.Debug("stats flushing for", now, "to graphite") + queueItems.Value(len(g.toGraphite)) + if cap(g.toGraphite) != 0 && len(g.toGraphite) == cap(g.toGraphite) { + // no space in buffer, no use in doing any work + continue + } + + pre := time.Now() + + buf := make([]byte, 0) + + var fullPrefix bytes.Buffer + for name, metric := range registry.list() { + fullPrefix.Reset() + fullPrefix.Write(g.prefix) + fullPrefix.WriteString(name) + fullPrefix.WriteRune('.') + buf = metric.ReportGraphite(fullPrefix.Bytes(), buf, now) + } + + genDataDuration.Set(int(time.Since(pre).Nanoseconds())) + messageSize.Set(len(buf)) + g.toGraphite <- buf + queueItems.Value(len(g.toGraphite)) + } +} + +// writer is the background worker that connects to graphite and submits all pending data to it +// TODO: conn.Write() returns no error for a while when the remote endpoint is down, the reconnect happens with a delay. this can also cause lost data for a second or two. +func (g *Graphite) writer() { + var conn net.Conn + var err error + + assureConn := func() net.Conn { + connected.Set(conn != nil) + for conn == nil { + time.Sleep(time.Second) + conn, err = net.Dial("tcp", g.addr) + if err == nil { + log.Info("stats now connected to %s", g.addr) + } else { + log.Warn("stats dialing %s failed: %s. will retry", g.addr, err.Error()) + } + connected.Set(conn != nil) + } + return conn + } + + for buf := range g.toGraphite { + queueItems.Value(len(g.toGraphite)) + var ok bool + for !ok { + conn = assureConn() + pre := time.Now() + _, err = conn.Write(buf) + if err == nil { + ok = true + flushDuration.Value(time.Since(pre)) + } else { + log.Warn("stats failed to write to graphite: %s (took %s). 
will retry...", err, time.Now().Sub(pre)) + conn.Close() + conn = nil + } + } + } +} diff --git a/stats/range32.go b/stats/range32.go new file mode 100644 index 0000000000..c5bbd6d36f --- /dev/null +++ b/stats/range32.go @@ -0,0 +1,56 @@ +package stats + +import ( + "math" + "sync" + "time" +) + +// Range32 computes the min and max of sets of numbers, as 32bit numbers +// example application: queue depths +// min lets you see if the queue is able to drain +// max lets you see how large the queue tends to grow +// concurrency-safe +type Range32 struct { + sync.Mutex + min uint32 + max uint32 + valid bool // whether any values have been seen +} + +func NewRange32(name string) *Range32 { + return registry.add(name, &Range32{ + min: math.MaxUint32, + }, + ).(*Range32) +} + +func (r *Range32) Value(val int) { + r.ValueUint32(uint32(val)) +} + +func (r *Range32) ValueUint32(val uint32) { + r.Lock() + if val < r.min { + r.min = val + } + if val > r.max { + r.max = val + } + r.valid = true + r.Unlock() +} + +func (r *Range32) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + r.Lock() + // if no values were seen, don't report anything to graphite + if r.valid { + buf = WriteUint32(buf, prefix, []byte("min.gauge32"), r.min, now) + buf = WriteUint32(buf, prefix, []byte("max.gauge32"), r.max, now) + r.min = math.MaxUint32 + r.max = 0 + r.valid = false + } + r.Unlock() + return buf +} diff --git a/stats/registry.go b/stats/registry.go new file mode 100644 index 0000000000..e1851295a7 --- /dev/null +++ b/stats/registry.go @@ -0,0 +1,50 @@ +package stats + +import ( + "fmt" + "sync" +) + +var errFmtMetricExists = "fatal: metric %q already exists" + +// Registry tracks metrics and reporters +type Registry struct { + sync.Mutex + // here we use just the metric name as key. it does not include any prefix + // this means that technically we're more strict here then needed around naming conflicts: + // it's possible to have metrics with the same name, but a different prefix. 
+ // we still complain about that + metrics map[string]GraphiteMetric +} + +func NewRegistry() *Registry { + return &Registry{ + metrics: make(map[string]GraphiteMetric), + } +} + +func (r *Registry) add(name string, metric GraphiteMetric) GraphiteMetric { + r.Lock() + if _, ok := r.metrics[name]; ok { + panic(fmt.Sprintf(errFmtMetricExists, name)) + } + r.metrics[name] = metric + r.Unlock() + return metric +} + +func (r *Registry) list() map[string]GraphiteMetric { + metrics := make(map[string]GraphiteMetric) + r.Lock() + for name, metric := range r.metrics { + metrics[name] = metric + } + r.Unlock() + return metrics +} + +func (r *Registry) Clear() { + r.Lock() + r.metrics = make(map[string]GraphiteMetric) + r.Unlock() +} diff --git a/stats/tick.go b/stats/tick.go new file mode 100644 index 0000000000..b6bcfb1f7b --- /dev/null +++ b/stats/tick.go @@ -0,0 +1,24 @@ +package stats + +import "time" + +// provides "clean" ticks at precise intervals, and delivers them shortly after +func tick(period time.Duration) chan time.Time { + ch := make(chan time.Time) + go func() { + for { + now := time.Now() + nowUnix := now.UnixNano() + diff := period - (time.Duration(nowUnix) % period) + ideal := now.Add(diff) + time.Sleep(diff) + + // try to write, if it blocks, skip the tick + select { + case ch <- ideal: + default: + } + } + }() + return ch +} diff --git a/stats/timediff_reporter.go b/stats/timediff_reporter.go new file mode 100644 index 0000000000..db8ccdbfb9 --- /dev/null +++ b/stats/timediff_reporter.go @@ -0,0 +1,34 @@ +package stats + +import ( + "sync/atomic" + "time" +) + +// reports the time in seconds until a specific timestamp is reached +// once reached, reports 0 +type TimeDiffReporter32 struct { + target uint32 +} + +func NewTimeDiffReporter32(name string, target uint32) *TimeDiffReporter32 { + return registry.add(name, &TimeDiffReporter32{ + target: target, + }, + ).(*TimeDiffReporter32) +} + +func (g *TimeDiffReporter32) Set(target uint32) { + atomic.StoreUint32(&g.target, target) +} + +func (g *TimeDiffReporter32) ReportGraphite(prefix, buf []byte, now time.Time) []byte { + target := atomic.LoadUint32(&g.target) + now32 := uint32(now.Unix()) + report := uint32(0) + if now32 < target { + report = target - now32 + } + buf = WriteUint32(buf, prefix, []byte("gauge32"), report, now) + return buf +} diff --git a/stats/write.go b/stats/write.go new file mode 100644 index 0000000000..1c3c071461 --- /dev/null +++ b/stats/write.go @@ -0,0 +1,36 @@ +package stats + +import ( + "strconv" + "time" +) + +func WriteFloat64(buf, prefix, key []byte, val float64, now time.Time) []byte { + buf = append(buf, prefix...) + buf = append(buf, key...) + buf = append(buf, ' ') + buf = strconv.AppendFloat(buf, val, 'f', -1, 64) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, now.Unix(), 10) + return append(buf, '\n') +} + +func WriteUint32(buf, prefix, key []byte, val uint32, now time.Time) []byte { + buf = append(buf, prefix...) + buf = append(buf, key...) + buf = append(buf, ' ') + buf = strconv.AppendUint(buf, uint64(val), 10) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, now.Unix(), 10) + return append(buf, '\n') +} + +func WriteUint64(buf, prefix, key []byte, val uint64, now time.Time) []byte { + buf = append(buf, prefix...) + buf = append(buf, key...) 
+ buf = append(buf, ' ') + buf = strconv.AppendUint(buf, val, 10) + buf = append(buf, ' ') + buf = strconv.AppendInt(buf, now.Unix(), 10) + return append(buf, '\n') +} diff --git a/usage/usage_test.go b/usage/usage_test.go index 7522de4cc7..740f478038 100644 --- a/usage/usage_test.go +++ b/usage/usage_test.go @@ -6,7 +6,6 @@ import ( "time" "github.com/benbjohnson/clock" - "github.com/raintank/met/helper" "github.com/raintank/metrictank/consolidation" "github.com/raintank/metrictank/idx/memory" "github.com/raintank/metrictank/iter" @@ -99,10 +98,8 @@ func assert(interval uint32, epoch int, aggmetrics *FakeAggMetrics, org int, met func TestUsageBasic(t *testing.T) { mock := clock.NewMock() aggmetrics := NewFakeAggMetrics() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - mdata.InitMetrics(stats) metricIndex := memory.New() - metricIndex.Init(stats) + metricIndex.Init() interval := uint32(60) u := New(interval, aggmetrics, metricIndex, mock) @@ -138,10 +135,8 @@ func TestUsageBasic(t *testing.T) { func TestUsageMinusOne(t *testing.T) { mock := clock.NewMock() aggmetrics := NewFakeAggMetrics() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - mdata.InitMetrics(stats) metricIndex := memory.New() - metricIndex.Init(stats) + metricIndex.Init() interval := uint32(60) u := New(interval, aggmetrics, metricIndex, mock) @@ -171,10 +166,8 @@ func TestUsageMinusOne(t *testing.T) { func TestUsageWrap32(t *testing.T) { mock := clock.NewMock() aggmetrics := NewFakeAggMetrics() - stats, _ := helper.New(false, "", "standard", "metrictank", "") - mdata.InitMetrics(stats) metricIndex := memory.New() - metricIndex.Init(stats) + metricIndex.Init() interval := uint32(60) u := New(interval, aggmetrics, metricIndex, mock) diff --git a/vendor/github.com/DataDog/datadog-go/statsd/README.md b/vendor/github.com/DataDog/datadog-go/statsd/README.md deleted file mode 100644 index c3b462f85c..0000000000 --- a/vendor/github.com/DataDog/datadog-go/statsd/README.md +++ /dev/null @@ -1,45 +0,0 @@ -## Overview - -Package `statsd` provides a Go [dogstatsd](http://docs.datadoghq.com/guides/dogstatsd/) client. Dogstatsd extends Statsd, adding tags -and histograms. - -## Get the code - - $ go get github.com/DataDog/datadog-go/statsd - -## Usage - -```go -// Create the client -c, err := statsd.New("127.0.0.1:8125") -if err != nil { - log.Fatal(err) -} -// Prefix every metric with the app name -c.Namespace = "flubber." -// Send the EC2 availability zone as a tag with every metric -c.Tags = append(c.Tags, "us-east-1a") -err = c.Gauge("request.duration", 1.2, nil, 1) -``` - -## Buffering Client - -Dogstatsd accepts packets with multiple statsd payloads in them. Using the BufferingClient via `NewBufferingClient` will buffer up commands and send them when the buffer is reached or after 100msec. - -## Development - -Run the tests with: - - $ go test - -## Documentation - -Please see: http://godoc.org/github.com/DataDog/datadog-go/statsd - -## License - -go-dogstatsd is released under the [MIT license](http://www.opensource.org/licenses/mit-license.php). - -## Credits - -Original code by [ooyala](https://github.com/ooyala/go-dogstatsd). diff --git a/vendor/github.com/DataDog/datadog-go/statsd/statsd.go b/vendor/github.com/DataDog/datadog-go/statsd/statsd.go deleted file mode 100644 index 4038b890ac..0000000000 --- a/vendor/github.com/DataDog/datadog-go/statsd/statsd.go +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2013 Ooyala, Inc. - -/* -Package statsd provides a Go dogstatsd client. 
Dogstatsd extends the popular statsd, -adding tags and histograms and pushing upstream to Datadog. - -Refer to http://docs.datadoghq.com/guides/dogstatsd/ for information about DogStatsD. - -Example Usage: - - // Create the client - c, err := statsd.New("127.0.0.1:8125") - if err != nil { - log.Fatal(err) - } - // Prefix every metric with the app name - c.Namespace = "flubber." - // Send the EC2 availability zone as a tag with every metric - c.Tags = append(c.Tags, "us-east-1a") - err = c.Gauge("request.duration", 1.2, nil, 1) - -statsd is based on go-statsd-client. -*/ -package statsd - -import ( - "bytes" - "fmt" - "math/rand" - "net" - "strconv" - "strings" - "sync" - "time" -) - -// A Client is a handle for sending udp messages to dogstatsd. It is safe to -// use one Client from multiple goroutines simultaneously. -type Client struct { - conn net.Conn - // Namespace to prepend to all statsd calls - Namespace string - // Tags are global tags to be added to every statsd call - Tags []string - // BufferLength is the length of the buffer in commands. - bufferLength int - flushTime time.Duration - commands []string - stop bool - sync.Mutex -} - -// New returns a pointer to a new Client given an addr in the format "hostname:port". -func New(addr string) (*Client, error) { - udpAddr, err := net.ResolveUDPAddr("udp", addr) - if err != nil { - return nil, err - } - conn, err := net.DialUDP("udp", nil, udpAddr) - if err != nil { - return nil, err - } - client := &Client{conn: conn} - return client, nil -} - -// NewBuffered returns a Client that buffers its output and sends it in chunks. -// Buflen is the length of the buffer in number of commands. -func NewBuffered(addr string, buflen int) (*Client, error) { - client, err := New(addr) - if err != nil { - return nil, err - } - client.bufferLength = buflen - client.commands = make([]string, 0, buflen) - client.flushTime = time.Millisecond * 100 - go client.watch() - return client, nil -} - -// format a message from its name, value, tags and rate. Also adds global -// namespace and tags. -func (c *Client) format(name, value string, tags []string, rate float64) string { - var buf bytes.Buffer - if c.Namespace != "" { - buf.WriteString(c.Namespace) - } - buf.WriteString(name) - buf.WriteString(":") - buf.WriteString(value) - if rate < 1 { - buf.WriteString(`|@`) - buf.WriteString(strconv.FormatFloat(rate, 'f', -1, 64)) - } - - tags = append(c.Tags, tags...) - if len(tags) > 0 { - buf.WriteString("|#") - buf.WriteString(tags[0]) - for _, tag := range tags[1:] { - buf.WriteString(",") - buf.WriteString(tag) - } - } - return buf.String() -} - -func (c *Client) watch() { - for _ = range time.Tick(c.flushTime) { - if c.stop { - return - } - c.Lock() - if len(c.commands) > 0 { - // FIXME: eating error here - c.flush() - } - c.Unlock() - } -} - -func (c *Client) append(cmd string) error { - c.Lock() - c.commands = append(c.commands, cmd) - // if we should flush, lets do it - if len(c.commands) == c.bufferLength { - if err := c.flush(); err != nil { - c.Unlock() - return err - } - } - c.Unlock() - return nil -} - -// flush the commands in the buffer. Lock must be held by caller. 
-func (c *Client) flush() error { - data := strings.Join(c.commands, "\n") - _, err := c.conn.Write([]byte(data)) - // clear the slice with a slice op, doesn't realloc - c.commands = c.commands[:0] - return err -} - -func (c *Client) sendMsg(msg string) error { - // if this client is buffered, then we'll just append this - if c.bufferLength > 0 { - return c.append(msg) - } - c.Lock() - _, err := c.conn.Write([]byte(msg)) - c.Unlock() - return err -} - -// send handles sampling and sends the message over UDP. It also adds global namespace prefixes and tags. -func (c *Client) send(name, value string, tags []string, rate float64) error { - if c == nil { - return nil - } - if rate < 1 && rand.Float64() > rate { - return nil - } - data := c.format(name, value, tags, rate) - return c.sendMsg(data) -} - -// Gauge measures the value of a metric at a particular time. -func (c *Client) Gauge(name string, value float64, tags []string, rate float64) error { - stat := fmt.Sprintf("%f|g", value) - return c.send(name, stat, tags, rate) -} - -// Count tracks how many times something happened per second. -func (c *Client) Count(name string, value int64, tags []string, rate float64) error { - stat := fmt.Sprintf("%d|c", value) - return c.send(name, stat, tags, rate) -} - -// Histogram tracks the statistical distribution of a set of values. -func (c *Client) Histogram(name string, value float64, tags []string, rate float64) error { - stat := fmt.Sprintf("%f|h", value) - return c.send(name, stat, tags, rate) -} - -// Set counts the number of unique elements in a group. -func (c *Client) Set(name string, value string, tags []string, rate float64) error { - stat := fmt.Sprintf("%s|s", value) - return c.send(name, stat, tags, rate) -} - -// TimeInMilliseconds sends timing information in milliseconds. -// It is flushed by statsd with percentiles, mean and other info (https://github.com/etsy/statsd/blob/master/docs/metric_types.md#timing) -func (c *Client) TimeInMilliseconds(name string, value float64, tags []string, rate float64) error { - stat := fmt.Sprintf("%f|ms", value) - return c.send(name, stat, tags, rate) -} - -// Event sends the provided Event. -func (c *Client) Event(e *Event) error { - stat, err := e.Encode(c.Tags...) - if err != nil { - return err - } - return c.sendMsg(stat) -} - -// SimpleEvent sends an event with the provided title and text. -func (c *Client) SimpleEvent(title, text string) error { - e := NewEvent(title, text) - return c.Event(e) -} - -// Close the client connection. -func (c *Client) Close() error { - if c == nil { - return nil - } - c.stop = true - return c.conn.Close() -} - -// Events support - -type eventAlertType string - -const ( - // Info is the "info" AlertType for events - Info eventAlertType = "info" - // Error is the "error" AlertType for events - Error eventAlertType = "error" - // Warning is the "warning" AlertType for events - Warning eventAlertType = "warning" - // Success is the "success" AlertType for events - Success eventAlertType = "success" -) - -type eventPriority string - -const ( - // Normal is the "normal" Priority for events - Normal eventPriority = "normal" - // Low is the "low" Priority for events - Low eventPriority = "low" -) - -// An Event is an object that can be posted to your DataDog event stream. -type Event struct { - // Title of the event. Required. - Title string - // Text is the description of the event. Required. - Text string - // Timestamp is a timestamp for the event. 
If not provided, the dogstatsd - // server will set this to the current time. - Timestamp time.Time - // Hostname for the event. - Hostname string - // AggregationKey groups this event with others of the same key. - AggregationKey string - // Priority of the event. Can be statsd.Low or statsd.Normal. - Priority eventPriority - // SourceTypeName is a source type for the event. - SourceTypeName string - // AlertType can be statsd.Info, statsd.Error, statsd.Warning, or statsd.Success. - // If absent, the default value applied by the dogstatsd server is Info. - AlertType eventAlertType - // Tags for the event. - Tags []string -} - -// NewEvent creates a new event with the given title and text. Error checking -// against these values is done at send-time, or upon running e.Check. -func NewEvent(title, text string) *Event { - return &Event{ - Title: title, - Text: text, - } -} - -// Check verifies that an event is valid. -func (e Event) Check() error { - if len(e.Title) == 0 { - return fmt.Errorf("statsd.Event title is required") - } - if len(e.Text) == 0 { - return fmt.Errorf("statsd.Event text is required") - } - return nil -} - -// Encode returns the dogstatsd wire protocol representation for an event. -// Tags may be passed which will be added to the encoded output but not to -// the Event's list of tags, eg. for default tags. -func (e Event) Encode(tags ...string) (string, error) { - err := e.Check() - if err != nil { - return "", err - } - var buffer bytes.Buffer - buffer.WriteString("_e{") - buffer.WriteString(strconv.FormatInt(int64(len(e.Title)), 10)) - buffer.WriteRune(',') - buffer.WriteString(strconv.FormatInt(int64(len(e.Text)), 10)) - buffer.WriteString("}:") - buffer.WriteString(e.Title) - buffer.WriteRune('|') - buffer.WriteString(e.Text) - - if !e.Timestamp.IsZero() { - buffer.WriteString("|d:") - buffer.WriteString(strconv.FormatInt(int64(e.Timestamp.Unix()), 10)) - } - - if len(e.Hostname) != 0 { - buffer.WriteString("|h:") - buffer.WriteString(e.Hostname) - } - - if len(e.AggregationKey) != 0 { - buffer.WriteString("|k:") - buffer.WriteString(e.AggregationKey) - - } - - if len(e.Priority) != 0 { - buffer.WriteString("|p:") - buffer.WriteString(string(e.Priority)) - } - - if len(e.SourceTypeName) != 0 { - buffer.WriteString("|s:") - buffer.WriteString(e.SourceTypeName) - } - - if len(e.AlertType) != 0 { - buffer.WriteString("|t:") - buffer.WriteString(string(e.AlertType)) - } - - if len(tags)+len(e.Tags) > 0 { - all := make([]string, 0, len(tags)+len(e.Tags)) - all = append(all, tags...) - all = append(all, e.Tags...) 
- buffer.WriteString("|#") - buffer.WriteString(all[0]) - for _, tag := range all[1:] { - buffer.WriteString(",") - buffer.WriteString(tag) - } - } - - return buffer.String(), nil -} diff --git a/vendor/github.com/Dieterbe/artisanalhistogram/hist12h/hist12h.go b/vendor/github.com/Dieterbe/artisanalhistogram/hist12h/hist12h.go new file mode 100644 index 0000000000..85835efc33 --- /dev/null +++ b/vendor/github.com/Dieterbe/artisanalhistogram/hist12h/hist12h.go @@ -0,0 +1,175 @@ +package hist12h + +import ( + "math" + "sync/atomic" + "time" +) + +const maxVal = uint32(86400000) // used to report max number as 1day even if it's higher + +// Hist12h is optimized for measurements between 500ms and 12h +type Hist12h struct { + limits [32]uint32 // in millis + counts [32]uint32 +} + +func New() Hist12h { + return Hist12h{ + limits: [32]uint32{ + 500, // 0 + 1000, // 1 + 2000, // 2 + 3000, // 3 + 5000, // 4 + 7500, // 5 + 10000, // 6 + 15000, // 7 + 20000, // 8 + 30000, // 9 + 45000, // 10 + 60000, // 11 // 1min + 90000, // 12 // 1.5min + 120000, // 13 // 2min + 180000, // 14 // 3min + 240000, // 15 // 4min + 300000, // 16 // 5min + 450000, // 17 // 7.5min + 600000, // 18 // 10min + 750000, // 19 // 12.5min + 900000, // 20 // 15min + 1200000, // 21 // 20min + 1800000, // 22 // 30min + 2700000, // 23 // 45min + 3600000, // 24 // 1hr + 7200000, // 25 // 2hr + 10800000, // 26 // 3h + 16200000, // 27 // 4.5h + 21600000, // 28 // 6h + 32400000, // 29 // 9h + 43200000, // 30 // 12h + math.MaxUint32, // 31 // to ease binary search, but will be reported as 1day + }, + } +} + +// searchBucket implements a binary search, to find the bucket i to insert val in, like so: +// limits[i-1] < val <= limits[i] +// if we can convince the go compiler to inline this we can get a 14~22% speedup (verified by manually patching it in) +// but we can't :( see https://github.com/golang/go/issues/17566 +// so for now, we just replicate this code in AddDuration below. make sure to keep the code in sync! +func searchBucket(limits [32]uint32, millis uint32) int { + min, i, max := 0, 16, 32 + for { + if millis <= limits[i] { + if i == 0 || millis > limits[i-1] { + return i + } + max = i + } else { + min = i + } + i = min + ((max - min) / 2) + } +} + +// adds to the right bucket with a copy of the searchBucket function above, to enforce inlining. +func (h *Hist12h) AddDuration(value time.Duration) { + // note: overflows at 1193 hours, but if you have values this high, + // you are definitely not using this histogram for the target use case. 
+ millis := uint32(value.Nanoseconds() / 1000000) + min, i, max := 0, 16, 32 + for { + if millis <= h.limits[i] { + if i == 0 || millis > h.limits[i-1] { + atomic.AddUint32(&h.counts[i], 1) + return + } + max = i + } else { + min = i + } + i = min + ((max - min) / 2) + } +} + +// Snapshot returns a snapshot of the data and resets internal state +func (h *Hist12h) Snapshot() []uint32 { + snap := make([]uint32, 32) + for i := 0; i < 32; i++ { + snap[i] = atomic.SwapUint32(&h.counts[i], 0) + } + return snap +} + +// if count is 0 then there was no input, so statistical summaries are invalid +type Report struct { + Min uint32 // in millis + Mean uint32 // in millis + Median uint32 // in millis + P75 uint32 // in millis + P90 uint32 // in millis + Max uint32 // in millis + Count uint32 +} + +// ok can be false for two reasons: +// (in either case, we can't compute the summaries) +// * the total count was 0 +// * the total count overflowed (but we set to 1 so you can tell the difference) +func (h *Hist12h) Report(data []uint32) (Report, bool) { + totalValue := uint64(0) + r := Report{} + for i, count := range data { + if count > 0 { + remainder := math.MaxUint32 - r.Count + if count > remainder { + r.Count = 1 + return r, false + } + limit := h.limits[i] + // special case, report as 1day + if i == 31 { + limit = maxVal + } + if r.Min == 0 { // this means we haven't found min yet. + r.Min = limit + } + r.Max = limit + r.Count += count + totalValue += uint64(count * limit) + } + } + if r.Count == 0 { + return r, false + } + r.Median = h.limits[Quantile(data, 0.50, r.Count)] + if r.Median == math.MaxUint32 { + r.Median = maxVal + } + r.P75 = h.limits[Quantile(data, 0.75, r.Count)] + if r.P75 == math.MaxUint32 { + r.P75 = maxVal + } + r.P90 = h.limits[Quantile(data, 0.90, r.Count)] + if r.P90 == math.MaxUint32 { + r.P90 = maxVal + } + r.Mean = uint32(totalValue / uint64(r.Count)) + return r, true +} + +// quantile q means what's the value v so that all q of the values have value <= v +// TODO since we typically get 99%, 99.9%, 75%, etc. it should be faster to count the other way +func Quantile(data []uint32, q float64, total uint32) int { + count := q * float64(total) + for i := 0; i < 32; i++ { + count -= float64(data[i]) + + if count <= 0 { + return i // we return the upper limit, real quantile would be less, but we can't make the result any better. 
+ } + } + + return -1 +} diff --git a/vendor/github.com/Dieterbe/artisanalhistogram/hist15s/hist15s.go b/vendor/github.com/Dieterbe/artisanalhistogram/hist15s/hist15s.go new file mode 100644 index 0000000000..4431a97ad1 --- /dev/null +++ b/vendor/github.com/Dieterbe/artisanalhistogram/hist15s/hist15s.go @@ -0,0 +1,175 @@ +package hist15s + +import ( + "math" + "sync/atomic" + "time" +) + +const maxVal = uint32(29999999) // used to report max number as 29s even if it's higher + +// Hist15s is optimized for measurements between 1ms and 15s +type Hist15s struct { + limits [32]uint32 // in micros + counts [32]uint32 +} + +func New() Hist15s { + return Hist15s{ + limits: [32]uint32{ + 1000, // 0 + 2000, // 1 + 3000, // 2 + 5000, // 3 + 7500, // 4 + 10000, // 5 + 15000, // 6 + 20000, // 7 + 30000, // 8 + 40000, // 9 + 50000, // 10 + 65000, // 11 + 80000, // 12 + 100000, // 13 + 150000, // 14 + 200000, // 15 + 300000, // 16 + 400000, // 17 + 500000, // 18 + 650000, // 19 + 800000, // 20 + 1000000, // 21 + 1500000, // 22 + 2000000, // 23 + 3000000, // 24 + 4000000, // 25 + 5000000, // 26 + 6500000, // 27 + 8000000, // 28 + 10000000, // 29 + 15000000, // 30 + math.MaxUint32, // 31 // to ease binary search, but will be reported as 29s + }, + } +} + +// searchBucket implements a binary search, to find the bucket i to insert val in, like so: +// limits[i-1] < val <= limits[i] +// if we can convince the go compiler to inline this we can get a 14~22% speedup (verified by manually patching it in) +// but we can't :( see https://github.com/golang/go/issues/17566 +// so for now, we just replicate this code in AddDuration below. make sure to keep the code in sync! +func searchBucket(limits [32]uint32, micros uint32) int { + min, i, max := 0, 16, 32 + for { + if micros <= limits[i] { + if i == 0 || micros > limits[i-1] { + return i + } + max = i + } else { + min = i + } + i = min + ((max - min) / 2) + } +} + +// adds to the right bucket with a copy of the searchBucket function above, to enforce inlining. +func (h *Hist15s) AddDuration(value time.Duration) { + // note: overflows at 4294s, but if you have values this high, + // you are definitely not using this histogram for the target use case. 
+ micros := uint32(value.Nanoseconds() / 1000) + min, i, max := 0, 16, 32 + for { + if micros <= h.limits[i] { + if i == 0 || micros > h.limits[i-1] { + atomic.AddUint32(&h.counts[i], 1) + return + } + max = i + } else { + min = i + } + i = min + ((max - min) / 2) + } +} + +// Snapshot returns a snapshot of the data and resets internal state +func (h *Hist15s) Snapshot() []uint32 { + snap := make([]uint32, 32) + for i := 0; i < 32; i++ { + snap[i] = atomic.SwapUint32(&h.counts[i], 0) + } + return snap +} + +// if count is 0 then there was no input, so statistical summaries are invalid +type Report struct { + Min uint32 // in micros + Mean uint32 // in micros + Median uint32 // in micros + P75 uint32 // in micros + P90 uint32 // in micros + Max uint32 // in micros + Count uint32 +} + +// ok can be false for two reasons: +// (in either case, we can't compute the summaries) +// * the total count was 0 +// * the total count overflowed (but we set to 1 so you can tell the difference) +func (h *Hist15s) Report(data []uint32) (Report, bool) { + totalValue := uint64(0) + r := Report{} + for i, count := range data { + if count > 0 { + remainder := math.MaxUint32 - r.Count + if count > remainder { + r.Count = 1 + return r, false + } + limit := h.limits[i] + // special case, report as 29s + if i == 31 { + limit = maxVal + } + if r.Min == 0 { // this means we haven't found min yet. + r.Min = limit + } + r.Max = limit + r.Count += count + totalValue += uint64(count * limit) + } + } + if r.Count == 0 { + return r, false + } + r.Median = h.limits[Quantile(data, 0.50, r.Count)] + if r.Median == math.MaxUint32 { + r.Median = maxVal + } + r.P75 = h.limits[Quantile(data, 0.75, r.Count)] + if r.P75 == math.MaxUint32 { + r.P75 = maxVal + } + r.P90 = h.limits[Quantile(data, 0.90, r.Count)] + if r.P90 == math.MaxUint32 { + r.P90 = maxVal + } + r.Mean = uint32(totalValue / uint64(r.Count)) + return r, true +} + +// quantile q means what's the value v so that all q of the values have value <= v +// TODO since we typically get 99%, 99.9%, 75%, etc. it should be faster to count the other way +func Quantile(data []uint32, q float64, total uint32) int { + count := q * float64(total) + for i := 0; i < 32; i++ { + count -= float64(data[i]) + + if count <= 0 { + return i // we return the upper limit, real quantile would be less, but we can't make the result any better. + } + } + + return -1 +} diff --git a/vendor/github.com/alexcesaro/statsd/LICENSE b/vendor/github.com/alexcesaro/statsd/LICENSE deleted file mode 100644 index 4ec7268d58..0000000000 --- a/vendor/github.com/alexcesaro/statsd/LICENSE +++ /dev/null @@ -1,20 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Alexandre Cesaro - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/alexcesaro/statsd/README.md b/vendor/github.com/alexcesaro/statsd/README.md deleted file mode 100644 index 00547dfcd7..0000000000 --- a/vendor/github.com/alexcesaro/statsd/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# statsd -[![Build Status](https://travis-ci.org/alexcesaro/statsd.svg?branch=v1)](https://travis-ci.org/alexcesaro/statsd) [![Code Coverage](http://gocover.io/_badge/gopkg.in/alexcesaro/statsd.v1)](http://gocover.io/gopkg.in/alexcesaro/statsd.v1) [![Documentation](https://godoc.org/gopkg.in/alexcesaro/statsd.v1?status.svg)](https://godoc.org/gopkg.in/alexcesaro/statsd.v1) - -## Introduction - -statsd is a simple and efficient [Statsd](https://github.com/etsy/statsd) -client. - -See the [benchmark](https://github.com/alexcesaro/statsdbench) for a comparison -with other Go StatsD clients. - -## Features - -- Supports all StatsD metrics: counter, gauge, timing and set -- Supports Datadog and InfluxDB tags -- Fast and GC-friendly: Client's methods do not allocate -- Simple API -- 100% test coverage -- Versioned API using gopkg.in - - -## Documentation - -https://godoc.org/gopkg.in/alexcesaro/statsd.v1 - - -## Download - - go get gopkg.in/alexcesaro/statsd.v1 - - -## Example - -See the [examples in the documentation](https://godoc.org/gopkg.in/alexcesaro/statsd.v1#example-package). - - -## License - -[MIT](LICENSE) - - -## Contribute - -Do you have any question the documentation does not answer? Is there a use case -that you feel is common and is not well-addressed by the current API? - -If so you are more than welcome to ask questions in the -[thread on golang-nuts](https://groups.google.com/d/topic/golang-nuts/Tz6t4_iLgnw/discussion) -or open an issue or send a pull-request here on Github. diff --git a/vendor/github.com/alexcesaro/statsd/doc.go b/vendor/github.com/alexcesaro/statsd/doc.go deleted file mode 100644 index ffd6c066d3..0000000000 --- a/vendor/github.com/alexcesaro/statsd/doc.go +++ /dev/null @@ -1,17 +0,0 @@ -/* -Package statsd is a simple and efficient StatsD client. - -Client's methods are fast and do not allocate memory. - -Internally, Client's methods buffers metrics. The buffer is flushed when either: - - the background goroutine flushes the buffer (every 100ms by default) - - the buffer is full (1440 bytes by default so that IP packets are not - fragmented) - -The background goroutine can be disabled using the WithFlushPeriod(0) option. - -Buffering can be disabled using the WithMaxPacketSize(0) option. - -StatsD homepage: https://github.com/etsy/statsd -*/ -package statsd diff --git a/vendor/github.com/alexcesaro/statsd/statsd.go b/vendor/github.com/alexcesaro/statsd/statsd.go deleted file mode 100644 index 45aec1b56a..0000000000 --- a/vendor/github.com/alexcesaro/statsd/statsd.go +++ /dev/null @@ -1,454 +0,0 @@ -package statsd - -import ( - "bytes" - "math/rand" - "net" - "strconv" - "sync" - "time" -) - -// A Client represents a StatsD client. -type Client struct { - mu sync.Mutex - - // Fields guarded by the mutex. - conn net.Conn - buf []byte - rateCache map[float32]string - closed bool - - // Fields settable with options at Client's creation. 
- muted bool - errorHandler func(error) - flushPeriod time.Duration - maxPacketSize int - network string - prefix string - tagFormat tagFormat - tags string -} - -// An Option represents an option for a Client. It must be used as an argument -// to New(). -type Option func(*Client) - -// Mute sets whether the Client is muted. -func Mute(b bool) Option { - return Option(func(c *Client) { - c.muted = b - }) -} - -// WithErrorHandler sets the error handling function used by the Client. -func WithErrorHandler(h func(error)) Option { - return Option(func(c *Client) { - c.errorHandler = h - }) -} - -// WithFlushPeriod sets how often the Client's buffer is flushed. -// If period is 0, the goroutine that periodically flush the buffer is not -// lauched and the buffer is only flushed when it is full. -// -// By default the flush period is 100 milliseconds. -func WithFlushPeriod(period time.Duration) Option { - return Option(func(c *Client) { - c.flushPeriod = period - }) -} - -// WithMaxPacketSize sets the maximum packet size in bytes sent by the Client. -// -// By default it is 1440. -func WithMaxPacketSize(n int) Option { - return Option(func(c *Client) { - c.maxPacketSize = n - }) -} - -// WithNetwork sets the network (udp, tcp, etc) used by the client. -// See net.Dial documentation: https://golang.org/pkg/net/#Dial -// -// By default, network is udp. -func WithNetwork(network string) Option { - return Option(func(c *Client) { - c.network = network - }) -} - -// WithPrefix sets the prefix prepended to every bucket name. -func WithPrefix(prefix string) Option { - return Option(func(c *Client) { - c.prefix = prefix - }) -} - -// WithDatadogTags sets the Datadog tags sent with every metrics. -// -// The tags should have the key:value syntax. -// See http://docs.datadoghq.com/guides/metrics/#tags -func WithDatadogTags(tags ...string) Option { - return Option(func(c *Client) { - // Datadog tag format: |#tag1:value1,tag2,tag3:value3 - // See http://docs.datadoghq.com/guides/dogstatsd/#datagram-format - buf := bytes.NewBufferString("|#") - first := true - for i := 0; i < len(tags); i++ { - if first { - first = false - } else { - buf.WriteByte(',') - } - buf.WriteString(tags[i]) - } - c.tagFormat = datadogFormat - c.tags = buf.String() - }) -} - -// WithInfluxDBTags sets the InfluxDB tags sent with every metrics. -// -// The tags must be set as key-value pairs. If the number of tags is not even, -// WithInfluxDBTags panics. -// -// See https://influxdb.com/blog/2015/11/03/getting_started_with_influx_statsd.html -func WithInfluxDBTags(tags ...string) Option { - if len(tags)%2 != 0 { - panic("statsd: WithInfluxDBTags only accepts an even number arguments") - } - - // InfluxDB tag format: ,tag1=payroll,region=us-west - // https://influxdb.com/blog/2015/11/03/getting_started_with_influx_statsd.html - return Option(func(c *Client) { - var buf bytes.Buffer - for i := 0; i < len(tags)/2; i++ { - buf.WriteByte(',') - buf.WriteString(tags[2*i]) - buf.WriteByte('=') - buf.WriteString(tags[2*i+1]) - } - c.tagFormat = influxDBFormat - c.tags = buf.String() - }) -} - -type tagFormat uint8 - -const ( - datadogFormat tagFormat = iota + 1 - influxDBFormat -) - -// New creates a new Client with the given options. 
-func New(addr string, options ...Option) (*Client, error) { - c := &Client{ - // Worst-case scenario: - // Ethernet MTU - IPv6 Header - TCP Header = 1500 - 40 - 20 = 1440 - maxPacketSize: 1440, - } - - for _, o := range options { - o(c) - } - - if c.muted { - return c, nil - } - - if c.network == "" { - c.network = "udp" - } - var err error - c.conn, err = dialTimeout(c.network, addr, 5*time.Second) - if err != nil { - return nil, err - } - // When using UDP do a quick check to see if something is listening on the - // given port to return an error as soon as possible. - if c.network[:3] == "udp" { - for i := 0; i < 2; i++ { - _, err = c.conn.Write(nil) - if err != nil { - _ = c.conn.Close() - return nil, err - } - } - } - - if c.flushPeriod == 0 { - c.flushPeriod = 100 * time.Millisecond - } - - // To prevent a buffer overflow add some capacity to the buffer to allow for - // an additional metric. - c.buf = make([]byte, 0, c.maxPacketSize+200) - - if c.flushPeriod > 0 { - go func() { - ticker := time.NewTicker(c.flushPeriod) - for _ = range ticker.C { - c.mu.Lock() - if c.closed { - ticker.Stop() - c.mu.Unlock() - return - } - c.flush(0) - c.mu.Unlock() - } - }() - } - - return c, nil -} - -// Count adds n to bucket with the given sampling rate. -func (c *Client) Count(bucket string, n int, rate float32) { - if c.muted { - return - } - if isRandAbove(rate) { - return - } - - c.mu.Lock() - l := len(c.buf) - c.appendBucket(bucket) - c.appendInt(n) - c.appendType("c") - c.appendRate(rate) - c.closeMetric() - c.flushIfBufferFull(l) - c.mu.Unlock() -} - -func isRandAbove(rate float32) bool { - return rate != 1 && randFloat() > rate -} - -// Increment increment the given bucket. -// It is equivalent to Count(bucket, 1, 1). -func (c *Client) Increment(bucket string) { - c.Count(bucket, 1, 1) -} - -// Gauge records an absolute value for the given bucket. -func (c *Client) Gauge(bucket string, value int) { - if c.muted { - return - } - - c.mu.Lock() - l := len(c.buf) - // To set a gauge to a negative value we must first set it to 0. - // https://github.com/etsy/statsd/blob/master/docs/metric_types.md#gauges - if value < 0 { - c.appendBucket(bucket) - c.gauge(0) - } - c.appendBucket(bucket) - c.gauge(value) - c.flushIfBufferFull(l) - c.mu.Unlock() -} - -// ChangeGauge changes the value of a gauge by the given delta. -func (c *Client) ChangeGauge(bucket string, delta int) { - if c.muted { - return - } - if delta == 0 { - return - } - - c.mu.Lock() - l := len(c.buf) - c.appendBucket(bucket) - if delta > 0 { - c.appendByte('+') - } - c.gauge(delta) - c.flushIfBufferFull(l) - c.mu.Unlock() -} - -func (c *Client) gauge(value int) { - c.appendInt(value) - c.appendType("g") - c.closeMetric() -} - -// Timing sends a timing value to a bucket with the given sampling rate. -func (c *Client) Timing(bucket string, value int, rate float32) { - if c.muted { - return - } - if isRandAbove(rate) { - return - } - - c.mu.Lock() - l := len(c.buf) - c.appendBucket(bucket) - c.appendInt(value) - c.appendType("ms") - c.appendRate(rate) - c.closeMetric() - c.flushIfBufferFull(l) - c.mu.Unlock() -} - -// A Timing is an helper object that eases sending timing values. -type Timing struct { - start time.Time - c *Client -} - -// NewTiming creates a new Timing. -func (c *Client) NewTiming() Timing { - return Timing{start: now(), c: c} -} - -// Send sends the time elapsed since the creation of the Timing to a bucket -// with the given sampling rate. 
-func (t Timing) Send(bucket string, rate float32) { - t.c.Timing(bucket, int(t.Duration()/time.Millisecond), rate) -} - -// Duration gets the duration since the creation of the Timing. -func (t Timing) Duration() time.Duration { - return now().Sub(t.start) -} - -// Unique sends the given value to a set bucket. -func (c *Client) Unique(bucket string, value string) { - if c.muted { - return - } - - c.mu.Lock() - l := len(c.buf) - c.appendBucket(bucket) - c.appendString(value) - c.appendType("s") - c.closeMetric() - c.flushIfBufferFull(l) - c.mu.Unlock() -} - -// Flush flushes the Client's buffer. -func (c *Client) Flush() { - if c.muted { - return - } - - c.mu.Lock() - c.flush(0) - c.mu.Unlock() -} - -// Close flushes the Client's buffer and releases the associated ressources. -func (c *Client) Close() { - if c.muted { - return - } - - c.mu.Lock() - c.flush(0) - c.handleError(c.conn.Close()) - c.closed = true - c.mu.Unlock() -} - -func (c *Client) appendByte(b byte) { - c.buf = append(c.buf, b) -} - -func (c *Client) appendString(s string) { - c.buf = append(c.buf, s...) -} - -func (c *Client) appendInt(i int) { - c.buf = strconv.AppendInt(c.buf, int64(i), 10) -} - -func (c *Client) appendBucket(bucket string) { - if c.prefix != "" { - c.appendString(c.prefix) - } - c.appendString(bucket) - if c.tagFormat == influxDBFormat { - c.appendString(c.tags) - } - c.appendByte(':') -} - -func (c *Client) appendType(t string) { - c.appendByte('|') - c.appendString(t) -} - -func (c *Client) appendRate(rate float32) { - if rate == 1 { - return - } - if c.rateCache == nil { - c.rateCache = make(map[float32]string) - } - - c.appendString("|@") - if s, ok := c.rateCache[rate]; ok { - c.appendString(s) - } else { - s = strconv.FormatFloat(float64(rate), 'f', -1, 32) - c.rateCache[rate] = s - c.appendString(s) - } -} - -func (c *Client) closeMetric() { - if c.tagFormat == datadogFormat { - c.appendString(c.tags) - } - c.appendByte('\n') -} - -func (c *Client) flushIfBufferFull(lastSafeLen int) { - if len(c.buf) > c.maxPacketSize { - c.flush(lastSafeLen) - } -} - -// flush flushes the first n bytes of the buffer. -// If n is 0, the whole buffer is flushed. -func (c *Client) flush(n int) { - if len(c.buf) == 0 { - return - } - if n == 0 { - n = len(c.buf) - } - - // Trim the last \n, StatsD does not like it. - _, err := c.conn.Write(c.buf[:n-1]) - c.handleError(err) - if n < len(c.buf) { - copy(c.buf, c.buf[n:]) - } - c.buf = c.buf[:len(c.buf)-n] -} - -func (c *Client) handleError(err error) { - if err != nil && c.errorHandler != nil { - c.errorHandler(err) - } -} - -// Stubbed out for testing. 
-var ( - dialTimeout = net.DialTimeout - now = time.Now - randFloat = rand.Float32 -) diff --git a/vendor/github.com/dgryski/go-linlog/hist.go b/vendor/github.com/dgryski/go-linlog/hist.go new file mode 100644 index 0000000000..4f9c5e8b56 --- /dev/null +++ b/vendor/github.com/dgryski/go-linlog/hist.go @@ -0,0 +1,59 @@ +package linlog + +import "sync/atomic" + +type Histogram struct { + max uint64 + linear uint64 + subbin uint64 + bins []uint64 + sizes []uint64 +} + +func NewHistogram(max, linear, subbin uint64) *Histogram { + _, size := BinOf(max, linear, subbin) + return &Histogram{ + max: max, + linear: linear, + subbin: subbin, + bins: make([]uint64, size+1), + sizes: Bins(max, linear, subbin), + } +} + +func (h *Histogram) Insert(n uint64) { + _, bin := BinOf(n, h.linear, h.subbin) + if bin >= uint64(len(h.bins)) { + bin = uint64(len(h.bins) - 1) + } + h.bins[bin]++ +} + +func (h *Histogram) AtomicInsert(n uint64) { + _, bin := BinOf(n, h.linear, h.subbin) + if bin >= uint64(len(h.bins)) { + bin = uint64(len(h.bins) - 1) + } + atomic.AddUint64(&h.bins[bin], 1) +} + +type Bin struct { + Size uint64 + Count uint64 +} + +func (h *Histogram) Bins() []Bin { + bins := make([]Bin, len(h.bins)) + for i, v := range h.bins { + bins[i] = Bin{h.sizes[i], v} + } + return bins +} + +func (h *Histogram) AtomicBins() []Bin { + bins := make([]Bin, len(h.bins)) + for i := range h.bins { + bins[i] = Bin{h.sizes[i], atomic.LoadUint64(&h.bins[i])} + } + return bins +} diff --git a/vendor/github.com/dgryski/go-linlog/lb.go b/vendor/github.com/dgryski/go-linlog/lb.go new file mode 100644 index 0000000000..c7105bffad --- /dev/null +++ b/vendor/github.com/dgryski/go-linlog/lb.go @@ -0,0 +1,40 @@ +// +build !amd64 appengine + +package linlog + +func lb(x uint64) uint64 { + + if x == 0 { + return 0 + } + + var n uint64 + + if (x >> 32) == 0 { + n = n + 32 + x = x << 32 + } + if (x >> (32 + 16)) == 0 { + n = n + 16 + x = x << 16 + } + + if (x >> (32 + 16 + 8)) == 0 { + n = n + 8 + x = x << 8 + } + + if (x >> (32 + 16 + 8 + 4)) == 0 { + n = n + 4 + x = x << 4 + } + + if (x >> (32 + 16 + 8 + 4 + 2)) == 0 { + n = n + 2 + x = x << 2 + } + + n += 1 + + return 63 + (x >> 63) - n +} diff --git a/vendor/github.com/dgryski/go-linlog/lb_amd64.s b/vendor/github.com/dgryski/go-linlog/lb_amd64.s new file mode 100644 index 0000000000..a25be3f5be --- /dev/null +++ b/vendor/github.com/dgryski/go-linlog/lb_amd64.s @@ -0,0 +1,12 @@ +// +build amd64,!appengine + +// func lb(x uint64) uint64 +TEXT ·lb(SB), 4, $0-16 + BSRQ x+0(FP), AX + JZ zero + MOVQ AX, ret+8(FP) + RET + +zero: + MOVQ $64, ret+8(FP) + RET diff --git a/vendor/github.com/dgryski/go-linlog/lb_asm.go b/vendor/github.com/dgryski/go-linlog/lb_asm.go new file mode 100644 index 0000000000..ae240a0a9f --- /dev/null +++ b/vendor/github.com/dgryski/go-linlog/lb_asm.go @@ -0,0 +1,5 @@ +// +build amd64,!appengine + +package linlog + +func lb(x uint64) uint64 diff --git a/vendor/github.com/dgryski/go-linlog/linlog.go b/vendor/github.com/dgryski/go-linlog/linlog.go new file mode 100644 index 0000000000..50ef0a210b --- /dev/null +++ b/vendor/github.com/dgryski/go-linlog/linlog.go @@ -0,0 +1,58 @@ +// Package linlog implements linear-log bucketing +/* + +http://pvk.ca/Blog/2015/06/27/linear-log-bucketing-fast-versatile-simple/ + +*/ +package linlog + +// BinOf rounds size as appropriate, and returns the rounded size and bucket number. 
+func BinOf(size uint64, linear, subbin uint64) (rounded uint64, bucket uint64) { + nBits := lb(size | (1 << uint(linear))) + shift := nBits - subbin + mask := uint64(1<<shift) - 1 + round := size + mask + subIndex := round >> shift + xrange := nBits - linear + + return (round & ^mask), (xrange << subbin) + subIndex +} + +// BinDownOf rounds size down, and returns the rounded size and bucket number. +func BinDownOf(size uint64, linear, subbin uint64) (rounded uint64, bucket uint64) { + nBits := lb(size | (1 << linear)) + shift := nBits - subbin + subIndex := size >> shift + xrange := nBits - linear + + return (subIndex << shift), (xrange << subbin) + subIndex +} + +func Bins(max uint64, linear, subbin uint64) []uint64 { + var buckets []uint64 + buckets = append(buckets, 0) + + var size uint64 + var incr uint64 = 1 << (linear - subbin) + + for size < (1 << linear) { + size += incr + if size >= max { + break + } + buckets = append(buckets, size) + } + + for size < max { + for steps := uint64(0); steps < (1 << subbin); steps++ { + size += incr + buckets = append(buckets, size) + if size > max { + break + } + } + incr <<= 1 + } + + return buckets +} diff --git a/vendor/github.com/raintank/met/README.md b/vendor/github.com/raintank/met/README.md deleted file mode 100644 index 4719c27bce..0000000000 --- a/vendor/github.com/raintank/met/README.md +++ /dev/null @@ -1,15 +0,0 @@ -an opinionated wrapper around metric client libraries - -imported from Grafana ffcc807ed34f853a8bc9600bcf7801547a5feb4f - -supports: -* statsd (recommended!) -* dogstatsd - -and later maybe more, like go-metrics - - -Why? -* make it easy to switch between libraries. -* some libraries just take in string arguments for gauge names etc, but it's nicer to have variables per object for robustness, especially when used in multiple places, and gives a central overview. -* allows you to set deleteGauges and deleteStats in your statsd servers ( a good thing for stateless servers), cause we will automatically keep a gauge sending. 
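For reference, a minimal usage sketch of the vendored go-linlog package above (not part of the commit; the values in the comments follow from the BinOf and Bins definitions as reconstructed):

```go
// Linear-log bucketing: with linear=6 and subbin=2, values below 2^6=64 are
// bucketed at a fixed granularity of 2^(6-2)=16, and every power-of-two range
// above that is split into 2^2=4 sub-buckets.
package main

import (
	"fmt"

	"github.com/dgryski/go-linlog"
)

func main() {
	// 100 falls in [64,128), whose sub-buckets are 16 wide:
	// it rounds up to 112 and lands in bucket index 7.
	rounded, bucket := linlog.BinOf(100, 6, 2)
	fmt.Println(rounded, bucket) // 112 7

	// all bucket boundaries up to a max of 256
	fmt.Println(linlog.Bins(256, 6, 2))
}
```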
diff --git a/vendor/github.com/raintank/met/dogstatsd/count.go b/vendor/github.com/raintank/met/dogstatsd/count.go deleted file mode 100644 index beb519e9c3..0000000000 --- a/vendor/github.com/raintank/met/dogstatsd/count.go +++ /dev/null @@ -1,18 +0,0 @@ -package dogstatsd - -import "github.com/raintank/met" - -type Count struct { - key string - backend Backend -} - -func (b Backend) NewCount(key string) met.Count { - c := Count{key, b} - c.Inc(0) - return c -} - -func (c Count) Inc(val int64) { - c.backend.client.Count(c.key, val, []string{}, 1) -} diff --git a/vendor/github.com/raintank/met/dogstatsd/gauge.go b/vendor/github.com/raintank/met/dogstatsd/gauge.go deleted file mode 100644 index e3836187a4..0000000000 --- a/vendor/github.com/raintank/met/dogstatsd/gauge.go +++ /dev/null @@ -1,51 +0,0 @@ -package dogstatsd - -import ( - "sync" - "time" -) -import "github.com/raintank/met" - -type Gauge struct { - key string - val int64 - sync.Mutex - backend Backend -} - -func (b Backend) NewGauge(key string, val int64) met.Gauge { - g := Gauge{ - key: key, - backend: b, - } - go func() { - for { - g.Lock() - g.backend.client.Gauge(g.key, float64(g.val), []string{}, 1) - g.Unlock() - time.Sleep(time.Duration(1) * time.Second) - } - }() - return &g -} - -func (g *Gauge) Value(val int64) { - g.Lock() - defer g.Unlock() - g.val = val - g.backend.client.Gauge(g.key, float64(g.val), []string{}, 1) -} - -func (g *Gauge) Inc(val int64) { - g.Lock() - defer g.Unlock() - g.val += val - g.backend.client.Gauge(g.key, float64(g.val), []string{}, 1) -} - -func (g *Gauge) Dec(val int64) { - g.Lock() - defer g.Unlock() - g.val -= val - g.backend.client.Gauge(g.key, float64(g.val), []string{}, 1) -} diff --git a/vendor/github.com/raintank/met/dogstatsd/init.go b/vendor/github.com/raintank/met/dogstatsd/init.go deleted file mode 100644 index ccb968719f..0000000000 --- a/vendor/github.com/raintank/met/dogstatsd/init.go +++ /dev/null @@ -1,27 +0,0 @@ -// a metrics class that uses dogstatsd on the backend - -// note that on creation, we automatically send a default value so that: -// * influxdb doesn't complain when queried for series that don't exist yet, which breaks graphs in grafana -// * the series show up in your monitoring tool of choice, so you can easily do alerting rules, build dashes etc -// without having to wait for data. some series would otherwise only be created when things go badly wrong etc. -// note that for gauges and timers this can create inaccuracies because the true values are hard to predict, -// but it's worth the trade-off. -// (for count 0 is harmless and accurate) - -package dogstatsd - -import "github.com/DataDog/datadog-go/statsd" - -type Backend struct { - client *statsd.Client -} - -// note: library does not auto-add ending dot to prefix, specify it if you want it -func New(addr, prefix string, tags []string) (Backend, error) { - client, err := statsd.New(addr) - if err == nil { - client.Namespace = prefix - client.Tags = tags - } - return Backend{client}, err -} diff --git a/vendor/github.com/raintank/met/dogstatsd/meter.go b/vendor/github.com/raintank/met/dogstatsd/meter.go deleted file mode 100644 index 0ed5437502..0000000000 --- a/vendor/github.com/raintank/met/dogstatsd/meter.go +++ /dev/null @@ -1,20 +0,0 @@ -// it's commonly used for non-timer cases where we want these summaries, that's -// what this is for. 
-package dogstatsd - -import "github.com/raintank/met" - -type Meter struct { - key string - backend Backend -} - -func (b Backend) NewMeter(key string, val int64) met.Meter { - m := Meter{key, b} - m.Value(val) - return m -} - -func (m Meter) Value(val int64) { - m.backend.client.Histogram(m.key, float64(val), []string{}, 1) -} diff --git a/vendor/github.com/raintank/met/dogstatsd/timer.go b/vendor/github.com/raintank/met/dogstatsd/timer.go deleted file mode 100644 index 711cf9ee49..0000000000 --- a/vendor/github.com/raintank/met/dogstatsd/timer.go +++ /dev/null @@ -1,23 +0,0 @@ -package dogstatsd - -import "time" -import "github.com/raintank/met" - -// note that due the preseeding in init, you shouldn't rely on the count and count_ps summaries -// rather, consider maintaining a separate counter -// see https://github.com/raintank/grafana/issues/133 - -type Timer struct { - key string - backend Backend -} - -func (b Backend) NewTimer(key string, val time.Duration) met.Timer { - t := Timer{key, b} - t.Value(val) - return t -} - -func (t Timer) Value(val time.Duration) { - t.backend.client.TimeInMilliseconds(t.key, val.Seconds()*1000, []string{}, 1) -} diff --git a/vendor/github.com/raintank/met/helper/helper.go b/vendor/github.com/raintank/met/helper/helper.go deleted file mode 100644 index aef46e8a42..0000000000 --- a/vendor/github.com/raintank/met/helper/helper.go +++ /dev/null @@ -1,25 +0,0 @@ -package helper - -import ( - "fmt" - - "github.com/raintank/met" - "github.com/raintank/met/dogstatsd" - "github.com/raintank/met/statsd" -) - -func New(enabled bool, addr, t, service, instance string) (met.Backend, error) { - if t != "standard" && t != "datadog" { - panic(fmt.Sprintf("unrecognized statsd type: '%s'", t)) - } - if !enabled { - // we could implement a true "null-backend" - // but since statsd supports disabled mode, this is easier - return statsd.New(enabled, addr, "") - } - if t == "standard" { - return statsd.New(enabled, addr, fmt.Sprintf("%s.%s.", service, instance)) - } else { - return dogstatsd.New(addr, service+".", []string{"instance:" + instance}) - } -} diff --git a/vendor/github.com/raintank/met/interfaces.go b/vendor/github.com/raintank/met/interfaces.go deleted file mode 100644 index cd8d14d2e8..0000000000 --- a/vendor/github.com/raintank/met/interfaces.go +++ /dev/null @@ -1,35 +0,0 @@ -package met - -import "time" - -type Backend interface { - NewCount(key string) Count - NewGauge(key string, val int64) Gauge - NewMeter(key string, val int64) Meter - NewTimer(key string, val time.Duration) Timer -} - -// Count is a type that counts how many hits it's seen in each given interval -// and computes the rate per second -// it's not a long-running counter. -// values are explicit -type Count interface { - Inc(val int64) -} - -// gauge makes sure its value is explicit (i.e. 
for statsd, keep sending) -type Gauge interface { - Dec(val int64) - Inc(val int64) - Value(val int64) -} - -// like a timer, but not just for timings -type Meter interface { - Value(val int64) -} - -// computes stasticical summaries -type Timer interface { - Value(val time.Duration) -} diff --git a/vendor/github.com/raintank/met/statsd/count.go b/vendor/github.com/raintank/met/statsd/count.go deleted file mode 100644 index f04a9dabc9..0000000000 --- a/vendor/github.com/raintank/met/statsd/count.go +++ /dev/null @@ -1,18 +0,0 @@ -package statsd - -import "github.com/raintank/met" - -type Count struct { - key string - backend Backend -} - -func (b Backend) NewCount(key string) met.Count { - c := Count{key, b} - c.Inc(0) - return c -} - -func (c Count) Inc(val int64) { - c.backend.client.Count(c.key, int(val), 1) -} diff --git a/vendor/github.com/raintank/met/statsd/gauge.go b/vendor/github.com/raintank/met/statsd/gauge.go deleted file mode 100644 index 34ae8e9e33..0000000000 --- a/vendor/github.com/raintank/met/statsd/gauge.go +++ /dev/null @@ -1,52 +0,0 @@ -package statsd - -import ( - "sync" - "time" - - "github.com/raintank/met" -) - -type Gauge struct { - key string - val int64 - sync.Mutex - backend Backend -} - -func (b Backend) NewGauge(key string, val int64) met.Gauge { - g := Gauge{ - key: key, - backend: b, - } - go func() { - for { - g.Lock() - g.backend.client.Gauge(g.key, int(g.val)) - g.Unlock() - time.Sleep(time.Duration(1) * time.Second) - } - }() - return &g -} - -func (g *Gauge) Value(val int64) { - g.Lock() - g.val = val - g.Unlock() - g.backend.client.Gauge(g.key, int(val)) -} - -func (g *Gauge) Inc(val int64) { - g.Lock() - defer g.Unlock() - g.val += val - g.backend.client.Gauge(g.key, int(g.val)) -} - -func (g *Gauge) Dec(val int64) { - g.Lock() - defer g.Unlock() - g.val -= val - g.backend.client.Gauge(g.key, int(g.val)) -} diff --git a/vendor/github.com/raintank/met/statsd/init.go b/vendor/github.com/raintank/met/statsd/init.go deleted file mode 100644 index 4bf7ecc12a..0000000000 --- a/vendor/github.com/raintank/met/statsd/init.go +++ /dev/null @@ -1,14 +0,0 @@ -package statsd - -import "github.com/alexcesaro/statsd" - -type Backend struct { - client *statsd.Client -} - -// note: library does not auto add ending dot to prefix. -func New(enabled bool, addr, prefix string) (Backend, error) { - client, err := statsd.New(addr, statsd.WithPrefix(prefix), statsd.Mute(!enabled)) - b := Backend{client} - return b, err -} diff --git a/vendor/github.com/raintank/met/statsd/meter.go b/vendor/github.com/raintank/met/statsd/meter.go deleted file mode 100644 index 34a01460d6..0000000000 --- a/vendor/github.com/raintank/met/statsd/meter.go +++ /dev/null @@ -1,20 +0,0 @@ -// it's commonly used for non-timer cases where we want these summaries, that's -// what this is for. 
-package statsd - -import "github.com/raintank/met" - -type Meter struct { - key string - backend Backend -} - -func (b Backend) NewMeter(key string, val int64) met.Meter { - m := Meter{key, b} - m.Value(val) - return m -} - -func (m Meter) Value(val int64) { - m.backend.client.Timing(m.key, int(val), 1) -} diff --git a/vendor/github.com/raintank/met/statsd/timer.go b/vendor/github.com/raintank/met/statsd/timer.go deleted file mode 100644 index f4a72fb437..0000000000 --- a/vendor/github.com/raintank/met/statsd/timer.go +++ /dev/null @@ -1,23 +0,0 @@ -package statsd - -import "time" -import "github.com/raintank/met" - -// note that due the preseeding in init, you shouldn't rely on the count and count_ps summaries -// rather, consider maintaining a separate counter -// see https://github.com/raintank/grafana/issues/133 - -type Timer struct { - key string - backend Backend -} - -func (b Backend) NewTimer(key string, val time.Duration) met.Timer { - t := Timer{key, b} - t.Value(val) - return t -} - -func (t Timer) Value(val time.Duration) { - t.backend.client.Timing(t.key, int(val/time.Millisecond), 1) -} diff --git a/vendor/github.com/raintank/misc/instrumented_nsq/consumer.go b/vendor/github.com/raintank/misc/instrumented_nsq/consumer.go deleted file mode 100644 index aff8ce75c8..0000000000 --- a/vendor/github.com/raintank/misc/instrumented_nsq/consumer.go +++ /dev/null @@ -1,51 +0,0 @@ -package insq - -import ( - "fmt" - "sync/atomic" - "time" - - "github.com/nsqio/go-nsq" - "github.com/raintank/met" -) - -type Consumer struct { - *nsq.Consumer - msgsReceived met.Gauge - msgsFinished met.Gauge - msgsRequeued met.Gauge - msgsConnections met.Gauge - numHandlers int32 - handlers met.Gauge -} - -func NewConsumer(topic, channel string, config *nsq.Config, metricsPatt string, metrics met.Backend) (*Consumer, error) { - consumer, err := nsq.NewConsumer(topic, channel, config) - c := Consumer{ - consumer, - metrics.NewGauge(fmt.Sprintf(metricsPatt, "received"), 0), - metrics.NewGauge(fmt.Sprintf(metricsPatt, "finished"), 0), - metrics.NewGauge(fmt.Sprintf(metricsPatt, "requeued"), 0), - metrics.NewGauge(fmt.Sprintf(metricsPatt, "connections"), 0), - 0, - metrics.NewGauge(fmt.Sprintf(metricsPatt, "num_handlers"), 0), - } - go func() { - t := time.Tick(time.Second * time.Duration(1)) - for range t { - s := consumer.Stats() - c.msgsReceived.Value(int64(s.MessagesReceived)) - c.msgsFinished.Value(int64(s.MessagesFinished)) - c.msgsRequeued.Value(int64(s.MessagesRequeued)) - c.msgsConnections.Value(int64(s.Connections)) - h := atomic.LoadInt32(&c.numHandlers) - c.handlers.Value(int64(h)) - } - }() - return &c, err -} - -func (r *Consumer) AddConcurrentHandlers(handler nsq.Handler, concurrency int) { - atomic.AddInt32(&r.numHandlers, int32(concurrency)) - r.Consumer.AddConcurrentHandlers(handler, concurrency) -} diff --git a/vendor/vendor.json b/vendor/vendor.json index b99c44300c..bd8ed6d7f8 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -3,10 +3,16 @@ "ignore": "test", "package": [ { - "checksumSHA1": "IltfP0VyYFIARQsJ3hNMMxHjPvE=", - "path": "github.com/DataDog/datadog-go/statsd", - "revision": "b050cd8f4d7c394545fd7d966c8e2909ce89d552", - "revisionTime": "2015-09-30T14:07:41Z" + "checksumSHA1": "vPAHn6NQfXwZmzal7YBN93Fnd6I=", + "path": "github.com/Dieterbe/artisanalhistogram/hist12h", + "revision": "91cfed5bb965b5797e2e452023c431f017599dba", + "revisionTime": "2016-11-27T15:30:35Z" + }, + { + "checksumSHA1": "3f8ivK6IiacIw+mE3uOqoyJu/Mg=", + "path": 
"github.com/Dieterbe/artisanalhistogram/hist15s", + "revision": "91cfed5bb965b5797e2e452023c431f017599dba", + "revisionTime": "2016-11-27T15:30:35Z" }, { "checksumSHA1": "P8h2SWEK3NsJleSPe+mVNNpLG6Y=", @@ -32,12 +38,6 @@ "revision": "28b053d5a2923b87ce8c5a08f3af779894a72758", "revisionTime": "2015-10-08T13:54:07Z" }, - { - "checksumSHA1": "0x0Ry5ZsFJAXeXieCP40WycRNFs=", - "path": "github.com/alexcesaro/statsd", - "revision": "bbc5756ebb41c2ac4c4f6abe584d25eb7cbe2e18", - "revisionTime": "2015-12-15T22:11:50Z" - }, { "checksumSHA1": "acGG5NV9NBtjspVlWZ86Zzg8pW8=", "path": "github.com/alyu/configparser", @@ -74,6 +74,12 @@ "revision": "2c7641e7dfe3945a0fe755f58c85ab306624956d", "revisionTime": "2015-09-21T07:33:52Z" }, + { + "checksumSHA1": "e4U50WUz2ycziirb3+JAVz3t49w=", + "path": "github.com/dgryski/go-linlog", + "revision": "f18bb8a4e7bcd60fd4fb99f3e8752f5da20f70a2", + "revisionTime": "2016-05-05T06:20:16Z" + }, { "checksumSHA1": "bJkM/x02zeuwA6avbGn2yvqjp20=", "path": "github.com/dgryski/go-tsz", @@ -242,42 +248,12 @@ "revision": "7b06fb4cf9324a785c2dedea9aa4e47769d54b92", "revisionTime": "2016-07-13T16:23:06Z" }, - { - "checksumSHA1": "bgKeW8sz3mMg/3QlazFovSIsjLc=", - "path": "github.com/raintank/met", - "revision": "daf6d57fc20532f01eb2320040a8054b5618cbf1", - "revisionTime": "2016-01-13T08:38:23Z" - }, - { - "checksumSHA1": "strvyPp3CIOymoYh9/PrKXs37C0=", - "path": "github.com/raintank/met/dogstatsd", - "revision": "daf6d57fc20532f01eb2320040a8054b5618cbf1", - "revisionTime": "2016-01-13T08:38:23Z" - }, - { - "checksumSHA1": "6PsyVJUl7ZJkemp8MEOhDU/iag4=", - "path": "github.com/raintank/met/helper", - "revision": "daf6d57fc20532f01eb2320040a8054b5618cbf1", - "revisionTime": "2016-01-13T08:38:23Z" - }, - { - "checksumSHA1": "hCKwS6ZHAUomcH/ukmDo366wSE8=", - "path": "github.com/raintank/met/statsd", - "revision": "daf6d57fc20532f01eb2320040a8054b5618cbf1", - "revisionTime": "2016-01-13T08:38:23Z" - }, { "checksumSHA1": "G5q2mryb77aa5RqtRSIiIXZpTcA=", "path": "github.com/raintank/misc/app", "revision": "cb73203311ae2489b845469f622678f24ff98a16", "revisionTime": "2016-07-13T17:23:02Z" }, - { - "checksumSHA1": "0B0oQeTSasAEyG1RgnGvBYK0nRU=", - "path": "github.com/raintank/misc/instrumented_nsq", - "revision": "cb73203311ae2489b845469f622678f24ff98a16", - "revisionTime": "2016-07-13T17:23:02Z" - }, { "checksumSHA1": "oCMADD0xqElYRIXBgNkBGHXGQyk=", "path": "github.com/raintank/worldping-api/pkg/log",