This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

update metrics descriptions
Dieterbe committed Dec 30, 2016
1 parent 2080c3c commit ec47883
Showing 13 changed files with 260 additions and 67 deletions.
17 changes: 12 additions & 5 deletions api/api.go
@@ -19,13 +19,20 @@ import (
var LogLevel int

var (
getTargetDuration = stats.NewLatencyHistogram15s32("api.get_target")
// metric api.get_target is how long it takes to get a target
getTargetDuration = stats.NewLatencyHistogram15s32("api.get_target")

// metric api.iters_to_points is how long it takes to decode points from a chunk iterator
itersToPointsDuration = stats.NewLatencyHistogram15s32("api.iters_to_points")
// just 1 global timer of request handling time. includes mem/cassandra gets, chunk decode/iters, json building etc
// there is such a thing as too many metrics. we have this, and cassandra timings, that should be enough for realtime profiling

// metric api.request_handle is how long it takes to handle a render request
reqHandleDuration = stats.NewLatencyHistogram15s32("api.request_handle")
reqSpanBoth = stats.NewMeter32("api.requests_span.mem_and_cassandra", false)
reqSpanMem = stats.NewMeter32("api.requests_span.mem", false)

// metric api.requests_span.mem_and_cassandra is the timerange of requests hitting both in-memory and cassandra
reqSpanBoth = stats.NewMeter32("api.requests_span.mem_and_cassandra", false)

// metric api.requests_span.mem is the timerange of requests hitting only the ringbuffer
reqSpanMem = stats.NewMeter32("api.requests_span.mem", false)
)

type Server struct {
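As a rough illustration of how the single request timer described above could be fed (a hedged sketch, not part of this commit: `renderSketch` is hypothetical, and it assumes the histogram returned by `stats.NewLatencyHistogram15s32` exposes a `Value(time.Duration)` method):

```go
package api

import (
	"net/http"
	"time"
)

// renderSketch wraps a render handler and records total handling time in
// reqHandleDuration, matching the "1 global timer" comment above: index
// lookups, chunk fetch/decode and JSON encoding all fall inside the span.
func renderSketch(inner http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		pre := time.Now()
		inner(w, r)
		reqHandleDuration.Value(time.Since(pre)) // assumed Value(time.Duration) API
	}
}
```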
40 changes: 34 additions & 6 deletions cassandra/metrics.go
@@ -20,13 +20,41 @@ type ErrMetrics struct {

func NewErrMetrics(component string) ErrMetrics {
return ErrMetrics{
cassErrTimeout: stats.NewCounter32(fmt.Sprintf("%s.error.timeout", component)),
cassErrTooManyTimeouts: stats.NewCounter32(fmt.Sprintf("%s.error.too-many-timeouts", component)),
cassErrConnClosed: stats.NewCounter32(fmt.Sprintf("%s.error.conn-closed", component)),
cassErrNoConns: stats.NewCounter32(fmt.Sprintf("%s.error.no-connections", component)),
cassErrUnavailable: stats.NewCounter32(fmt.Sprintf("%s.error.unavailable", component)),

// metric idx.cassandra.error.timeout is a counter of timeouts seen to the cassandra idx

// metric store.cassandra.error.timeout is a counter of timeouts seen to the cassandra store
cassErrTimeout: stats.NewCounter32(fmt.Sprintf("%s.error.timeout", component)),

// metric idx.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra idx

// metric store.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra store
cassErrTooManyTimeouts: stats.NewCounter32(fmt.Sprintf("%s.error.too-many-timeouts", component)),

// metric idx.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra idx

// metric store.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra store
cassErrConnClosed: stats.NewCounter32(fmt.Sprintf("%s.error.conn-closed", component)),

// metric idx.cassandra.error.no-connections is a counter of how many times we had no connections remaining to the cassandra idx

// metric store.cassandra.error.no-connections is a counter of how many times we had no connections remaining to the cassandra store
cassErrNoConns: stats.NewCounter32(fmt.Sprintf("%s.error.no-connections", component)),

// metric idx.cassandra.error.unavailable is a counter of how many times the cassandra idx was unavailable

// metric store.cassandra.error.unavailable is a counter of how many times the cassandra store was unavailable
cassErrUnavailable: stats.NewCounter32(fmt.Sprintf("%s.error.unavailable", component)),

// metric idx.cassandra.error.cannot-achieve-consistency is a counter of the cassandra idx not being able to achieve consistency for a given query

// metric store.cassandra.error.cannot-achieve-consistency is a counter of the cassandra store not being able to achieve consistency for a given query
cassErrCannotAchieveConsistency: stats.NewCounter32(fmt.Sprintf("%s.error.cannot-achieve-consistency", component)),
cassErrOther: stats.NewCounter32(fmt.Sprintf("%s.error.other", component)),

// metric idx.cassandra.error.other is a counter of other errors talking to the cassandra idx

// metric store.cassandra.error.other is a counter of other errors talking to the cassandra store
cassErrOther: stats.NewCounter32(fmt.Sprintf("%s.error.other", component)),
}
}
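NewErrMetrics is instantiated once per component, which is why every field above carries two comment lines. A sketch of the two call sites the comments imply (the import path and variable names are assumptions for illustration; the idx.cassandra call also appears further down in this commit):

```go
package example

import "github.com/raintank/metrictank/cassandra"

var (
	// feeds the idx.cassandra.error.* counters for the cassandra index
	idxErrMetrics = cassandra.NewErrMetrics("idx.cassandra")
	// feeds the store.cassandra.error.* counters for the cassandra chunk store
	storeErrMetrics = cassandra.NewErrMetrics("store.cassandra")
)
```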

164 changes: 141 additions & 23 deletions docs/metrics.md
@@ -1,38 +1,156 @@
# Overview of metrics
(only shows metrics that are documented; generated with [metrics2docs](github.com/Dieterbe/metrics2docs))
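The entries below are generated from one-line comments next to the metric declarations. As a hedged sketch of that convention (inferred from the diffs in this commit, not from metrics2docs itself; the `foo.requests` metric and import path are purely illustrative):

```go
package example

import "github.com/raintank/metrictank/stats"

var (
	// metric foo.requests is a counter of requests handled by the hypothetical foo component
	fooRequests = stats.NewCounter32("foo.requests")
)
```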

* `add_to_saved_chunk`:
* `api.get_target`:
how long it takes to get a target
* `api.iters_to_points`:
how long it takes to decode points from a chunk iterator
* `api.request_handle`:
how long it takes to handle a render request
* `api.requests_span.mem`:
the timerange of requests hitting only the ringbuffer
* `api.requests_span.mem_and_cassandra`:
the timerange of requests hitting both in-memory and cassandra
* `cluster.notifier.kafka.message_size`:
the sizes seen of messages through the kafka cluster notifier
* `cluster.notifier.kafka.messages-published`:
a counter of messages published to the kafka cluster notifier
* `cluster.notifier.nsq.message_size`:
the sizes seen of messages through the nsq cluster notifier
* `cluster.notifier.nsq.messages-published`:
a counter of messages published to the nsq cluster notifier
* `cluster.promotion_wait`:
how long a candidate (secondary node) has to wait until it can become a primary
When the timer becomes 0 it means the in-memory buffer has been able to fully populate, so that if you stop a primary
and it was able to save its complete chunks, this node will be able to take over without data loss.
You can upgrade a candidate to primary while the timer is not 0 yet; it just means it may have missing data in the chunks that it will save.
* `idx.cassandra.add.ok`:
how many metrics are successfully being indexed
* `idx.cassandra.add`:
the duration of additions to the cassandra idx
* `idx.cassandra.add.fail`:
how many failures were encountered while trying to index metrics
* `idx.cassandra.delete`:
the duration of deletions from the cassandra idx
* `idx.cassandra.error.cannot-achieve-consistency`:
a counter of the cassandra idx not being able to achieve consistency for a given query
* `idx.cassandra.error.conn-closed`:
a counter of how many times we saw a connection closed to the cassandra idx
* `idx.cassandra.error.no-connections`:
a counter of how many times we had no connections remaining to the cassandra idx
* `idx.cassandra.error.other`:
a counter of other errors talking to the cassandra idx
* `idx.cassandra.error.timeout`:
a counter of timeouts seen to the cassandra idx
* `idx.cassandra.error.too-many-timeouts`:
a counter of how many times we saw too many timeouts and closed the connection to the cassandra idx
* `idx.cassandra.error.unavailable`:
a counter of how many times the cassandra idx was unavailable
* `idx.elasticsearch.add`:
the duration of additions to the ES idx
* `idx.elasticsearch.add.fail`:
the number of failed additions to the ES idx
* `idx.elasticsearch.add.ok`:
the number of successful additions to the ES idx
* `idx.elasticsearch.delete`:
the duration of deletes from the ES idx
* `idx.elasticsearch.retrybuf.items`:
the amount of items currently in the retry buffer
* `idx.memory.add`:
the duration of (successful) memory idx additions
* `idx.memory.add.fail`:
the number of failed additions to the memory idx
* `idx.memory.add.ok`:
the number of successful additions to the memory idx
* `idx.memory.delete`:
the duration of memory idx deletes
* `idx.memory.find`:
the duration of memory idx find
* `idx.memory.get`:
the duration of memory idx gets
* `idx.memory.list`:
the duration of memory idx listings
* `mem.to_iter`:
how long it takes to transform in-memory chunks to iterators
* `memory.bytes.obtained_from_sys`:
the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at.
* `memory.bytes_allocated_on_heap`:
a gauge of currently allocated (within the runtime) memory.
* `memory.gc.cpu_fraction`:
how much cpu is consumed by the GC across process lifetime, in per mille
* `memory.gc.heap_objects`:
how many objects are allocated on the heap; it's a key indicator for GC workload
* `memory.gc.last_duration`:
the duration of the last GC STW pause in nanoseconds
* `memory.total_bytes_allocated`:
a counter of total amount of bytes allocated during process lifetime
* `memory.total_gc_cycles`:
a counter of the number of GC cycles since process start
* `metric_invalid`:
a count of times a metric did not validate
* `metrics_decode_err`:
a count of times an input message (MetricData, MetricDataArray or carbon line) failed to parse
* `store.cassandra.chunk_operations.save_fail`:
counter of failed saves
* `store.cassandra.chunk_operations.save_ok`:
counter of successful saves
* `store.cassandra.chunk_size.at_load`:
the sizes of chunks seen when loading them
* `store.cassandra.chunk_size.at_save`:
the sizes of chunks seen when saving them
* `store.cassandra.chunks_per_row`:
how many chunks are retrieved per row in get queries
* `store.cassandra.error.cannot-achieve-consistency`:
a counter of the cassandra store not being able to achieve consistency for a given query
* `store.cassandra.error.conn-closed`:
a counter of how many times we saw a connection closed to the cassandra store
* `store.cassandra.error.no-connections`:
a counter of how many times we had no connections remaining to the cassandra store
* `store.cassandra.error.other`:
a counter of other errors talking to the cassandra store
* `store.cassandra.error.timeout`:
a counter of timeouts seen to the cassandra store
* `store.cassandra.error.too-many-timeouts`:
a counter of how many times we saw too many timeouts and closed the connection to the cassandra store
* `store.cassandra.error.unavailable`:
a counter of how many times the cassandra store was unavailable
* `store.cassandra.get.exec`:
the duration of getting from cassandra store
* `store.cassandra.get.wait`:
the duration of the get spent in the queue
* `store.cassandra.get_chunks`:
the duration of how long it takes to get chunks
* `store.cassandra.put.exec`:
the duration of putting in cassandra store
* `store.cassandra.put.wait`:
the duration of a put in the wait queue
* `store.cassandra.rows_per_response`:
how many rows come per get response
* `store.cassandra.to_iter`:
the duration of converting chunks to iterators
* `tank.add_to_saved_chunk`:
points received - by a secondary node - for the most recent chunk when that chunk
has already been saved by a primary. A secondary can add this data to its chunks.
* `add_to_saving_chunk`:
* `tank.add_to_saving_chunk`:
points received - by the primary node - for the most recent chunk
when that chunk is already being saved (or has been saved).
this indicates that your GC is actively sealing chunks and saving them before you have the chance to send
your (infrequent) updates. The primary won't add them to its in-memory chunks, but secondaries will
(because they are never in "saving" state for them), see below.
* `bytes_alloc.incl_freed`:
a counter of total amount of bytes allocated during process lifetime. (incl freed data)
* `bytes_alloc.not_freed`:
a gauge of currently allocated (within the runtime) memory.
it does not include freed data so it drops at every GC run.
* `bytes_sys`:
the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at.
* `cluster.promotion_wait`:
how long a candidate (secondary node) has to wait until it can become a primary
When the timer becomes 0 it means the in-memory buffer has been able to fully populate, so that if you stop a primary
and it was able to save its complete chunks, this node will be able to take over without data loss.
You can upgrade a candidate to primary while the timer is not 0 yet; it just means it may have missing data in the chunks that it will save.
* `gc.heap_objects`:
how many objects are allocated on the heap; it's a key indicator for GC workload
* `gc_metric`:
* `tank.chunk_operations.clear`:
a counter of how many chunks are cleared (replaced by new chunks)
* `tank.chunk_operations.create`:
a counter of how many chunks are created
* `tank.gc_metric`:
the amount of times the metrics GC is about to inspect a metric (series)
* `idx.cassandra.ok`:
how many metrics are successfully being indexed
* `idx.cassandra.fail`:
how many failures were encountered while trying to index metrics
* `metrics_active`:
* `tank.metrics_active`:
the amount of currently known metrics (excl rollup series), measured every second
* `metrics_too_old`:
* `tank.metrics_too_old`:
points that go back in time.
E.g. for any given series, when a point has a timestamp
that is not higher than the timestamp of the last point written for that series.
* `tank.persist`:
how long it takes to persist a chunk (and chunks preceding it)
this is subject to backpressure from the store when the store's queue runs full
* `tank.total_points`:
the number of points currently held in the in-memory ringbuffer
10 changes: 6 additions & 4 deletions idx/cassandra/cassandra.go
@@ -37,11 +37,13 @@ const TableSchema = `CREATE TABLE IF NOT EXISTS %s.metric_idx (
const MetricIdxPartitionIndex = `CREATE INDEX IF NOT EXISTS ON %s.metric_idx(partition)`

var (
// metric idx.cassandra.ok is how many metrics are successfully being indexed
// metric idx.cassandra.add.ok is how many metrics are successfully being indexed
idxCasOk = stats.NewCounter32("idx.cassandra.add.ok")
// metric idx.cassandra.fail is how many failures were encountered while trying to index metrics
idxCasFail = stats.NewCounter32("idx.cassandra.add.fail")
idxCasAddDuration = stats.NewLatencyHistogram15s32("idx.cassandra.add")
// metric idx.cassandra.add.fail is how many failures were encountered while trying to index metrics
idxCasFail = stats.NewCounter32("idx.cassandra.add.fail")
// metric idx.cassandra.add is the duration of additions to the cassandra idx
idxCasAddDuration = stats.NewLatencyHistogram15s32("idx.cassandra.add")
// metric idx.cassandra.delete is the duration of deletions from the cassandra idx
idxCasDeleteDuration = stats.NewLatencyHistogram15s32("idx.cassandra.delete")
errmetrics = cassandra.NewErrMetrics("idx.cassandra")

13 changes: 9 additions & 4 deletions idx/elasticsearch/elasticsearch.go
@@ -21,11 +21,16 @@ import (
)

var (
idxEsOk = stats.NewCounter32("idx.elasticsearch.add.ok")
idxEsFail = stats.NewCounter32("idx.elasticsearch.add.fail")
idxEsAddDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.add")
// metric idx.elasticsearch.add.ok is the number of successful additions to the ES idx
idxEsOk = stats.NewCounter32("idx.elasticsearch.add.ok")
// metric idx.elasticsearch.add.fail is the number of failed additions to the ES idx
idxEsFail = stats.NewCounter32("idx.elasticsearch.add.fail")
// metric idx.elasticsearch.add is the duration of additions to the ES idx
idxEsAddDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.add")
// metric idx.elasticsearch.delete is the duration of deletes from the ES idx
idxEsDeleteDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.delete")
retryBufItems = stats.NewGauge32("idx.elasticsearch.retrybuf.items")
// metric idx.elasticsearch.retrybuf.items is the amount of items currently in the retry buffer
retryBufItems = stats.NewGauge32("idx.elasticsearch.retrybuf.items")

Enabled bool
esIndex string
19 changes: 13 additions & 6 deletions idx/memory/memory.go
@@ -16,12 +16,19 @@ import (
)

var (
idxOk = stats.NewCounter32("idx.memory.add.ok")
idxFail = stats.NewCounter32("idx.memory.add.fail")
idxAddDuration = stats.NewLatencyHistogram15s32("idx.memory.add")
idxGetDuration = stats.NewLatencyHistogram15s32("idx.memory.get")
idxListDuration = stats.NewLatencyHistogram15s32("idx.memory.list")
idxFindDuration = stats.NewLatencyHistogram15s32("idx.memory.find")
// metric idx.memory.add.ok is the number of successful additions to the memory idx
idxOk = stats.NewCounter32("idx.memory.add.ok")
// metric idx.memory.add.fail is the number of failed additions to the memory idx
idxFail = stats.NewCounter32("idx.memory.add.fail")
// metric idx.memory.add is the duration of (successful) memory idx additions
idxAddDuration = stats.NewLatencyHistogram15s32("idx.memory.add")
// metric idx.memory.get is the duration of memory idx gets
idxGetDuration = stats.NewLatencyHistogram15s32("idx.memory.get")
// metric idx.memory.list is the duration of memory idx listings
idxListDuration = stats.NewLatencyHistogram15s32("idx.memory.list")
// metric idx.memory.find is the duration of memory idx find
idxFindDuration = stats.NewLatencyHistogram15s32("idx.memory.find")
// metric idx.memory.delete is the duration of memory idx deletes
idxDeleteDuration = stats.NewLatencyHistogram15s32("idx.memory.delete")

Enabled bool
2 changes: 2 additions & 0 deletions input/input.go
@@ -14,6 +14,8 @@ import (
"gopkg.in/raintank/schema.v1"
)

// TODO: clever way to document all metrics for all different inputs

// Input is a base handler for a metrics packet, aimed to be embedded by concrete implementations
type Input struct {
MetricsPerMessage *stats.Meter32
1 change: 1 addition & 0 deletions mdata/chunk/chunk.go
@@ -8,6 +8,7 @@ import (
"github.com/raintank/metrictank/stats"
)

// metric tank.total_points is the number of points currently held in the in-memory ringbuffer
var totalPoints = stats.NewGauge64("tank.total_points")

// Chunk is a chunk of data. not concurrency safe.
