This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

update metrics descriptions
Dieterbe committed Dec 30, 2016
1 parent 2080c3c commit ec47883
Showing 13 changed files with 260 additions and 67 deletions.
17 changes: 12 additions & 5 deletions api/api.go
@@ -19,13 +19,20 @@ import (
var LogLevel int

var (
getTargetDuration = stats.NewLatencyHistogram15s32("api.get_target")
// metric api.get_target is how long it takes to get a target
getTargetDuration = stats.NewLatencyHistogram15s32("api.get_target")

// metric api.iters_to_points is how long it takes to decode points from a chunk iterator
itersToPointsDuration = stats.NewLatencyHistogram15s32("api.iters_to_points")
// just 1 global timer of request handling time. includes mem/cassandra gets, chunk decode/iters, json building etc
// there is such a thing as too many metrics. we have this, and cassandra timings, that should be enough for realtime profiling

// metric api.request_handle is how long it takes to handle a render request
reqHandleDuration = stats.NewLatencyHistogram15s32("api.request_handle")
reqSpanBoth = stats.NewMeter32("api.requests_span.mem_and_cassandra", false)
reqSpanMem = stats.NewMeter32("api.requests_span.mem", false)

// metric api.requests_span.mem_and_cassandra is the timerange of requests hitting both in-memory and cassandra
reqSpanBoth = stats.NewMeter32("api.requests_span.mem_and_cassandra", false)

// metric api.requests_span.mem is the timerange of requests hitting only the ringbuffer
reqSpanMem = stats.NewMeter32("api.requests_span.mem", false)
)

type Server struct {
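As a rough illustration of how the single request timer described above could be fed (a hedged sketch, not part of this commit: `renderSketch` is hypothetical, and it assumes the histogram returned by `stats.NewLatencyHistogram15s32` exposes a `Value(time.Duration)` method):

```go
package api

import (
	"net/http"
	"time"
)

// renderSketch wraps a render handler and records total handling time in
// reqHandleDuration, matching the "1 global timer" comment above: index
// lookups, chunk fetch/decode and JSON encoding all fall inside the span.
func renderSketch(inner http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		pre := time.Now()
		inner(w, r)
		reqHandleDuration.Value(time.Since(pre)) // assumed Value(time.Duration) API
	}
}
```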
40 changes: 34 additions & 6 deletions cassandra/metrics.go
@@ -20,13 +20,41 @@ type ErrMetrics struct {

func NewErrMetrics(component string) ErrMetrics {
return ErrMetrics{
cassErrTimeout: stats.NewCounter32(fmt.Sprintf("%s.error.timeout", component)),
cassErrTooManyTimeouts: stats.NewCounter32(fmt.Sprintf("%s.error.too-many-timeouts", component)),
cassErrConnClosed: stats.NewCounter32(fmt.Sprintf("%s.error.conn-closed", component)),
cassErrNoConns: stats.NewCounter32(fmt.Sprintf("%s.error.no-connections", component)),
cassErrUnavailable: stats.NewCounter32(fmt.Sprintf("%s.error.unavailable", component)),

// metric idx.cassandra.error.timeout is a counter of timeouts seen to the cassandra idx

// metric store.cassandra.error.timeout is a counter of timeouts seen to the cassandra store
cassErrTimeout: stats.NewCounter32(fmt.Sprintf("%s.error.timeout", component)),

// metric idx.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra idx

// metric store.cassandra.error.too-many-timeouts is a counter of how many times we saw too many timeouts and closed the connection to the cassandra store
cassErrTooManyTimeouts: stats.NewCounter32(fmt.Sprintf("%s.error.too-many-timeouts", component)),

// metric idx.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra idx

// metric store.cassandra.error.conn-closed is a counter of how many times we saw a connection closed to the cassandra store
cassErrConnClosed: stats.NewCounter32(fmt.Sprintf("%s.error.conn-closed", component)),

// metric idx.cassandra.error.no-connections is a counter of how many times we had no connections remaining to the cassandra idx

// metric store.cassandra.error.no-connections is a counter of how many times we had no connections remaining to the cassandra store
cassErrNoConns: stats.NewCounter32(fmt.Sprintf("%s.error.no-connections", component)),

// metric idx.cassandra.error.unavailable is a counter of how many times the cassandra idx was unavailable

// metric store.cassandra.error.unavailable is a counter of how many times the cassandra store was unavailable
cassErrUnavailable: stats.NewCounter32(fmt.Sprintf("%s.error.unavailable", component)),

// metric idx.cassandra.error.cannot-achieve-consistency is a counter of the cassandra idx not being able to achieve consistency for a given query

// metric store.cassandra.error.cannot-achieve-consistency is a counter of the cassandra store not being able to achieve consistency for a given query
cassErrCannotAchieveConsistency: stats.NewCounter32(fmt.Sprintf("%s.error.cannot-achieve-consistency", component)),
cassErrOther: stats.NewCounter32(fmt.Sprintf("%s.error.other", component)),

// metric idx.cassandra.error.other is a counter of other errors talking to the cassandra idx

// metric store.cassandra.error.other is a counter of other errors talking to the cassandra store
cassErrOther: stats.NewCounter32(fmt.Sprintf("%s.error.other", component)),
}
}
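NewErrMetrics is instantiated once per component, which is why every field above carries two comment lines. A sketch of the two call sites the comments imply (the import path and variable names are assumptions for illustration; the idx.cassandra call also appears further down in this commit):

```go
package example

import "github.com/raintank/metrictank/cassandra"

var (
	// feeds the idx.cassandra.error.* counters for the cassandra index
	idxErrMetrics = cassandra.NewErrMetrics("idx.cassandra")
	// feeds the store.cassandra.error.* counters for the cassandra chunk store
	storeErrMetrics = cassandra.NewErrMetrics("store.cassandra")
)
```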

164 changes: 141 additions & 23 deletions docs/metrics.md
@@ -1,38 +1,156 @@
# Overview of metrics
(only shows metrics that are documented; generated with [metrics2docs](github.com/Dieterbe/metrics2docs))
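The entries below are generated from one-line comments next to the metric declarations. As a hedged sketch of that convention (inferred from the diffs in this commit, not from metrics2docs itself; the `foo.requests` metric and import path are purely illustrative):

```go
package example

import "github.com/raintank/metrictank/stats"

var (
	// metric foo.requests is a counter of requests handled by the hypothetical foo component
	fooRequests = stats.NewCounter32("foo.requests")
)
```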

* `add_to_saved_chunk`:
* `api.get_target`:
how long it takes to get a target
* `api.iters_to_points`:
how long it takes to decode points from a chunk iterator
* `api.request_handle`:
how long it takes to handle a render request
* `api.requests_span.mem`:
the timerange of requests hitting only the ringbuffer
* `api.requests_span.mem_and_cassandra`:
the timerange of requests hitting both in-memory and cassandra
* `cluster.notifier.kafka.message_size`:
the sizes seen of messages through the kafka cluster notifier
* `cluster.notifier.kafka.messages-published`:
a counter of messages published to the kafka cluster notifier
* `cluster.notifier.nsq.message_size`:
the sizes seen of messages through the nsq cluster notifier
* `cluster.notifier.nsq.messages-published`:
a counter of messages published to the nsq cluster notifier
* `cluster.promotion_wait`:
how long a candidate (secondary node) has to wait until it can become a primary
When the timer becomes 0 it means the in-memory buffer has been able to fully populate, so that if you stop a primary
and it was able to save its complete chunks, this node will be able to take over without data loss.
You can upgrade a candidate to primary while the timer is not 0 yet; it just means it may have missing data in the chunks that it will save.
* `idx.cassandra.add.ok`:
how many metrics are successfully being indexed
* `idx.cassandra.add`:
the duration of additions to the cassandra idx
* `idx.cassandra.add.fail`:
how many failures were encountered while trying to index metrics
* `idx.cassandra.delete`:
the duration of deletions from the cassandra idx
* `idx.cassandra.error.cannot-achieve-consistency`:
a counter of the cassandra idx not being able to achieve consistency for a given query
* `idx.cassandra.error.conn-closed`:
a counter of how many times we saw a connection closed to the cassandra idx
* `idx.cassandra.error.no-connections`:
a counter of how many times we had no connections remaining to the cassandra idx
* `idx.cassandra.error.other`:
a counter of other errors talking to the cassandra idx
* `idx.cassandra.error.timeout`:
a counter of timeouts seen to the cassandra idx
* `idx.cassandra.error.too-many-timeouts`:
a counter of how many times we saw too many timeouts and closed the connection to the cassandra idx
* `idx.cassandra.error.unavailable`:
a counter of how many times the cassandra idx was unavailable
* `idx.elasticsearch.add`:
the duration of additions to the ES idx
* `idx.elasticsearch.add.fail`:
the number of failed additions to the ES idx
* `idx.elasticsearch.add.ok`:
the number of successful additions to the ES idx
* `idx.elasticsearch.delete`:
the duration of deletes from the ES idx
* `idx.elasticsearch.retrybuf.items`:
the amount of items currently in the retry buffer
* `idx.memory.add`:
the duration of (successful) memory idx additions
* `idx.memory.add.fail`:
the number of failed additions to the memory idx
* `idx.memory.add.ok`:
the number of successful additions to the memory idx
* `idx.memory.delete`:
the duration of memory idx deletes
* `idx.memory.find`:
the duration of memory idx find
* `idx.memory.get`:
the duration of memory idx gets
* `idx.memory.list`:
the duration of memory idx listings
* `mem.to_iter`:
how long it takes to transform in-memory chunks to iterators
* `memory.bytes.obtained_from_sys`:
the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at.
* `memory.bytes_allocated_on_heap`:
a gauge of currently allocated (within the runtime) memory.
* `memory.gc.cpu_fraction`:
how much cpu is consumed by the GC across process lifetime, in per mille
* `memory.gc.heap_objects`:
how many objects are allocated on the heap; it's a key indicator for GC workload
* `memory.gc.last_duration`:
the duration of the last GC STW pause in nanoseconds
* `memory.total_bytes_allocated`:
a counter of total amount of bytes allocated during process lifetime
* `memory.total_gc_cycles`:
a counter of the number of GC cycles since process start
* `metric_invalid`:
a count of times a metric did not validate
* `metrics_decode_err`:
a count of times an input message (MetricData, MetricDataArray or carbon line) failed to parse
* `store.cassandra.chunk_operations.save_fail`:
counter of failed saves
* `store.cassandra.chunk_operations.save_ok`:
counter of successful saves
* `store.cassandra.chunk_size.at_load`:
the sizes of chunks seen when loading them
* `store.cassandra.chunk_size.at_save`:
the sizes of chunks seen when saving them
* `store.cassandra.chunks_per_row`:
how many chunks are retrieved per row in get queries
* `store.cassandra.error.cannot-achieve-consistency`:
a counter of the cassandra store not being able to achieve consistency for a given query
* `store.cassandra.error.conn-closed`:
a counter of how many times we saw a connection closed to the cassandra store
* `store.cassandra.error.no-connections`:
a counter of how many times we had no connections remaining to the cassandra store
* `store.cassandra.error.other`:
a counter of other errors talking to the cassandra store
* `store.cassandra.error.timeout`:
a counter of timeouts seen to the cassandra store
* `store.cassandra.error.too-many-timeouts`:
a counter of how many times we saw too many timeouts and closed the connection to the cassandra store
* `store.cassandra.error.unavailable`:
a counter of how many times the cassandra store was unavailable
* `store.cassandra.get.exec`:
the duration of getting from cassandra store
* `store.cassandra.get.wait`:
the duration of the get spent in the queue
* `store.cassandra.get_chunks`:
the duration of how long it takes to get chunks
* `store.cassandra.put.exec`:
the duration of putting in cassandra store
* `store.cassandra.put.wait`:
the duration of a put in the wait queue
* `store.cassandra.rows_per_response`:
how many rows come per get response
* `store.cassandra.to_iter`:
the duration of converting chunks to iterators
* `tank.add_to_saved_chunk`:
points received - by a secondary node - for the most recent chunk when that chunk
has already been saved by a primary. A secondary can add this data to its chunks.
* `add_to_saving_chunk`:
* `tank.add_to_saving_chunk`:
points received - by the primary node - for the most recent chunk
when that chunk is already being saved (or has been saved).
this indicates that your GC is actively sealing chunks and saving them before you have the chance to send
your (infrequent) updates. The primary won't add them to its in-memory chunks, but secondaries will
(because they are never in "saving" state for them), see below.
* `bytes_alloc.incl_freed`:
a counter of total amount of bytes allocated during process lifetime. (incl freed data)
* `bytes_alloc.not_freed`:
a gauge of currently allocated (within the runtime) memory.
it does not include freed data so it drops at every GC run.
* `bytes_sys`:
the amount of bytes currently obtained from the system by the process. This is what the profiletrigger looks at.
* `cluster.promotion_wait`:
how long a candidate (secondary node) has to wait until it can become a primary
When the timer becomes 0 it means the in-memory buffer has been able to fully populate, so that if you stop a primary
and it was able to save its complete chunks, this node will be able to take over without data loss.
You can upgrade a candidate to primary while the timer is not 0 yet; it just means it may have missing data in the chunks that it will save.
* `gc.heap_objects`:
how many objects are allocated on the heap; it's a key indicator for GC workload
* `gc_metric`:
* `tank.chunk_operations.clear`:
a counter of how many chunks are cleared (replaced by new chunks)
* `tank.chunk_operations.create`:
a counter of how many chunks are created
* `tank.gc_metric`:
the amount of times the metrics GC is about to inspect a metric (series)
* `idx.cassandra.ok`:
how many metrics are successfully being indexed
* `idx.cassandra.fail`:
how many failures were encountered while trying to index metrics
* `metrics_active`:
* `tank.metrics_active`:
the amount of currently known metrics (excl rollup series), measured every second
* `metrics_too_old`:
* `tank.metrics_too_old`:
points that go back in time.
E.g. for any given series, when a point has a timestamp
that is not higher than the timestamp of the last point written for that series.
* `tank.persist`:
how long it takes to persist a chunk (and chunks preceding it)
this is subject to backpressure from the store when the store's queue runs full
* `tank.total_points`:
the number of points currently held in the in-memory ringbuffer
10 changes: 6 additions & 4 deletions idx/cassandra/cassandra.go
@@ -37,11 +37,13 @@ const TableSchema = `CREATE TABLE IF NOT EXISTS %s.metric_idx (
const MetricIdxPartitionIndex = `CREATE INDEX IF NOT EXISTS ON %s.metric_idx(partition)`

var (
// metric idx.cassandra.ok is how many metrics are successfully being indexed
// metric idx.cassandra.add.ok is how many metrics are successfully being indexed
idxCasOk = stats.NewCounter32("idx.cassandra.add.ok")
// metric idx.cassandra.fail is how many failures were encountered while trying to index metrics
idxCasFail = stats.NewCounter32("idx.cassandra.add.fail")
idxCasAddDuration = stats.NewLatencyHistogram15s32("idx.cassandra.add")
// metric idx.cassandra.add.fail is how many failures were encountered while trying to index metrics
idxCasFail = stats.NewCounter32("idx.cassandra.add.fail")
// metric idx.cassandra.add is the duration of additions to the cassandra idx
idxCasAddDuration = stats.NewLatencyHistogram15s32("idx.cassandra.add")
// metric idx.cassandra.delete is the duration of deletions from the cassandra idx
idxCasDeleteDuration = stats.NewLatencyHistogram15s32("idx.cassandra.delete")
errmetrics = cassandra.NewErrMetrics("idx.cassandra")

13 changes: 9 additions & 4 deletions idx/elasticsearch/elasticsearch.go
@@ -21,11 +21,16 @@ import (
)

var (
idxEsOk = stats.NewCounter32("idx.elasticsearch.add.ok")
idxEsFail = stats.NewCounter32("idx.elasticsearch.add.fail")
idxEsAddDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.add")
// metric idx.elasticsearch.add.ok is the number of successful additions to the ES idx
idxEsOk = stats.NewCounter32("idx.elasticsearch.add.ok")
// metric idx.elasticsearch.add.fail is the number of failed additions to the ES idx
idxEsFail = stats.NewCounter32("idx.elasticsearch.add.fail")
// metric idx.elasticsearch.add is the duration of additions to the ES idx
idxEsAddDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.add")
// metric idx.elasticsearch.delete is the duration of deletes from the ES idx
idxEsDeleteDuration = stats.NewLatencyHistogram15s32("idx.elasticsearch.delete")
retryBufItems = stats.NewGauge32("idx.elasticsearch.retrybuf.items")
// metric idx.elasticsearch.retrybuf.items is the amount of items currently in the retry buffer
retryBufItems = stats.NewGauge32("idx.elasticsearch.retrybuf.items")

Enabled bool
esIndex string
19 changes: 13 additions & 6 deletions idx/memory/memory.go
@@ -16,12 +16,19 @@ import (
)

var (
idxOk = stats.NewCounter32("idx.memory.add.ok")
idxFail = stats.NewCounter32("idx.memory.add.fail")
idxAddDuration = stats.NewLatencyHistogram15s32("idx.memory.add")
idxGetDuration = stats.NewLatencyHistogram15s32("idx.memory.get")
idxListDuration = stats.NewLatencyHistogram15s32("idx.memory.list")
idxFindDuration = stats.NewLatencyHistogram15s32("idx.memory.find")
// metric idx.memory.add.ok is the number of successful additions to the memory idx
idxOk = stats.NewCounter32("idx.memory.add.ok")
// metric idx.memory.add.fail is the number of failed additions to the memory idx
idxFail = stats.NewCounter32("idx.memory.add.fail")
// metric idx.memory.add is the duration of (successful) memory idx additions
idxAddDuration = stats.NewLatencyHistogram15s32("idx.memory.add")
// metric idx.memory.get is the duration of memory idx gets
idxGetDuration = stats.NewLatencyHistogram15s32("idx.memory.get")
// metric idx.memory.list is the duration of memory idx listings
idxListDuration = stats.NewLatencyHistogram15s32("idx.memory.list")
// metric idx.memory.find is the duration of memory idx find
idxFindDuration = stats.NewLatencyHistogram15s32("idx.memory.find")
// metric idx.memory.delete is the duration of memory idx deletes
idxDeleteDuration = stats.NewLatencyHistogram15s32("idx.memory.delete")

Enabled bool
2 changes: 2 additions & 0 deletions input/input.go
@@ -14,6 +14,8 @@ import (
"gopkg.in/raintank/schema.v1"
)

// TODO: clever way to document all metrics for all different inputs

// Input is a base handler for a metrics packet, aimed to be embedded by concrete implementations
type Input struct {
MetricsPerMessage *stats.Meter32
1 change: 1 addition & 0 deletions mdata/chunk/chunk.go
@@ -8,6 +8,7 @@ import (
"github.com/raintank/metrictank/stats"
)

// metric tank.total_points is the number of points currently held in the in-memory ringbuffer
var totalPoints = stats.NewGauge64("tank.total_points")

// Chunk is a chunk of data. not concurrency safe.
