Merge pull request #449 from raintank/configs

update configs and docs
grafana · Feb 7, 2017 · e726ea5 · e726ea5
2 parents 8b6e6d7 + f0794cf
commit e726ea5
Show file tree

Hide file tree

Showing 18 changed files with 378 additions and 216 deletions.
diff --git a/README.md b/README.md
@@ -56,7 +56,8 @@ So we can do consolidation (combined runtime+archived) accurately and correctly,
 * [Quick start using docker](https://github.com/raintank/metrictank/blob/master/docs/quick-start-docker.md)
 * [Installation guides](https://github.com/raintank/metrictank/blob/master/docs/installation.md)
 * [Configuration](https://github.com/raintank/metrictank/blob/master/docs/config.md)
-* [Data knobs](https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md)
+* [Memory server](https://github.com/raintank/metrictank/blob/master/docs/memory-server.md)
+* [Compression tips](https://github.com/raintank/metrictank/blob/master/docs/compression-tips.md)
 * [Cassandra](https://github.com/raintank/metrictank/blob/master/docs/cassandra.md)
 * [Kafka](https://github.com/raintank/metrictank/blob/master/docs/kafka.md)
 * [Inputs](https://github.com/raintank/metrictank/blob/master/docs/inputs.md)

diff --git a/docker/docker-cluster/docker-compose.yml b/docker/docker-cluster/docker-compose.yml
@@ -40,6 +40,7 @@ services:
       MT_CLUSTER_MODE: multi
       MT_CLUSTER_PEERS: metrictank0,metrictank2,metrictank3
       MT_CLUSTER_BIND_ADDR: "metrictank1:7946"
+      MT_CLUSTER_PRIMARY_NODE: "false"
     links:
      - cassandra
      - metrictank0
@@ -85,6 +86,7 @@ services:
       MT_CLUSTER_MODE: multi
       MT_CLUSTER_PEERS: metrictank0,metrictank1,metrictank2
       MT_CLUSTER_BIND_ADDR: "metrictank3:7946"
+      MT_CLUSTER_PRIMARY_NODE: "false"
     links:
      - cassandra
      - metrictank0

diff --git a/docker/docker-cluster/metrictank.ini b/docker/docker-cluster/metrictank.ini
@@ -8,10 +8,14 @@ accounting-period = 5min
 
 ## data ##
 
-# see https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md for more details
+# see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
+
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
 chunkspan = 10min
-# number of raw chunks to keep in memory. should be at least 1 more than what's needed to satisfy aggregation rules
+# number of raw chunks to keep in in-memory ring buffer
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
 numchunks = 7
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
@@ -29,11 +33,15 @@ gc-interval = 1h
 warm-up-period = 1h
 
 # settings for rollups (aggregation for archives)
-# comma-separated of archive specifications.
+# comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in memory, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in mem, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
+# When running a cluster of metrictank instances, all instances should have the same agg-settings.
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 
 ## metric data storage in cassandra ##
@@ -81,20 +89,13 @@ cassandra-username = cassandra
 # password for authentication
 cassandra-password = cassandra
 
-## Profiling, instrumentation and logging ##
+## Profiling and logging ##
 
 # see https://golang.org/pkg/runtime/#SetBlockProfileRate
 block-profile-rate = 0
 # 0 to disable. 1 for max precision (expensive!) see https://golang.org/pkg/runtime/#pkg-variables
 mem-profile-rate = 524288 # 512*1024
 
-# enable sending statsd messages for instrumentation
-statsd-enabled = true
-# statsd address
-statsd-addr = statsdaemon:8125
-# standard or datadog
-statsd-type = standard
-
 # inspect status frequency. set to 0 to disable
 proftrigger-freq = 1s
 # path to store triggered profiles
@@ -108,6 +109,28 @@ proftrigger-heap-thresh = 25000000000
 # only log log-level and higher. 0=TRACE|1=DEBUG|2=INFO|3=WARN|4=ERROR|5=CRITICAL|6=FATAL
 log-level = 2
 
+# instrumentation stats
+[stats]
+# enable sending graphite messages for instrumentation
+enabled = true
+# stats prefix (will add trailing dot automatically if needed)
+# The default matches what the Grafana dashboard expects
+# $instance will be replaced with the `instance` setting.
+# note, the 3rd word describes the environment you deployed in.
+prefix = metrictank.stats.docker-cluster.$instance
+# graphite address
+addr = localhost:2003
+# interval at which to send statistics
+interval = 1
+# how many messages (holding all measurements from one interval. rule of thumb: a message is ~25kB) to buffer up in case graphite endpoint is unavailable.
+# With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed
+buffer-size = 20000
+
+## chunk cache ##
+[chunk-cache]
+# maximum size of chunk cache in bytes. (1024 ^ 3) * 4 = 4294967296 = 4G
+max-size = 4294967296
+
 ## http api ##
 [http]
 # tcp address for metrictank to bind to for its HTTP interface
@@ -132,6 +155,8 @@ log-min-dur = 5min
 enabled = true
 # tcp address
 addr = :2003
+# represents the "partition" of your data if you decide to partition your data.
+partition = 0
 # needed to know your raw resolution for your metrics. see http://graphite.readthedocs.io/en/latest/config-carbon.html#storage-schemas-conf
 # NOTE: does NOT use aggregation and retention settings from this file.  We use agg-settings and ttl for that.
 schemas-file = /etc/raintank/storage-schemas.conf
@@ -219,10 +244,6 @@ max-in-flight = 200
 
 ## metric metadata index ##
 
-### in-memory
-[memory-idx]
-enabled = false
-
 ### in memory, cassandra-backed
 [cassandra-idx]
 enabled = true
@@ -260,3 +281,7 @@ auth = false
 username = cassandra
 # password for authentication
 password = cassandra
+
+### in-memory only
+[memory-idx]
+enabled = false
diff --git a/docker/docker-dev-custom-cfg-kafka/metrictank.ini b/docker/docker-dev-custom-cfg-kafka/metrictank.ini
@@ -8,11 +8,15 @@ accounting-period = 5min
 
 ## data ##
 
-# see https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md for more details
+# see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
+
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
 chunkspan = 2min
-# number of raw chunks to keep in memory. should be at least 1 more than what's needed to satisfy aggregation rules
-numchunks = 1
+# number of raw chunks to keep in in-memory ring buffer
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
+numchunks = 2
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
 
@@ -31,11 +35,13 @@ warm-up-period = 1h
 # settings for rollups (aggregation for archives)
 # comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in memory, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in mem, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
 # When running a cluster of metrictank instances, all instances should have the same agg-settings.
-# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 
 ## metric data storage in cassandra ##
@@ -111,7 +117,7 @@ enabled = true
 # The default matches what the Grafana dashboard expects
 # $instance will be replaced with the `instance` setting.
 # note, the 3rd word describes the environment you deployed in.
-prefix = metrictank.stats.default.$instance
+prefix = metrictank.stats.docker-dev-custom-cfg-kafka.$instance
 # graphite address
 addr = localhost:2003
 # interval at which to send statistics
@@ -120,6 +126,11 @@ interval = 1
 # With the default of 20k you will use max about 500MB and bridge 5 hours of downtime when needed
 buffer-size = 20000
 
+## chunk cache ##
+[chunk-cache]
+# maximum size of chunk cache in bytes. (1024 ^ 3) * 4 = 4294967296 = 4G
+max-size = 4294967296
+
 ## http api ##
 [http]
 # tcp address for metrictank to bind to for its HTTP interface
@@ -233,10 +244,6 @@ max-in-flight = 200
 
 ## metric metadata index ##
 
-### in-memory
-[memory-idx]
-enabled = false
-
 ### in memory, cassandra-backed
 [cassandra-idx]
 enabled = true
@@ -274,3 +281,7 @@ auth = false
 username = cassandra
 # password for authentication
 password = cassandra
+
+### in-memory only
+[memory-idx]
+enabled = false
diff --git a/docs/compression-tips.md b/docs/compression-tips.md
@@ -0,0 +1,11 @@
+# Compression tips
+
+* values that never - or infrequently - change compress extremely well, so are very cheap to track and store.
+* pay attention to your timestamps, make sure they are evenly spaced. That compresses better.
+* storing values as integers (or more precisely: floats without a fractional part) compresses very well.
+  So it is best to store numbers in the unit that matches your precision.
+  E.g. if you measure latencies such as 0.035 seconds (3 decimals of precision, i.e. ms precision), it's better to
+  track that as the number 35 (milliseconds) instead of 0.035 (seconds).
+
+For more details, see the [go-tsz eval program](https://github.com/dgryski/go-tsz/tree/master/eval) or the 
+[results table](https://raw.githubusercontent.com/dgryski/go-tsz/master/eval/eval-results.png)
diff --git a/docs/config.md b/docs/config.md
@@ -34,11 +34,14 @@ accounting-period = 5min
 ## data ##
 
 ```
-# see https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md for more details
+# see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
-chunkspan = 2h
-# number of raw chunks to keep in memory. should be at least 1 more than what's needed to satisfy aggregation rules
-numchunks = 5
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+chunkspan = 10min
+# number of raw chunks to keep in in-memory ring buffer
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
+numchunks = 7
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
 # max age for a chunk before it is considered stale and persisted to Cassandra
@@ -54,11 +57,13 @@ warm-up-period = 1h
 # settings for rollups (aggregation for archives)
 # comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in memory, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in mem, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
 # When running a cluster of metrictank instances, all instances should have the same agg-settings.
-# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 ```
 
@@ -144,6 +149,14 @@ interval = 1
 buffer-size = 20000
 ```
 
+## chunk cache ##
+
+```
+[chunk-cache]
+# maximum size of chunk cache in bytes. (1024 ^ 3) * 4 = 4294967296 = 4G
+max-size = 4294967296
+```
+
 ## http api ##
 
 ```
@@ -239,7 +252,7 @@ brokers = kafka:9092
 topic = metricpersist
 # kafka partitions to consume. use '*' or a comma separated list of id's. Should match kafka-mdm-in's partitions.
 partitions = *
-# method used for partitioning metrics. This should match the settings of tsdb-gw. One of byOrg|bySeries
+# method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
 offset = last
@@ -272,18 +285,11 @@ max-in-flight = 200
 ```
 
 ## metric metadata index ##
-### in-memory
-
-```
-[memory-idx]
-enabled = true
-```
-
 ### in memory, cassandra-backed
 
 ```
 [cassandra-idx]
-enabled = false
+enabled = true
 # Cassandra keyspace to store metricDefinitions in.
 keyspace = metrictank
 # comma separated list of cassandra addresses in host:port form
@@ -319,3 +325,10 @@ username = cassandra
 # password for authentication
 password = cassandra
 ```
+
+### in-memory only
+
+```
+[memory-idx]
+enabled = false
+```
diff --git a/docs/consolidation.md b/docs/consolidation.md
@@ -108,7 +108,7 @@ must cleanly multiply between one another (why again?)
 try to minimize storage overhead of each band
 
 SPAN CHOICE
-As described in the page [Data knobs](https://github.com/raintank/metrictank/blob/master/docs/data-knobs.md#valid-chunk-spans), only a finite set of values are valid chunk spans. This applies to rolled up chunks as well.
+As described in the page [Memory server](https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans), only a finite set of values are valid chunk spans. This applies to rolled up chunks as well.
 
 RETENTION:
 should at the minimum be maxT otherwise what's the point