Merge pull request #861 from grafana/issue-860

when duration kafka offset fails, fall back to oldest
grafana · Mar 7, 2018 · c9cd6a6 · c9cd6a6
2 parents 87ae555 + bff3193
commit c9cd6a6
Show file tree

Hide file tree

Showing 9 changed files with 43 additions and 12 deletions.
diff --git a/docker/docker-chaos/metrictank.ini b/docker/docker-chaos/metrictank.ini
@@ -11,7 +11,7 @@ instance = default
 drop-first-chunk = false
 # max age for a chunk before to be considered stale and to be persisted to Cassandra
 chunk-max-stale = 1h
-# max age for a metric before to be considered stale and to be purged from memory
+# max age for a metric before it is considered stale and purged from the in-memory ring buffer.
 metric-max-stale = 6h
 # Interval to run garbage collection job
 gc-interval = 1h
@@ -146,6 +146,8 @@ fallback-graphite-addr = http://graphite
 log-min-dur = 5min
 # timezone for interpreting from/until values when needed, specified using [zoneinfo name](https://en.wikipedia.org/wiki/Tz_database#Names_of_time_zones) e.g. 'America/New_York', 'UTC' or 'local' to use local server timezone.
 time-zone = local
+# maximum number of concurrent threads for fetching data on the local node. Each thread handles a single series.
+get-targets-concurrency = 20
 # default limit for tagdb query results, can be overridden with query parameter "limit"
 tagdb-default-limit = 100
 
@@ -175,6 +177,7 @@ brokers = kafka:9092
 # kafka topic (may be given multiple times as a comma-separated list)
 topics = mdm
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
 # the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data
 offset = last
 # kafka partitions to consume. use '*' or a comma separated list of id's
@@ -273,6 +276,8 @@ partitions = *
 # method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
+# Should match your kafka-mdm-in setting
 offset = last
 # save interval for offsets
 offset-commit-interval = 5s

diff --git a/docker/docker-cluster/metrictank.ini b/docker/docker-cluster/metrictank.ini
@@ -11,7 +11,7 @@ instance = default
 drop-first-chunk = false
 # max age for a chunk before to be considered stale and to be persisted to Cassandra
 chunk-max-stale = 1h
-# max age for a metric before to be considered stale and to be purged from memory
+# max age for a metric before it is considered stale and purged from the in-memory ring buffer.
 metric-max-stale = 6h
 # Interval to run garbage collection job
 gc-interval = 1h
@@ -177,6 +177,7 @@ brokers = kafka:9092
 # kafka topic (may be given multiple times as a comma-separated list)
 topics = mdm
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
 # the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data
 offset = last
 # kafka partitions to consume. use '*' or a comma separated list of id's
@@ -275,6 +276,8 @@ partitions = *
 # method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
+# Should match your kafka-mdm-in setting
 offset = last
 # save interval for offsets
 offset-commit-interval = 5s

diff --git a/docker/docker-dev-custom-cfg-kafka/metrictank.ini b/docker/docker-dev-custom-cfg-kafka/metrictank.ini
@@ -11,7 +11,7 @@ instance = default
 drop-first-chunk = false
 # max age for a chunk before to be considered stale and to be persisted to Cassandra
 chunk-max-stale = 1h
-# max age for a metric before to be considered stale and to be purged from memory
+# max age for a metric before it is considered stale and purged from the in-memory ring buffer.
 metric-max-stale = 6h
 # Interval to run garbage collection job
 gc-interval = 1h
@@ -177,6 +177,7 @@ brokers = kafka:9092
 # kafka topic (may be given multiple times as a comma-separated list)
 topics = mdm
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
 # the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data
 offset = last
 # kafka partitions to consume. use '*' or a comma separated list of id's
@@ -275,6 +276,8 @@ partitions = *
 # method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
+# Should match your kafka-mdm-in setting
 offset = last
 # save interval for offsets
 offset-commit-interval = 5s

diff --git a/docs/config.md b/docs/config.md
@@ -226,6 +226,7 @@ brokers = kafka:9092
 # kafka topic (may be given multiple times as a comma-separated list)
 topics = mdm
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
 # the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data
 offset = last
 # kafka partitions to consume. use '*' or a comma separated list of id's
@@ -333,6 +334,8 @@ partitions = *
 # method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
+# Should match your kafka-mdm-in setting
 offset = last
 # save interval for offsets
 offset-commit-interval = 5s

diff --git a/input/kafkamdm/kafkamdm.go b/input/kafkamdm/kafkamdm.go
@@ -210,12 +210,16 @@ func (k *KafkaMdm) Start(handler input.Handler, fatal chan struct{}) error {
 				offset = sarama.OffsetNewest
 			case "last":
 				offset, err = offsetMgr.Last(topic, partition)
+				if err != nil {
+					log.Error(4, "kafka-mdm: Failed to get %q duration offset for %s:%d. %q", offsetStr, topic, partition, err)
+					return err
+				}
 			default:
 				offset, err = k.client.GetOffset(topic, partition, time.Now().Add(-1*offsetDuration).UnixNano()/int64(time.Millisecond))
-			}
-			if err != nil {
-				log.Error(4, "kafka-mdm: Failed to get %q duration offset for %s:%d. %q", offsetStr, topic, partition, err)
-				return err
+				if err != nil {
+					offset = sarama.OffsetOldest
+					log.Warn("kafka-mdm failed to get offset %s: %s -> will use oldest instead", offsetDuration, err)
+				}
 			}
 			k.wg.Add(1)
 			go k.consumePartition(topic, partition, offset)

diff --git a/mdata/notifierKafka/notifierKafka.go b/mdata/notifierKafka/notifierKafka.go
@@ -88,11 +88,15 @@ func (c *NotifierKafka) start() {
 			offset = -1
 		case "last":
 			offset, err = c.offsetMgr.Last(topic, partition)
+			if err != nil {
+				log.Fatal(4, "kafka-cluster: Failed to get %q duration offset for %s:%d. %q", offsetStr, topic, partition, err)
+			}
 		default:
 			offset, err = c.client.GetOffset(topic, partition, time.Now().Add(-1*offsetDuration).UnixNano()/int64(time.Millisecond))
-		}
-		if err != nil {
-			log.Fatal(4, "kafka-cluster: Failed to get %q duration offset for %s:%d. %q", offsetStr, topic, partition, err)
+			if err != nil {
+				offset = sarama.OffsetOldest
+				log.Warn("kafka-cluster failed to get offset %s: %s -> will use oldest instead", offsetDuration, err)
+			}
 		}
 		partitionLogSize[partition].Set(int(bootTimeOffsets[partition]))
 		if offset >= 0 {

diff --git a/metrictank-sample.ini b/metrictank-sample.ini
@@ -180,6 +180,7 @@ brokers = kafka:9092
 # kafka topic (may be given multiple times as a comma-separated list)
 topics = mdm
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
 # the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data
 offset = last
 # kafka partitions to consume. use '*' or a comma separated list of id's
@@ -278,6 +279,8 @@ partitions = *
 # method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
+# Should match your kafka-mdm-in setting
 offset = last
 # save interval for offsets
 offset-commit-interval = 5s

diff --git a/scripts/config/metrictank-docker.ini b/scripts/config/metrictank-docker.ini
@@ -11,7 +11,7 @@ instance = default
 drop-first-chunk = false
 # max age for a chunk before to be considered stale and to be persisted to Cassandra
 chunk-max-stale = 1h
-# max age for a metric before to be considered stale and to be purged from memory
+# max age for a metric before it is considered stale and purged from the in-memory ring buffer.
 metric-max-stale = 6h
 # Interval to run garbage collection job
 gc-interval = 1h
@@ -177,6 +177,7 @@ brokers = kafka:9092
 # kafka topic (may be given multiple times as a comma-separated list)
 topics = mdm
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
 # the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data
 offset = last
 # kafka partitions to consume. use '*' or a comma separated list of id's
@@ -275,6 +276,8 @@ partitions = *
 # method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
+# Should match your kafka-mdm-in setting
 offset = last
 # save interval for offsets
 offset-commit-interval = 5s

diff --git a/scripts/config/metrictank-package.ini b/scripts/config/metrictank-package.ini
@@ -11,7 +11,7 @@ instance = default
 drop-first-chunk = false
 # max age for a chunk before to be considered stale and to be persisted to Cassandra
 chunk-max-stale = 1h
-# max age for a metric before to be considered stale and to be purged from memory
+# max age for a metric before it is considered stale and purged from the in-memory ring buffer.
 metric-max-stale = 6h
 # Interval to run garbage collection job
 gc-interval = 1h
@@ -177,6 +177,7 @@ brokers = localhost:9092
 # kafka topic (may be given multiple times as a comma-separated list)
 topics = mdm
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
 # the further back in time you go, the more old data you can load into metrictank, but the longer it takes to catch up to realtime data
 offset = last
 # kafka partitions to consume. use '*' or a comma separated list of id's
@@ -275,6 +276,8 @@ partitions = *
 # method used for partitioning metrics. This should match the settings of tsdb-gw. (byOrg|bySeries)
 partition-scheme = bySeries
 # offset to start consuming from. Can be one of newest, oldest, last or a time duration
+# When using a duration but the offset request fails (e.g. Kafka doesn't have data so far back), metrictank falls back to `oldest`.
+# Should match your kafka-mdm-in setting
 offset = last
 # save interval for offsets
 offset-commit-interval = 5s