Skip to content
This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

Commit

Permalink
improve metrics/logging of recovered errors
Browse files Browse the repository at this point in the history
* add a corruption case metric+log
* standardize the format of all recoverable internal errors
  makes a simpler, cleaner namespace, on which we can easily alert.
* standardize terms:
  - corrupt for internal datastructures
  - invalid for user data
* no need to export vars
  • Loading branch information
Dieterbe committed Oct 13, 2017
1 parent ae9e5d7 commit 760bd06
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 19 deletions.
1 change: 1 addition & 0 deletions docs/CONTRIBUTING
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ When contributing PR's:
(example: `docker/docker-cluster/metrictank.ini` is the same as metrictank-sample.ini except for the options that support the use case of running metrictank in a cluster)
Use `scripts/sync-configs.sh` which helps with the process of updating all configs based on metrictank-sample.ini.
9. PR's will only be merged if all tests pass
10. Any errors which can be recovered from sanely, must do so. And must trigger a high-level recovery metric (see other recovered_errors metrics) and an error message describing the problem.
1 change: 1 addition & 0 deletions docs/operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ If you expect consistent or predictable load, you may also want to monitor:

* `metrictank.stats.$environment.$instance.store.cassandra.chunk_operations.save_ok.counter32`: number of saved chunks (based on your chunkspan settings)
* `metrictank.stats.$environment.$instance.api.request_handle.values.rate32` : rate per second of render requests
* `metrictank.stats.$environment.$instance.recovered_errors.*.*.*` : any internal errors that were recovered from automatically (should be 0. If not, please create an issue)



Expand Down
22 changes: 12 additions & 10 deletions idx/memory/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ func (m *MemoryIdx) indexTags(def *schema.MetricDefinition) {
if len(tagSplits) < 2 {
// should never happen because every tag in the index
// must have a valid format
InvalidTagInIndex.Inc()
invalidTag.Inc()
log.Error(3, "memory-idx: Tag %q of id %q has an invalid format", tag, def.Id)
continue
}
Expand All @@ -181,7 +181,7 @@ func (m *MemoryIdx) indexTags(def *schema.MetricDefinition) {
if err != nil {
// should never happen because all IDs in the index must have
// a valid format
CorruptIndex.Inc()
invalidId.Inc()
log.Error(3, "memory-idx: ID %q has invalid format", def.Id)
continue
}
Expand All @@ -194,6 +194,8 @@ func (m *MemoryIdx) indexTags(def *schema.MetricDefinition) {
func (m *MemoryIdx) deindexTags(def *schema.MetricDefinition) {
tags, ok := m.Tags[def.OrgId]
if !ok {
corruptIndex.Inc()
log.Error(3, "memory-idx: corrupt index. ID %q can't be removed. no tag index for org %d", def.Id, def.OrgId)
return
}

Expand All @@ -202,7 +204,7 @@ func (m *MemoryIdx) deindexTags(def *schema.MetricDefinition) {
if len(tagSplits) < 2 {
// should never happen because every tag in the index
// must have a valid format
InvalidTagInIndex.Inc()
invalidTag.Inc()
log.Error(3, "memory-idx: Tag %q of id %q has an invalid format", tag, def.Id)
continue
}
Expand All @@ -214,7 +216,7 @@ func (m *MemoryIdx) deindexTags(def *schema.MetricDefinition) {
if err != nil {
// should never happen because all IDs in the index must have
// a valid format
CorruptIndex.Inc()
invalidId.Inc()
log.Error(3, "memory-idx: ID %q has invalid format", def.Id)
continue
}
Expand Down Expand Up @@ -399,8 +401,8 @@ func (m *MemoryIdx) Tag(orgId int, tag string, from int64) map[string]uint32 {
if def, ok = m.DefById[id.String()]; !ok {
// should never happen because every ID that is in the tag index
// must be present in the byId lookup table
CorruptIndex.Inc()
log.Error(3, "memory-idx: ID %q is in tag index but not in the byId lookup table", id.String())
corruptIndex.Inc()
log.Error(3, "memory-idx: corrupt. ID %q is in tag index but not in the byId lookup table", id.String())
continue
}

Expand Down Expand Up @@ -663,7 +665,7 @@ func (m *MemoryIdx) delete(orgId int, n *Node, deleteEmptyParents bool) []idx.Ar
for _, child := range n.Children {
node, ok := tree.Items[n.Path+"."+child]
if !ok {
CorruptIndex.Inc()
corruptIndex.Inc()
log.Error(3, "memory-idx: node %s missing. Index is corrupt.", n.Path+"."+child)
continue
}
Expand Down Expand Up @@ -698,7 +700,7 @@ func (m *MemoryIdx) delete(orgId int, n *Node, deleteEmptyParents bool) []idx.Ar
log.Debug("memory-idx: removing %s from branch %s", nodes[i], branch)
bNode, ok := tree.Items[branch]
if !ok {
CorruptIndex.Inc()
corruptIndex.Inc()
log.Error(3, "memory-idx: node %s missing. Index is corrupt.", branch)
continue
}
Expand All @@ -719,13 +721,13 @@ func (m *MemoryIdx) delete(orgId int, n *Node, deleteEmptyParents bool) []idx.Ar
}

if len(bNode.Children) == 0 {
CorruptIndex.Inc()
corruptIndex.Inc()
log.Error(3, "memory-idx: branch %s has no children while trying to delete %s. Index is corrupt", branch, nodes[i])
break
}

if bNode.Children[0] != nodes[i] {
CorruptIndex.Inc()
corruptIndex.Inc()
log.Error(3, "memory-idx: %s not in children list for branch %s. Index is corrupt", nodes[i], branch)
break
}
Expand Down
17 changes: 13 additions & 4 deletions idx/memory/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,18 @@ package memory
import "github.com/grafana/metrictank/stats"

var (
// how many times a corrupt index has been detected
CorruptIndex = stats.NewCounter32("idx.memory.corrupt-index")
// metric recovered_errors.idx.memory.corrupt-index is how many times
// a corruption has been detected in one of the internal index structures
// each time this happens, an error is logged with more details.
corruptIndex = stats.NewCounter32("recovered_errors.idx.memory.corrupt-index")

// how many times an invalid tag has been found inside the index
InvalidTagInIndex = stats.NewCounter32("idx.memory.invalid-tag-in-index")
// metric recovered_errors.idx.memory.invalid-id is how many times
// an invalid metric id is encountered.
// each time this happens, an error is logged with more details.
invalidId = stats.NewCounter32("recovered_errors.idx.memory.invalid-id")

// metric recovered_errors.idx.memory.invalid-tag is how many times
// an invalid tag for a metric is encountered.
// each time this happens, an error is logged with more details.
invalidTag = stats.NewCounter32("recovered_errors.idx.memory.invalid-tag")
)
4 changes: 2 additions & 2 deletions idx/memory/tag_query.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ func (q *TagQuery) filterByMatch(resultSet TagIDs, byId map[string]*idx.Archive,
if def, ok = byId[id.String()]; !ok {
// should never happen because every ID in the tag index
// must be present in the byId lookup table
CorruptIndex.Inc()
corruptIndex.Inc()
log.Error(3, "memory-idx: ID %q is in tag index but not in the byId lookup table", id.String())
delete(resultSet, id)
continue IDS
Expand Down Expand Up @@ -360,7 +360,7 @@ func (q *TagQuery) filterByFrom(resultSet TagIDs, byId map[string]*idx.Archive)
if def, ok = byId[id.String()]; !ok {
// should never happen because every ID in the tag index
// must be present in the byId lookup table
CorruptIndex.Inc()
corruptIndex.Inc()
log.Error(3, "memory-idx: ID %q is in tag index but not in the byId lookup table", id.String())
delete(resultSet, id)
continue
Expand Down
4 changes: 2 additions & 2 deletions mdata/cache/ccache.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ import (
)

var (
maxSize uint64
cacheMetricBug = stats.NewCounter32("cache.ops.metric.searchForward-bug-surpressed")
maxSize uint64
searchFwdBug = stats.NewCounter32("recovered_errors.cache.metric.searchForwardBug")
)

func init() {
Expand Down
2 changes: 1 addition & 1 deletion mdata/cache/ccache_metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ func (mc *CCacheMetric) searchForward(ctx context.Context, metric string, from,
log.Warn("CCacheMetric: suspected bug suppressed. searchForward(%q, %d, %d, res) ts is %d while Next is %d", metric, from, until, ts, mc.chunks[ts].Next)
span := opentracing.SpanFromContext(ctx)
span.SetTag("searchForwardBug", true)
cacheMetricBug.Inc()
searchFwdBug.Inc()
break
}
}
Expand Down

0 comments on commit 760bd06

Please sign in to comment.