From 67446d336f48b14aa97df96e0f0644cd79147b28 Mon Sep 17 00:00:00 2001 From: bom-d-van Date: Tue, 11 Jan 2022 09:25:07 +0100 Subject: [PATCH 1/2] trie: stop indexing empty directory nodes There is a strange bug in trie that seems related to indexing empty directory nodes. The actual root cause is still unknown to me and I fail to reproduce the issue. In our production, we notice the issue associates with a cron job for removing files (which would causes the generation of empty directories). Considering that empty directories doesnt produce much value in go-carbon/graphite, excluding it from index should cause no harm. And this change also appears to resolve the issue on our production. --- carbonserver/carbonserver.go | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/carbonserver/carbonserver.go b/carbonserver/carbonserver.go index 546e6d223..59815e0b8 100644 --- a/carbonserver/carbonserver.go +++ b/carbonserver/carbonserver.go @@ -990,21 +990,27 @@ func (listener *CarbonserverListener) updateFileList(dir string, cacheMetricName delete(cacheMetricNames, trimmedName) } else { if listener.trieIndex { - var dataPoints int64 - if isFullMetric && listener.estimateSize != nil { - m := strings.ReplaceAll(trimmedName, "/", ".") - m = m[1 : len(m)-4] - _, dataPoints = listener.estimateSize(m) - } - - var physicalSize = info.Size() - if stat, ok := info.Sys().(*syscall.Stat_t); ok { - physicalSize = stat.Blocks * 512 - } - - if err := trieIdx.insert(trimmedName, info.Size(), physicalSize, dataPoints); err != nil { - // It's better to just log an error than stop indexing - listener.logTrieInsertError(logger, "updateFileList.trie: failed to index path", trimmedName, err) + // WHY: + // * this would only affects empty directories + // * indexing empty directories causes an strange bug in trie index + // * empty dir isn't useful (at least most of the time)? + if isFullMetric { + var dataPoints int64 + if listener.estimateSize != nil { + m := strings.ReplaceAll(trimmedName, "/", ".") + m = m[1 : len(m)-4] + _, dataPoints = listener.estimateSize(m) + } + + var physicalSize = info.Size() + if stat, ok := info.Sys().(*syscall.Stat_t); ok { + physicalSize = stat.Blocks * 512 + } + + if err := trieIdx.insert(trimmedName, info.Size(), physicalSize, dataPoints); err != nil { + // It's better to just log an error than stop indexing + listener.logTrieInsertError(logger, "updateFileList.trie: failed to index path", trimmedName, err) + } } } else { files = append(files, trimmedName) From 7d1ce7c46d9945a7b132cce31e6ae0c68871b452 Mon Sep 17 00:00:00 2001 From: bom-d-van Date: Wed, 12 Jan 2022 15:07:38 +0100 Subject: [PATCH 2/2] trie: fix empty directory related indexing and query bugs There are two bugs fixed in this commit: 1. Empty directories not properly renewed during indexing (and could be trimmed at every two file list scans). During insert, trie.insert failed to bump up trie nodes properly if it's inserting directory. This means that if the directory being inserted is empty and already exists in the trie tree, when concurrent index is enabled, the directory nodes might be pruned and then re-inserted in the next file list scan. And then on and on. This issue it self is not a serious concern if bug #2 below doesn't exist. 2. Query logics do not handle well for empty directories. The symptoms are that if the trie index tree contains empty directories, and if a query happens to matching it, it would causes the query state stack (matchers) jump to the wrong index, and lead to incorrect matches of the metrics. Context: On our Graphite production, for cleanup purpose, we have daily cron jobs removing stale graphite metric files and empty directories in the whisper tree. The cronjob that removes files is run at 5AM and empty directory removal at 7AM. This means the above bugs have a time window of 1-2 hours being triggered. And because empty directories are not handled properly due to bug #1, the issue is triggered every 2 file list scans. Essentially, if the query matched an empty directory node and there are other metrics listed after the empty directory path, trie query can't return proper result. In our case, it's returning missing results, in theory, it could also return incorrect results. In theory, these fixes are not needed because we stop indexing empty directory nodes in commit 67446d336f48b14aa97df96e0f0644cd79147b28. But it's nice to figure out the root cause and resolve the issue properly! Took me almost a week to figure it out! But I'm happy. Tears in rain. --- carbonserver/carbonserver.go | 1 + carbonserver/trie.go | 4 +++- carbonserver/trie_test.go | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/carbonserver/carbonserver.go b/carbonserver/carbonserver.go index 59815e0b8..64b69e6c4 100644 --- a/carbonserver/carbonserver.go +++ b/carbonserver/carbonserver.go @@ -975,6 +975,7 @@ func (listener *CarbonserverListener) updateFileList(dir string, cacheMetricName trimmedName := strings.TrimPrefix(p, listener.whisperData) filesLen++ if flc != nil { + // TODO: include metadata like physical/logical size, data points if err := flc.write(trimmedName); err != nil { logger.Error("failed to write to file list cache", zap.Error(err)) if err := flc.close(); err != nil { diff --git a/carbonserver/trie.go b/carbonserver/trie.go index b4a04a26c..62d7e9cc8 100644 --- a/carbonserver/trie.go +++ b/carbonserver/trie.go @@ -633,6 +633,7 @@ outer: if !isFile { if cur.dir() { + cur.gen = ti.root.gen return nil } @@ -640,6 +641,7 @@ outer: for _, child := range *cur.childrens { if child.dir() { cur = child + cur.gen = ti.root.gen newDir = false break } @@ -745,7 +747,7 @@ func (ti *trieIndex) query(expr string, limit int, expand func(globs []string) ( } if cur.dir() { - if mindex+1 >= len(matchers) || !curm.dstate().matched() { + if mindex+1 >= len(matchers) || !curm.dstate().matched() || len(curChildrens) == 0 { goto parent } diff --git a/carbonserver/trie_test.go b/carbonserver/trie_test.go index 1b525f907..f63ea74f6 100644 --- a/carbonserver/trie_test.go +++ b/carbonserver/trie_test.go @@ -692,6 +692,19 @@ func TestTrieIndex(t *testing.T) { }, expectLeafs: []bool{false, false, false, false}, }, + { + input: []string{ + "zk/kafka_xxx/by_node/node_0/status/health.wsp", + "zk/streaming_yyy/by_node/node_0/status/health.wsp", + "zk/kafka_zzz/by_node/node_0/status", // intentionally empty directory node + }, + query: "zk.*.by_node.*.status.health", + expect: []string{ + "zk.kafka_xxx.by_node.node_0.status.health", + "zk.streaming_yyy.by_node.node_0.status.health", + }, + expectLeafs: []bool{true, true}, + }, } for _, c := range cases {