This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

Commit

Merge pull request #869 from grafana/issue-844
fix: GC task too eagerly closes chunks. #844
Dieterbe authored Mar 15, 2018
2 parents 5240c14 + 24564d8 commit fb2a833
Showing 3 changed files with 67 additions and 33 deletions.
86 changes: 60 additions & 26 deletions mdata/aggmetric.go
@@ -561,16 +561,45 @@ func (a *AggMetric) add(ts uint32, val float64) {
a.addAggregators(ts, val)
}

// collectable returns whether the AggMetric is garbage collectable
// an AggMetric is collectable based on two conditions:
// * the AggMetric hasn't been written to in a configurable amount of time
// (whether the write went to the ROB or a chunk is irrelevant)
// * the last chunk - if any - is no longer "active".
// active means:
// any reasonable realtime stream (e.g. up to 15 min behind wall-clock)
// could add points to the chunk
//
// caller must hold AggMetric lock
func (a *AggMetric) collectable(now, chunkMinTs uint32) bool {

currentChunk := a.getChunk(a.CurrentChunkPos)

// no chunks at all means "possibly collectable"
// the caller (AggMetric.GC()) still has its own checks to
// handle the "no chunks" correctly later.
// also: we want AggMetric.GC() to go ahead with flushing the ROB in this case
if currentChunk == nil {
return a.lastWrite < chunkMinTs
}

return a.lastWrite < chunkMinTs && currentChunk.Series.T0+a.ChunkSpan+15*60 < now
}

// GC returns whether or not this AggMetric is stale and can be removed
// chunkMinTs -> min timestamp of a chunk for it to be considered stale and persisted to Cassandra
// metricMinTs -> min timestamp for a metric for it to be considered stale and purged from the tank
func (a *AggMetric) GC(chunkMinTs, metricMinTs uint32) bool {
func (a *AggMetric) GC(now, chunkMinTs, metricMinTs uint32) bool {
a.Lock()
defer a.Unlock()

// if the reorderBuffer is enabled and we have not received a datapoint in a while,
// then flush the reorder buffer.
if a.rob != nil && a.lastWrite < chunkMinTs {
// abort unless it looks like the AggMetric is collectable
if !a.collectable(now, chunkMinTs) {
return false
}

// make sure any points in the reorderBuffer are moved into our chunks so we can save the data
if a.rob != nil {
tmpLastWrite := a.lastWrite
pts := a.rob.Flush()
for _, p := range pts {
@@ -582,44 +611,49 @@ func (a *AggMetric) GC(chunkMinTs, metricMinTs uint32) bool {
}

// this aggMetric has never had metrics written to it.
// if the rob is empty or disabled, this AggMetric can be deleted
if len(a.Chunks) == 0 {
return (a.rob == nil || a.rob.IsEmpty()) && a.gcAggregators(chunkMinTs, metricMinTs)
return a.gcAggregators(now, chunkMinTs, metricMinTs)
}

currentChunk := a.getChunk(a.CurrentChunkPos)
if currentChunk == nil {
return false
}

if a.lastWrite < chunkMinTs {
if currentChunk.Closed {
// already closed and should be saved, though we can't guarantee that.
// Check if we should just delete the metric from memory.
if a.lastWrite < metricMinTs {
return a.gcAggregators(chunkMinTs, metricMinTs)
}
} else {
// chunk hasn't been written to in a while, and is not yet closed. Let's close it and persist it if
// we are a primary
log.Debug("Found stale Chunk, adding end-of-stream bytes. key: %s T0: %d", a.Key, currentChunk.T0)
currentChunk.Finish()
if cluster.Manager.IsPrimary() {
if LogLevel < 2 {
log.Debug("AM persist(): node is primary, saving chunk. %s T0: %d", a.Key, currentChunk.T0)
}
// persist the chunk. If the writeQueue is full, then this will block.
a.persist(a.CurrentChunkPos)
// we must check collectable again. Imagine this scenario:
// * we didn't have any chunks when calling collectable() the first time so it returned true
// * data from the ROB is flushed and moved into a new chunk
// * this new chunk is active so we're not collectable, even though earlier we thought we were.
if !a.collectable(now, chunkMinTs) {
return false
}

if currentChunk.Closed {
// already closed and should be saved, though we can't guarantee that.
// Check if we should just delete the metric from memory.
if a.lastWrite < metricMinTs {
return a.gcAggregators(now, chunkMinTs, metricMinTs)
}
} else {
// chunk hasn't been written to in a while, and is not yet closed. Let's close it and persist it if
// we are a primary
log.Debug("Found stale Chunk, adding end-of-stream bytes. key: %s T0: %d", a.Key, currentChunk.T0)
currentChunk.Finish()
if cluster.Manager.IsPrimary() {
if LogLevel < 2 {
log.Debug("AM persist(): node is primary, saving chunk. %s T0: %d", a.Key, currentChunk.T0)
}
// persist the chunk. If the writeQueue is full, then this will block.
a.persist(a.CurrentChunkPos)
}
}
return false
}

func (a *AggMetric) gcAggregators(chunkMinTs, metricMinTs uint32) bool {
func (a *AggMetric) gcAggregators(now, chunkMinTs, metricMinTs uint32) bool {
ret := true
for _, agg := range a.aggregators {
ret = agg.GC(chunkMinTs, metricMinTs, a.lastWrite) && ret
ret = agg.GC(now, chunkMinTs, metricMinTs, a.lastWrite) && ret
}
return ret
}
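
For intuition on the new collectable() rule above, here is a minimal, self-contained sketch (the standalone function and the numbers are illustrative, not part of the repository): an AggMetric with a current chunk only becomes collectable once its last write predates chunkMinTs and the chunk's time window, plus a 15-minute grace period for late real-time data, lies entirely in the past.

package main

import "fmt"

// collectable mirrors the condition introduced in this commit, as a standalone
// sketch: lastWrite must predate chunkMinTs, and the current chunk's window
// (T0 .. T0+span) plus a 15-minute grace period must have fully elapsed.
func collectable(now, chunkMinTs, lastWrite, chunkT0, chunkSpan uint32) bool {
	return lastWrite < chunkMinTs && chunkT0+chunkSpan+15*60 < now
}

func main() {
	const span = uint32(600) // hypothetical 10-minute chunks
	t0 := uint32(1000)       // chunk covers [1000, 1600)

	// 1600 (chunk end) + 900 (grace) = 2500: the chunk counts as "active"
	// until now exceeds 2500, even though lastWrite is already stale.
	fmt.Println(collectable(2400, 2000, 500, t0, span)) // false: chunk still active
	fmt.Println(collectable(2600, 2000, 500, t0, span)) // true: safe to collect
}

The real AggMetric.collectable additionally treats "no chunks at all" as possibly collectable, so that GC can still flush the reorder buffer in that case.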
2 changes: 1 addition & 1 deletion mdata/aggmetrics.go
@@ -62,7 +62,7 @@ func (ms *AggMetrics) GC() {
ms.RLock()
a := ms.Metrics[key]
ms.RUnlock()
if stale := a.GC(chunkMinTs, metricMinTs); stale {
if a.GC(now, chunkMinTs, metricMinTs) {
log.Debug("metric %s is stale. Purging data from memory.", key)
ms.Lock()
delete(ms.Metrics, key)
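
A hedged sketch of the calling side, to show why the signature gained a now parameter (chunkMaxStale and metricMaxStale are illustrative names for the staleness settings, not necessarily the exact fields used by the GC loop): all three cutoffs are derived from one wall-clock reading, and now is passed through so the callee can apply the 15-minute active-chunk grace window against that same clock.

package main

import (
	"fmt"
	"time"
)

// Illustrative staleness settings; the real values come from configuration.
const (
	chunkMaxStale  = 3600  // a chunk untouched for 1h may be closed and persisted
	metricMaxStale = 21600 // a metric untouched for 6h may be purged from memory
)

func main() {
	now := uint32(time.Now().Unix())
	chunkMinTs := now - chunkMaxStale
	metricMinTs := now - metricMaxStale

	// After this change the GC loop calls a.GC(now, chunkMinTs, metricMinTs)
	// rather than a.GC(chunkMinTs, metricMinTs), so the "is the current chunk
	// still active?" check compares against the same wall clock everywhere.
	fmt.Println(now, chunkMinTs, metricMinTs)
}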
12 changes: 6 additions & 6 deletions mdata/aggregator.go
@@ -113,7 +113,7 @@ func (agg *Aggregator) Add(ts uint32, val float64) {
}
}

func (agg *Aggregator) GC(chunkMinTs, metricMinTs, lastWriteTime uint32) bool {
func (agg *Aggregator) GC(now, chunkMinTs, metricMinTs, lastWriteTime uint32) bool {
ret := true

if lastWriteTime+agg.span > chunkMinTs {
@@ -127,19 +127,19 @@ func (agg *Aggregator) GC(chunkMinTs, metricMinTs, lastWriteTime uint32) bool {
}

if agg.minMetric != nil {
ret = agg.minMetric.GC(chunkMinTs, metricMinTs) && ret
ret = agg.minMetric.GC(now, chunkMinTs, metricMinTs) && ret
}
if agg.maxMetric != nil {
ret = agg.maxMetric.GC(chunkMinTs, metricMinTs) && ret
ret = agg.maxMetric.GC(now, chunkMinTs, metricMinTs) && ret
}
if agg.sumMetric != nil {
ret = agg.sumMetric.GC(chunkMinTs, metricMinTs) && ret
ret = agg.sumMetric.GC(now, chunkMinTs, metricMinTs) && ret
}
if agg.cntMetric != nil {
ret = agg.cntMetric.GC(chunkMinTs, metricMinTs) && ret
ret = agg.cntMetric.GC(now, chunkMinTs, metricMinTs) && ret
}
if agg.lstMetric != nil {
ret = agg.lstMetric.GC(chunkMinTs, metricMinTs) && ret
ret = agg.lstMetric.GC(now, chunkMinTs, metricMinTs) && ret
}

return ret
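
The hunk above hides the body of the early-exit check at the top of Aggregator.GC, so the following is an assumption about its intent rather than a copy of it: aggregated series only receive a point when a span bucket completes, so an aggregator is only a GC candidate once lastWriteTime + span has itself fallen behind chunkMinTs. A rough sketch:

package main

import "fmt"

// aggregatorStale sketches the guard suggested by "lastWriteTime+agg.span > chunkMinTs":
// the raw series may already look stale, but the aggregate for the current span
// bucket is only written at the end of the span, so the span is added before
// comparing against the chunk staleness cutoff.
func aggregatorStale(lastWriteTime, span, chunkMinTs uint32) bool {
	return lastWriteTime+span <= chunkMinTs
}

func main() {
	// Hypothetical numbers: a 1h aggregation span and a cutoff at t=5000.
	fmt.Println(aggregatorStale(2000, 3600, 5000)) // false: 5600 > 5000, keep the aggregator
	fmt.Println(aggregatorStale(1000, 3600, 5000)) // true: 4600 <= 5000, GC candidate
}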
