Skip to content

Commit

Permalink
[aggregator/client] Metric for dropped metrics (#3054)
Browse files Browse the repository at this point in the history
Problem:
When the collector fails to write a metric to the aggregator, it logs an error, but in practice it is almost impossible to tell whether the write failed for both peers owning the shard or for only one of them, i.e. whether data was lost or not.

Solution:
Emit a metric indicating data loss.
  • Loading branch information
abliqo authored Dec 31, 2020
1 parent 468ab1c commit 42ede15
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions src/aggregator/client/tcp_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,10 @@ func (c *TCPClient) write(
return err
}
var (
shardID = c.shardFn(metricID, uint32(placement.NumShards()))
instances = placement.InstancesForShard(shardID)
multiErr = xerrors.NewMultiError()
shardID = c.shardFn(metricID, uint32(placement.NumShards()))
instances = placement.InstancesForShard(shardID)
multiErr = xerrors.NewMultiError()
oneOrMoreSucceeded = false
)
for _, instance := range instances {
// NB(xichen): the shard should technically always be found because the instances
Expand All @@ -288,7 +289,15 @@ func (c *TCPClient) write(
}
if err = c.writerMgr.Write(instance, shardID, payload); err != nil {
multiErr = multiErr.Add(err)
continue
}

oneOrMoreSucceeded = true
}

if !oneOrMoreSucceeded {
// unrectifiable loss
c.metrics.dropped.Inc(1)
}

onPlacementDoneFn()
Expand Down Expand Up @@ -329,6 +338,7 @@ type tcpClientMetrics struct {
flush tally.Counter
shardNotOwned tally.Counter
shardNotWriteable tally.Counter
dropped tally.Counter
}

func newTCPClientMetrics(
Expand All @@ -343,5 +353,6 @@ func newTCPClientMetrics(
flush: scope.Counter("flush"),
shardNotOwned: scope.Counter("shard-not-owned"),
shardNotWriteable: scope.Counter("shard-not-writeable"),
dropped: scope.Counter("dropped"),
}
}

0 comments on commit 42ede15

Please sign in to comment.