-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(outputs): Implement partial write errors #16146
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,32 +10,73 @@ import ( | |
) | ||
|
||
var ( | ||
AgentMetricsWritten = selfstat.Register("agent", "metrics_written", make(map[string]string)) | ||
AgentMetricsDropped = selfstat.Register("agent", "metrics_dropped", make(map[string]string)) | ||
AgentMetricsWritten = selfstat.Register("agent", "metrics_written", make(map[string]string)) | ||
AgentMetricsRejected = selfstat.Register("agent", "metrics_rejected", make(map[string]string)) | ||
AgentMetricsDropped = selfstat.Register("agent", "metrics_dropped", make(map[string]string)) | ||
|
||
registerGob = sync.OnceFunc(func() { metric.Init() }) | ||
) | ||
|
||
type Transaction struct { | ||
// Batch of metrics to write | ||
Batch []telegraf.Metric | ||
|
||
// Accept denotes the indices of metrics that were successfully written | ||
Accept []int | ||
// Reject denotes the indices of metrics that were not written but should | ||
// not be requeued | ||
Reject []int | ||
|
||
// Marks this transaction as valid | ||
valid bool | ||
|
||
// Internal state that can be used by the buffer implementation | ||
state interface{} | ||
} | ||
Comment on lines
+20
to
+35
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think "transaction" is the wrong word to use here. In computing, a transaction is a sequence of steps that is executed atomically. Either it runs fully, or not at all. The structure […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmmm IMO it is a transaction on the buffer. The buffer is not modified until […] |
||
|
||
func (tx *Transaction) AcceptAll() { | ||
tx.Accept = make([]int, len(tx.Batch)) | ||
for i := range tx.Batch { | ||
tx.Accept[i] = i | ||
} | ||
} | ||
|
||
func (tx *Transaction) KeepAll() {} | ||
|
||
func (tx *Transaction) InferKeep() []int { | ||
used := make([]bool, len(tx.Batch)) | ||
for _, idx := range tx.Accept { | ||
used[idx] = true | ||
} | ||
for _, idx := range tx.Reject { | ||
used[idx] = true | ||
} | ||
|
||
keep := make([]int, 0, len(tx.Batch)) | ||
for i := range tx.Batch { | ||
if !used[i] { | ||
keep = append(keep, i) | ||
} | ||
} | ||
return keep | ||
} | ||
|
||
type Buffer interface { | ||
// Len returns the number of metrics currently in the buffer. | ||
Len() int | ||
|
||
// Add adds metrics to the buffer and returns number of dropped metrics. | ||
Add(metrics ...telegraf.Metric) int | ||
|
||
// Batch returns a slice containing up to batchSize of the oldest metrics not | ||
// yet dropped. Metrics are ordered from oldest to newest in the batch. The | ||
// batch must not be modified by the client. | ||
Batch(batchSize int) []telegraf.Metric | ||
|
||
// Accept marks the batch, acquired from Batch(), as successfully written. | ||
Accept(metrics []telegraf.Metric) | ||
// BeginTransaction starts a transaction by returning a slice of metrics up to the | ||
// given batch-size starting from the oldest metric in the buffer. Metrics | ||
// are ordered from oldest to newest and must not be modified by the plugin. | ||
BeginTransaction(batchSize int) *Transaction | ||
|
||
// Reject returns the batch, acquired from Batch(), to the buffer and marks it | ||
// as unsent. | ||
Reject([]telegraf.Metric) | ||
// EndTransaction ends a transaction and persists the buffer state | ||
EndTransaction(*Transaction) | ||
|
||
// Stats returns the buffer statistics such as rejected, dropped and accepred metrics | ||
// Stats returns the buffer statistics such as rejected, dropped and accepted metrics | ||
Stats() BufferStats | ||
|
||
// Close finalizes the buffer and closes all open resources | ||
|
@@ -45,11 +86,12 @@ type Buffer interface { | |
// BufferStats holds common metrics used for buffer implementations. | ||
// Implementations of Buffer should embed this struct in them. | ||
type BufferStats struct { | ||
MetricsAdded selfstat.Stat | ||
MetricsWritten selfstat.Stat | ||
MetricsDropped selfstat.Stat | ||
BufferSize selfstat.Stat | ||
BufferLimit selfstat.Stat | ||
MetricsAdded selfstat.Stat | ||
MetricsWritten selfstat.Stat | ||
MetricsRejected selfstat.Stat | ||
MetricsDropped selfstat.Stat | ||
BufferSize selfstat.Stat | ||
BufferLimit selfstat.Stat | ||
} | ||
|
||
// NewBuffer returns a new empty Buffer with the given capacity. | ||
|
@@ -84,6 +126,11 @@ func NewBufferStats(name, alias string, capacity int) BufferStats { | |
"metrics_written", | ||
tags, | ||
), | ||
MetricsRejected: selfstat.Register( | ||
"write", | ||
"metrics_rejected", | ||
tags, | ||
), | ||
MetricsDropped: selfstat.Register( | ||
"write", | ||
"metrics_dropped", | ||
|
@@ -115,6 +162,12 @@ func (b *BufferStats) metricWritten(m telegraf.Metric) { | |
m.Accept() | ||
} | ||
|
||
func (b *BufferStats) metricRejected(m telegraf.Metric) { | ||
AgentMetricsRejected.Incr(1) | ||
b.MetricsRejected.Incr(1) | ||
m.Reject() | ||
} | ||
|
||
func (b *BufferStats) metricDropped(m telegraf.Metric) { | ||
AgentMetricsDropped.Incr(1) | ||
b.MetricsDropped.Incr(1) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think using bit arrays here would lead to memory savings and performance improvements. Switching to them would be a significant change to the PR and I think the code is sound as it is, so not recommending that, but maybe as a future performance enhancement.
I would continue to use separate vars for "accept" and "reject", just switch them to bit arrays. They would be fixed in size at `len(metrics)/8+1` bytes long, vs `len(accepted)*64`, which I think in many cases would be approaching `len(metrics)*64`. Not sure what the typical use case is here, would take some measurement of real-world scenarios, but I think chances are good it would be a decent savings.

The other advantage is that it can support simplified operations. The metrics to drop from the WAL after a single write can be found with the bit-wise union of the two bit arrays for accept and reject. Then you union that again with the long-running mask for the WAL. If all the bits are `1` the whole WAL is done. If not, to find out what prefix to remove from the WAL, you find the first non-zero bit in the mask.

Here's an example of a bit-array library you could use as a reference. I would not use a library, but instead write a simplified one that supplies just the operations needed here. When bit-array data structures are not made general they can be surprisingly small in code.
https://github.com/yourbasic/bit
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While I do agree that using a bitmask is better it seems a premature optimization at this point. We really need to optimize this further but IMO this also must include optimizing disk I/O as we are currently scratching the disk badly. In this step, the removal-masking should move into a dedicated WAL implementation and be converted to a bit-mask as you suggest. What do you think?