-
Notifications
You must be signed in to change notification settings - Fork 93
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: support ThroughputLimit in samplers #1300
base: main
Are you sure you want to change the base?
Changes from all commits
574e6e0
d9f2143
7f9c455
8498f03
378b961
35c8bc3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
package collect | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"math" | ||
"strconv" | ||
"strings" | ||
"sync" | ||
"time" | ||
|
||
"github.com/honeycombio/refinery/config" | ||
"github.com/honeycombio/refinery/internal/peer" | ||
"github.com/honeycombio/refinery/metrics" | ||
"github.com/honeycombio/refinery/pubsub" | ||
"github.com/jonboulle/clockwork" | ||
) | ||
|
||
const emaThroughputTopic = "ema_throughput" | ||
|
||
// EMAThroughputCalculator encapsulates the logic to calculate a throughput value using an Exponential Moving Average (EMA). | ||
type EMAThroughputCalculator struct { | ||
Config config.Config `inject:""` | ||
Metrics metrics.Metrics `inject:"metrics"` | ||
Clock clockwork.Clock `inject:""` | ||
Pubsub pubsub.PubSub `inject:""` | ||
Peer peer.Peers `inject:""` | ||
|
||
throughputLimit uint | ||
weight float64 // Smoothing factor for EMA | ||
intervalLength time.Duration // Length of the interval | ||
hostID string | ||
|
||
mut sync.RWMutex | ||
throughputs map[string]throughputReport | ||
clusterEMA uint | ||
weightedEventTotal float64 // Internal count of events in the current interval | ||
done chan struct{} | ||
} | ||
|
||
// NewEMAThroughputCalculator creates a new instance of EMAThroughputCalculator. | ||
func (c *EMAThroughputCalculator) Start() error { | ||
cfg := c.Config.GetThroughputCalculatorConfig() | ||
c.throughputLimit = uint(cfg.Limit) | ||
c.done = make(chan struct{}) | ||
|
||
// if throughput limit is not set, disable the calculator | ||
if c.throughputLimit == 0 { | ||
return nil | ||
} | ||
|
||
c.intervalLength = time.Duration(cfg.AdjustmentInterval) | ||
if c.intervalLength == 0 { | ||
c.intervalLength = 15 * time.Second | ||
} | ||
|
||
c.weight = cfg.Weight | ||
if c.weight == 0 { | ||
c.weight = 0.5 | ||
} | ||
|
||
peerID, err := c.Peer.GetInstanceID() | ||
if err != nil { | ||
return err | ||
} | ||
c.hostID = peerID | ||
c.throughputs = make(map[string]throughputReport) | ||
|
||
c.Metrics.Register("cluster_throughput", "gauge") | ||
c.Metrics.Register("cluster_ema_throughput", "gauge") | ||
c.Metrics.Register("individual_throughput", "gauge") | ||
c.Metrics.Register("ema_throughput_publish_error", "counter") | ||
// Subscribe to the throughput topic so we can react to throughput | ||
// changes in the cluster. | ||
c.Pubsub.Subscribe(context.Background(), emaThroughputTopic, c.onThroughputUpdate) | ||
|
||
// have a centralized peer metric service that's responsible for publishing and | ||
// receiving peer metrics | ||
// it could have a channel that's receiving metrics from different source | ||
// it then only send a message if the value has changed and it has passed the configured interval for the metric | ||
// there could be a third case that basically says you have to send it now because we have passed the configured interval and we haven't send a message about this metric since the last interval | ||
go func() { | ||
ticker := c.Clock.NewTicker(c.intervalLength) | ||
defer ticker.Stop() | ||
|
||
for { | ||
select { | ||
case <-c.done: | ||
return | ||
case <-ticker.Chan(): | ||
currentThroughput := c.updateEMA() | ||
err := c.Pubsub.Publish(context.Background(), emaThroughputTopic, newThroughputMessage(currentThroughput, peerID).String()) | ||
if err != nil { | ||
c.Metrics.Count("ema_throughput_publish_error", 1) | ||
} | ||
} | ||
} | ||
|
||
}() | ||
|
||
return nil | ||
} | ||
|
||
func (c *EMAThroughputCalculator) onThroughputUpdate(ctx context.Context, msg string) { | ||
throughputMsg, err := unmarshalThroughputMessage(msg) | ||
if err != nil { | ||
return | ||
} | ||
c.mut.Lock() | ||
c.throughputs[throughputMsg.peerID] = throughputReport{ | ||
key: throughputMsg.peerID, | ||
throughput: throughputMsg.throughput, | ||
timestamp: c.Clock.Now(), | ||
} | ||
c.mut.Unlock() | ||
} | ||
|
||
func (c *EMAThroughputCalculator) Stop() { | ||
close(c.done) | ||
} | ||
|
||
// IncrementEventCount increments the internal event count by a specified amount. | ||
func (c *EMAThroughputCalculator) IncrementEventCount(count float64) { | ||
c.mut.Lock() | ||
c.weightedEventTotal += count | ||
c.mut.Unlock() | ||
} | ||
|
||
// updateEMA calculates the current throughput and updates the EMA. | ||
func (c *EMAThroughputCalculator) updateEMA() uint { | ||
c.mut.Lock() | ||
defer c.mut.Unlock() | ||
|
||
var totalThroughput float64 | ||
|
||
for _, report := range c.throughputs { | ||
if c.Clock.Since(report.timestamp) > c.intervalLength*2 { | ||
delete(c.throughputs, report.key) | ||
continue | ||
} | ||
|
||
totalThroughput += float64(report.throughput) | ||
} | ||
c.Metrics.Gauge("cluster_throughput", totalThroughput) | ||
c.clusterEMA = uint(math.Ceil(c.weight*totalThroughput + (1-c.weight)*float64(c.clusterEMA))) | ||
c.Metrics.Gauge("cluster_ema_throughput", c.clusterEMA) | ||
|
||
// calculating throughput for the next interval | ||
currentThroughput := float64(c.weightedEventTotal) / c.intervalLength.Seconds() | ||
c.Metrics.Gauge("individual_throughput", currentThroughput) | ||
c.weightedEventTotal = 0 // Reset the event count for the new interval | ||
|
||
return uint(currentThroughput) | ||
} | ||
|
||
// GetSamplingRateMultiplier calculates and returns a sampling rate multiplier | ||
// based on the difference between the configured throughput limit and the current throughput. | ||
func (c *EMAThroughputCalculator) GetSamplingRateMultiplier() float64 { | ||
if c.throughputLimit == 0 { | ||
return 1.0 // No limit set, so no adjustment needed | ||
} | ||
|
||
c.mut.RLock() | ||
currentEMA := c.clusterEMA | ||
c.mut.RUnlock() | ||
|
||
if currentEMA <= c.throughputLimit { | ||
return 1.0 // Throughput is within the limit, no adjustment needed | ||
} | ||
|
||
return float64(currentEMA) / float64(c.throughputLimit) | ||
} | ||
|
||
// throughputReport is the most recent throughput value received from a single
// peer, together with the local time at which it was recorded.
//
// TODO(review): stress relief uses very similar per-peer report logic; this
// could be refactored into shared code. Going further, the updates could be
// bundled into the same messages: the system would maintain a map of named
// values that each peer updates internally and sends through pubsub.
type throughputReport struct {
	key        string    // peer ID the report belongs to
	throughput uint      // throughput value carried by the peer's message
	timestamp  time.Time // when the report was recorded locally
}
|
||
// throughputMessage is the payload exchanged between peers on the throughput
// pubsub topic: the sender's peer ID and the throughput it measured.
type throughputMessage struct {
	peerID     string
	throughput uint
}

// newThroughputMessage builds a throughputMessage for the given throughput and
// peer ID.
func newThroughputMessage(throughput uint, peerID string) *throughputMessage {
	return &throughputMessage{throughput: throughput, peerID: peerID}
}

// String encodes the message in its wire format, "<peerID>|<throughput>".
func (msg *throughputMessage) String() string {
	return msg.peerID + "|" + strconv.FormatUint(uint64(msg.throughput), 10)
}

// unmarshalThroughputMessage decodes a message produced by
// (*throughputMessage).String.
//
// The string is split at the LAST "|" so that a peer ID which itself contains
// "|" still round-trips. An error is returned (never a panic) when the message
// is shorter than two bytes, has no separator, or carries a throughput that is
// not a non-negative base-10 integer fitting in a uint.
//
// TODO(review): instead of taking a string for a message, Pubsub could take a
// PubsubMessage that embeds encoding.TextMarshaler and
// encoding.TextUnmarshaler, which would normalize how messages are packed and
// unpacked; alternatively a general-purpose PubsubMessage type could support
// named fields.
func unmarshalThroughputMessage(msg string) (*throughputMessage, error) {
	if len(msg) < 2 {
		return nil, fmt.Errorf("empty message")
	}

	sep := strings.LastIndex(msg, "|")
	if sep < 0 {
		return nil, fmt.Errorf("malformed throughput message %q: missing separator", msg)
	}

	// ParseUint rejects negative values, which a signed parse followed by a
	// uint conversion would silently turn into a huge throughput.
	throughput, err := strconv.ParseUint(msg[sep+1:], 10, strconv.IntSize)
	if err != nil {
		return nil, fmt.Errorf("parsing throughput in message %q: %w", msg, err)
	}

	return newThroughputMessage(uint(throughput), msg[:sep]), nil
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should only publish if the throughput is different from the previous calculation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed.