Skip to content

Commit

Permalink
Add early termination metrics case by case (#3093)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronbuchwald committed Jun 13, 2024
1 parent fa37f5a commit 347a3f8
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 24 deletions.
133 changes: 129 additions & 4 deletions snow/consensus/snowman/poll/early_term_no_traversal.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,133 @@
package poll

import (
"errors"
"fmt"
"time"

"github.com/prometheus/client_golang/prometheus"

"github.com/ava-labs/avalanchego/ids"
"github.com/ava-labs/avalanchego/utils/bag"
)

var (
errPollDurationVectorMetrics = errors.New("failed to register poll_duration vector metrics")
errPollCountVectorMetrics = errors.New("failed to register poll_count vector metrics")

terminationReason = "reason"
exhaustedReason = "exhausted"
earlyFailReason = "early_fail"
earlyAlphaPrefReason = "early_alpha_pref"
earlyAlphaConfReason = "early_alpha_conf"

exhaustedLabel = prometheus.Labels{
terminationReason: exhaustedReason,
}
earlyFailLabel = prometheus.Labels{
terminationReason: earlyFailReason,
}
earlyAlphaPrefLabel = prometheus.Labels{
terminationReason: earlyAlphaPrefReason,
}
earlyAlphaConfLabel = prometheus.Labels{
terminationReason: earlyAlphaConfReason,
}
)

type earlyTermNoTraversalMetrics struct {
durExhaustedPolls prometheus.Gauge
durEarlyFailPolls prometheus.Gauge
durEarlyAlphaPrefPolls prometheus.Gauge
durEarlyAlphaConfPolls prometheus.Gauge

countExhaustedPolls prometheus.Counter
countEarlyFailPolls prometheus.Counter
countEarlyAlphaPrefPolls prometheus.Counter
countEarlyAlphaConfPolls prometheus.Counter
}

func newEarlyTermNoTraversalMetrics(reg prometheus.Registerer) (*earlyTermNoTraversalMetrics, error) {
pollCountVec := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "poll_count",
Help: "Total # of terminated polls by reason",
}, []string{terminationReason})
if err := reg.Register(pollCountVec); err != nil {
return nil, fmt.Errorf("%w: %w", errPollCountVectorMetrics, err)
}
durPollsVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "poll_duration",
Help: "time (in ns) polls took to complete by reason",
}, []string{terminationReason})
if err := reg.Register(durPollsVec); err != nil {
return nil, fmt.Errorf("%w: %w", errPollDurationVectorMetrics, err)
}

return &earlyTermNoTraversalMetrics{
durExhaustedPolls: durPollsVec.With(exhaustedLabel),
durEarlyFailPolls: durPollsVec.With(earlyFailLabel),
durEarlyAlphaPrefPolls: durPollsVec.With(earlyAlphaPrefLabel),
durEarlyAlphaConfPolls: durPollsVec.With(earlyAlphaConfLabel),
countExhaustedPolls: pollCountVec.With(exhaustedLabel),
countEarlyFailPolls: pollCountVec.With(earlyFailLabel),
countEarlyAlphaPrefPolls: pollCountVec.With(earlyAlphaPrefLabel),
countEarlyAlphaConfPolls: pollCountVec.With(earlyAlphaConfLabel),
}, nil
}

func (m *earlyTermNoTraversalMetrics) observeExhausted(duration time.Duration) {
m.durExhaustedPolls.Add(float64(duration.Nanoseconds()))
m.countExhaustedPolls.Inc()
}

func (m *earlyTermNoTraversalMetrics) observeEarlyFail(duration time.Duration) {
m.durEarlyFailPolls.Add(float64(duration.Nanoseconds()))
m.countEarlyFailPolls.Inc()
}

func (m *earlyTermNoTraversalMetrics) observeEarlyAlphaPref(duration time.Duration) {
m.durEarlyAlphaPrefPolls.Add(float64(duration.Nanoseconds()))
m.countEarlyAlphaPrefPolls.Inc()
}

func (m *earlyTermNoTraversalMetrics) observeEarlyAlphaConf(duration time.Duration) {
m.durEarlyAlphaConfPolls.Add(float64(duration.Nanoseconds()))
m.countEarlyAlphaConfPolls.Inc()
}

type earlyTermNoTraversalFactory struct {
alphaPreference int
alphaConfidence int

metrics *earlyTermNoTraversalMetrics
}

// NewEarlyTermNoTraversalFactory returns a factory that returns polls with
// early termination, without doing DAG traversals
func NewEarlyTermNoTraversalFactory(alphaPreference int, alphaConfidence int) Factory {
func NewEarlyTermNoTraversalFactory(
alphaPreference int,
alphaConfidence int,
reg prometheus.Registerer,
) (Factory, error) {
metrics, err := newEarlyTermNoTraversalMetrics(reg)
if err != nil {
return nil, err
}

return &earlyTermNoTraversalFactory{
alphaPreference: alphaPreference,
alphaConfidence: alphaConfidence,
}
metrics: metrics,
}, nil
}

func (f *earlyTermNoTraversalFactory) New(vdrs bag.Bag[ids.NodeID]) Poll {
return &earlyTermNoTraversalPoll{
polled: vdrs,
alphaPreference: f.alphaPreference,
alphaConfidence: f.alphaConfidence,
metrics: f.metrics,
start: time.Now(),
}
}

Expand All @@ -40,6 +142,10 @@ type earlyTermNoTraversalPoll struct {
polled bag.Bag[ids.NodeID]
alphaPreference int
alphaConfidence int

metrics *earlyTermNoTraversalMetrics
start time.Time
finished bool
}

// Vote registers a response for this poll
Expand Down Expand Up @@ -67,20 +173,39 @@ func (p *earlyTermNoTraversalPoll) Drop(vdr ids.NodeID) {
// transitive voting.
// 4. A single element has achieved an alphaConfidence majority.
func (p *earlyTermNoTraversalPoll) Finished() bool {
if p.finished {
return true
}

remaining := p.polled.Len()
if remaining == 0 {
p.finished = true
p.metrics.observeExhausted(time.Since(p.start))
return true // Case 1
}

received := p.votes.Len()
maxPossibleVotes := received + remaining
if maxPossibleVotes < p.alphaPreference {
p.finished = true
p.metrics.observeEarlyFail(time.Since(p.start))
return true // Case 2
}

_, freq := p.votes.Mode()
return freq >= p.alphaPreference && maxPossibleVotes < p.alphaConfidence || // Case 3
freq >= p.alphaConfidence // Case 4
if freq >= p.alphaPreference && maxPossibleVotes < p.alphaConfidence {
p.finished = true
p.metrics.observeEarlyAlphaPref(time.Since(p.start))
return true // Case 3
}

if freq >= p.alphaConfidence {
p.finished = true
p.metrics.observeEarlyAlphaConf(time.Since(p.start))
return true // Case 4
}

return false
}

// Result returns the result of this poll
Expand Down
35 changes: 24 additions & 11 deletions snow/consensus/snowman/poll/early_term_no_traversal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,25 @@ package poll
import (
"testing"

"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/require"

"github.com/ava-labs/avalanchego/utils/bag"
)

func newEarlyTermNoTraversalTestFactory(require *require.Assertions, alpha int) Factory {
factory, err := NewEarlyTermNoTraversalFactory(alpha, alpha, prometheus.NewRegistry())
require.NoError(err)
return factory
}

func TestEarlyTermNoTraversalResults(t *testing.T) {
require := require.New(t)

vdrs := bag.Of(vdr1) // k = 1
alpha := 1

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
poll := factory.New(vdrs)

poll.Vote(vdr1, blkID1)
Expand All @@ -31,10 +38,12 @@ func TestEarlyTermNoTraversalResults(t *testing.T) {
}

func TestEarlyTermNoTraversalString(t *testing.T) {
require := require.New(t)

vdrs := bag.Of(vdr1, vdr2) // k = 2
alpha := 2

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
poll := factory.New(vdrs)

poll.Vote(vdr1, blkID1)
Expand All @@ -43,7 +52,7 @@ func TestEarlyTermNoTraversalString(t *testing.T) {
NodeID-BaMPFdqMUQ46BV8iRcwbVfsam55kMqcp: 1
received Bag[ids.ID]: (Size = 1)
SYXsAycDPUu4z2ZksJD5fh5nTDcH3vCFHnpcVye5XuJ2jArg: 1`
require.Equal(t, expected, poll.String())
require.Equal(expected, poll.String())
}

func TestEarlyTermNoTraversalDropsDuplicatedVotes(t *testing.T) {
Expand All @@ -52,7 +61,7 @@ func TestEarlyTermNoTraversalDropsDuplicatedVotes(t *testing.T) {
vdrs := bag.Of(vdr1, vdr2) // k = 2
alpha := 2

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
poll := factory.New(vdrs)

poll.Vote(vdr1, blkID1)
Expand All @@ -72,7 +81,7 @@ func TestEarlyTermNoTraversalTerminatesEarlyWithoutAlphaPreference(t *testing.T)
vdrs := bag.Of(vdr1, vdr2, vdr3) // k = 3
alpha := 2

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
poll := factory.New(vdrs)

poll.Drop(vdr1)
Expand All @@ -90,7 +99,8 @@ func TestEarlyTermNoTraversalTerminatesEarlyWithAlphaPreference(t *testing.T) {
alphaPreference := 3
alphaConfidence := 5

factory := NewEarlyTermNoTraversalFactory(alphaPreference, alphaConfidence)
factory, err := NewEarlyTermNoTraversalFactory(alphaPreference, alphaConfidence, prometheus.NewRegistry())
require.NoError(err)
poll := factory.New(vdrs)

poll.Vote(vdr1, blkID1)
Expand All @@ -114,7 +124,8 @@ func TestEarlyTermNoTraversalTerminatesEarlyWithAlphaConfidence(t *testing.T) {
alphaPreference := 3
alphaConfidence := 3

factory := NewEarlyTermNoTraversalFactory(alphaPreference, alphaConfidence)
factory, err := NewEarlyTermNoTraversalFactory(alphaPreference, alphaConfidence, prometheus.NewRegistry())
require.NoError(err)
poll := factory.New(vdrs)

poll.Vote(vdr1, blkID1)
Expand All @@ -138,7 +149,7 @@ func TestEarlyTermNoTraversalForSharedAncestor(t *testing.T) {
vdrs := bag.Of(vdr1, vdr2, vdr3, vdr4) // k = 4
alpha := 4

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
poll := factory.New(vdrs)

poll.Vote(vdr1, blkID2)
Expand All @@ -160,7 +171,7 @@ func TestEarlyTermNoTraversalWithWeightedResponses(t *testing.T) {
vdrs := bag.Of(vdr1, vdr2, vdr2) // k = 3
alpha := 2

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
poll := factory.New(vdrs)

poll.Vote(vdr2, blkID1)
Expand All @@ -174,12 +185,14 @@ func TestEarlyTermNoTraversalWithWeightedResponses(t *testing.T) {
}

func TestEarlyTermNoTraversalDropWithWeightedResponses(t *testing.T) {
require := require.New(t)

vdrs := bag.Of(vdr1, vdr2, vdr2) // k = 3
alpha := 2

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
poll := factory.New(vdrs)

poll.Drop(vdr2)
require.True(t, poll.Finished())
require.True(poll.Finished())
}
18 changes: 10 additions & 8 deletions snow/consensus/snowman/poll/set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ var (
func TestNewSetErrorOnPollsMetrics(t *testing.T) {
require := require.New(t)

factory := NewEarlyTermNoTraversalFactory(1, 1)
alpha := 1
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()

Expand All @@ -45,7 +46,8 @@ func TestNewSetErrorOnPollsMetrics(t *testing.T) {
func TestNewSetErrorOnPollDurationMetrics(t *testing.T) {
require := require.New(t)

factory := NewEarlyTermNoTraversalFactory(1, 1)
alpha := 1
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()

Expand All @@ -63,7 +65,7 @@ func TestCreateAndFinishPollOutOfOrder_NewerFinishesFirst(t *testing.T) {
vdrs := []ids.NodeID{vdr1, vdr2, vdr3} // k = 3
alpha := 3

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()
s, err := NewSet(factory, log, registerer)
Expand Down Expand Up @@ -99,7 +101,7 @@ func TestCreateAndFinishPollOutOfOrder_OlderFinishesFirst(t *testing.T) {
vdrs := []ids.NodeID{vdr1, vdr2, vdr3} // k = 3
alpha := 3

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()
s, err := NewSet(factory, log, registerer)
Expand Down Expand Up @@ -135,7 +137,7 @@ func TestCreateAndFinishPollOutOfOrder_UnfinishedPollsGaps(t *testing.T) {
vdrs := []ids.NodeID{vdr1, vdr2, vdr3} // k = 3
alpha := 3

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()
s, err := NewSet(factory, log, registerer)
Expand Down Expand Up @@ -179,7 +181,7 @@ func TestCreateAndFinishSuccessfulPoll(t *testing.T) {
vdrs := bag.Of(vdr1, vdr2) // k = 2
alpha := 2

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()
s, err := NewSet(factory, log, registerer)
Expand Down Expand Up @@ -211,7 +213,7 @@ func TestCreateAndFinishFailedPoll(t *testing.T) {
vdrs := bag.Of(vdr1, vdr2) // k = 2
alpha := 1

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()
s, err := NewSet(factory, log, registerer)
Expand Down Expand Up @@ -240,7 +242,7 @@ func TestSetString(t *testing.T) {
vdrs := bag.Of(vdr1) // k = 1
alpha := 1

factory := NewEarlyTermNoTraversalFactory(alpha, alpha)
factory := newEarlyTermNoTraversalTestFactory(require, alpha)
log := logging.NoLog{}
registerer := prometheus.NewRegistry()
s, err := NewSet(factory, log, registerer)
Expand Down
Loading

0 comments on commit 347a3f8

Please sign in to comment.