-
Notifications
You must be signed in to change notification settings - Fork 110
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
fix(lib/babe): fix timing for transition between epochs #1636
Changes from 13 commits
e92668d
8a67205
f893b82
a919188
c23fee5
d23f2be
52d0f73
b6c88b1
578ff9c
6177b10
13cf25c
0b7e8f5
e1ea906
18460c8
af33101
d2f38b2
27dc09f
7a5ce9e
749da75
1058a9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -146,7 +146,7 @@ func NewService(cfg *ServiceConfig) (*Service, error) { | |
"epoch length (slots)", babeService.epochLength, | ||
"authorities", Authorities(babeService.epochData.authorities), | ||
"authority index", babeService.epochData.authorityIndex, | ||
"threshold", babeService.epochData.threshold.ToLEBytes(), | ||
"threshold", babeService.epochData.threshold, | ||
"randomness", babeService.epochData.randomness, | ||
) | ||
return babeService, nil | ||
|
@@ -226,20 +226,23 @@ func (b *Service) EpochLength() uint64 { | |
|
||
// Pause pauses the service ie. halts block production | ||
func (b *Service) Pause() error { | ||
b.Lock() | ||
defer b.Unlock() | ||
|
||
if b.paused { | ||
return errors.New("service already paused") | ||
} | ||
|
||
b.Lock() | ||
defer b.Unlock() | ||
|
||
b.pause <- struct{}{} | ||
b.paused = true | ||
b.pause <- struct{}{} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: seems odd that there are two places in the code where There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll change the other place to just call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can probably close the pause channel and reinitialize it on resume.
|
||
return nil | ||
} | ||
|
||
// Resume resumes the service ie. resumes block production | ||
func (b *Service) Resume() error { | ||
b.Lock() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acquire lock after checking There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this function should be atomic in the case that |
||
defer b.Unlock() | ||
|
||
if !b.paused { | ||
return nil | ||
} | ||
|
@@ -250,9 +253,6 @@ func (b *Service) Resume() error { | |
return err | ||
} | ||
|
||
b.Lock() | ||
defer b.Unlock() | ||
|
||
b.paused = false | ||
go b.initiate(epoch) | ||
logger.Info("service resumed", "epoch", epoch) | ||
|
@@ -351,93 +351,118 @@ func (b *Service) initiate(epoch uint64) { | |
return | ||
} | ||
|
||
b.invokeBlockAuthoring(epoch) | ||
} | ||
|
||
func (b *Service) invokeBlockAuthoring(epoch uint64) { | ||
// calculate current slot | ||
startSlot := getCurrentSlot(b.slotDuration) | ||
|
||
head, err := b.blockState.BestBlockHeader() | ||
err := b.invokeBlockAuthoring(epoch) | ||
if err != nil { | ||
logger.Error("failed to get best block header", "error", err) | ||
return | ||
logger.Crit("block authoring error", "error", err) | ||
} | ||
} | ||
|
||
// if we're at genesis, set the first slot number for the network | ||
if head.Number.Cmp(big.NewInt(0)) == 0 { | ||
err = b.epochState.SetFirstSlot(startSlot) | ||
func (b *Service) invokeBlockAuthoring(epoch uint64) error { | ||
for { | ||
// get start slot for current epoch | ||
epochStart, err := b.epochState.GetStartSlotForEpoch(epoch) | ||
if err != nil { | ||
logger.Error("failed to set first slot number", "error", err) | ||
return | ||
logger.Error("failed to get start slot for current epoch", "epoch", epoch, "error", err) | ||
return err | ||
} | ||
} | ||
|
||
logger.Info("initiating epoch", "number", epoch, "start slot", startSlot+b.epochLength) | ||
err = b.initiateEpoch(epoch) | ||
if err != nil { | ||
logger.Error("failed to initiate epoch", "epoch", epoch, "error", err) | ||
return | ||
} | ||
head, err := b.blockState.BestBlockHeader() | ||
if err != nil { | ||
logger.Error("failed to get best block header", "error", err) | ||
return err | ||
} | ||
|
||
// get start slot for current epoch | ||
epochStart, err := b.epochState.GetStartSlotForEpoch(0) | ||
if err != nil { | ||
logger.Error("failed to get start slot for current epoch", "epoch", epoch, "error", err) | ||
return | ||
} | ||
// if we're at genesis, set the first slot number for the network | ||
if head.Number.Cmp(big.NewInt(0)) == 0 { | ||
epochStart = getCurrentSlot(b.slotDuration) | ||
err = b.epochState.SetFirstSlot(epochStart) | ||
if err != nil { | ||
logger.Error("failed to set first slot number", "error", err) | ||
return err | ||
} | ||
} | ||
|
||
intoEpoch := startSlot - epochStart | ||
logger.Info("current epoch", "epoch", epoch, "slots into epoch", intoEpoch) | ||
logger.Info("initiating epoch", "number", epoch, "first slot of epoch", epochStart) | ||
err = b.initiateEpoch(epoch) | ||
if err != nil { | ||
logger.Error("failed to initiate epoch", "epoch", epoch, "error", err) | ||
return err | ||
} | ||
|
||
// if the calculated amount of slots "into the epoch" is greater than the epoch length, | ||
// we've been offline for more than an epoch, and need to sync. pause BABE for now, syncer will | ||
// resume it when ready | ||
if b.epochLength <= intoEpoch && !b.dev { | ||
b.paused = true | ||
return | ||
} | ||
epochStartTime := getSlotStartTime(epochStart, b.slotDuration) | ||
logger.Debug("checking if epoch started", "epoch start", epochStartTime, "now", time.Now()) | ||
|
||
// check if it's time to start the epoch yet. if not, wait until it is | ||
if time.Since(epochStartTime) < 0 { | ||
logger.Debug("waiting for epoch to start") | ||
select { | ||
case <-time.After(time.Until(epochStartTime)): | ||
case <-b.ctx.Done(): | ||
return nil | ||
case <-b.pause: | ||
return nil | ||
} | ||
} | ||
|
||
if b.dev { | ||
intoEpoch = intoEpoch % b.epochLength | ||
} | ||
// calculate current slot | ||
startSlot := getCurrentSlot(b.slotDuration) | ||
intoEpoch := startSlot - epochStart | ||
|
||
// if the calculated amount of slots "into the epoch" is greater than the epoch length, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible to move this into another function? This seems like something that happens infrequently and can be tested independently? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it would happen if the node is down for over an epoch or if it's starting up for the first time on a network that's past epoch 0, which part would you put in a separate function? just this |
||
// we've been offline for more than an epoch, and need to sync. pause BABE for now, syncer will | ||
// resume it when ready | ||
if b.epochLength <= intoEpoch && !b.dev { | ||
logger.Debug("pausing BABE, need to sync", "slots into epoch", intoEpoch, "startSlot", startSlot, "epochStart", epochStart) | ||
go func() { | ||
<-b.pause | ||
}() | ||
return b.Pause() | ||
} | ||
|
||
slotDone := make([]<-chan time.Time, b.epochLength-intoEpoch) | ||
for i := 0; i < int(b.epochLength-intoEpoch); i++ { | ||
slotDone[i] = time.After(b.getSlotDuration() * time.Duration(i)) | ||
} | ||
if b.dev { | ||
intoEpoch = intoEpoch % b.epochLength | ||
} | ||
|
||
for i := 0; i < int(b.epochLength-intoEpoch); i++ { | ||
select { | ||
case <-b.ctx.Done(): | ||
return | ||
case <-b.pause: | ||
return | ||
case <-slotDone[i]: | ||
if !b.authority { | ||
continue | ||
} | ||
logger.Info("current epoch", "epoch", epoch, "slots into epoch", intoEpoch) | ||
|
||
slotDone := make([]<-chan time.Time, b.epochLength-intoEpoch) | ||
for i := 0; i < int(b.epochLength-intoEpoch); i++ { | ||
slotDone[i] = time.After(b.getSlotDuration() * time.Duration(i)) | ||
} | ||
|
||
slotNum := startSlot + uint64(i) | ||
err = b.handleSlot(slotNum) | ||
if err == ErrNotAuthorized { | ||
logger.Debug("not authorized to produce a block in this slot", "slot", slotNum) | ||
continue | ||
} else if err != nil { | ||
logger.Warn("failed to handle slot", "slot", slotNum, "error", err) | ||
continue | ||
for i := 0; i < int(b.epochLength-intoEpoch); i++ { | ||
select { | ||
case <-b.ctx.Done(): | ||
return nil | ||
case <-b.pause: | ||
return nil | ||
case <-slotDone[i]: | ||
if !b.authority { | ||
continue | ||
} | ||
|
||
slotNum := startSlot + uint64(i) | ||
err = b.handleSlot(slotNum) | ||
if err == ErrNotAuthorized { | ||
logger.Debug("not authorized to produce a block in this slot", "slot", slotNum, "slots into epoch", i) | ||
continue | ||
} else if err != nil { | ||
logger.Warn("failed to handle slot", "slot", slotNum, "error", err) | ||
continue | ||
} | ||
} | ||
} | ||
} | ||
|
||
// setup next epoch, re-invoke block authoring | ||
next, err := b.incrementEpoch() | ||
if err != nil { | ||
logger.Error("failed to increment epoch", "error", err) | ||
return | ||
} | ||
// setup next epoch, re-invoke block authoring | ||
next, err := b.incrementEpoch() | ||
if err != nil { | ||
logger.Error("failed to increment epoch", "error", err) | ||
return err | ||
} | ||
|
||
b.invokeBlockAuthoring(next) | ||
logger.Info("epoch complete!", "completed epoch", epoch, "upcoming epoch", next) | ||
epoch = next | ||
} | ||
} | ||
|
||
func (b *Service) handleSlot(slotNum uint64) error { | ||
|
@@ -466,8 +491,6 @@ func (b *Service) handleSlot(slotNum uint64) error { | |
number: slotNum, | ||
} | ||
|
||
logger.Debug("going to build block", "parent", parent) | ||
|
||
// set runtime trie before building block | ||
// if block building is successful, store the resulting trie in the storage state | ||
ts, err := b.storageState.TrieState(&parent.StateRoot) | ||
|
@@ -509,3 +532,7 @@ func (b *Service) handleSlot(slotNum uint64) error { | |
func getCurrentSlot(slotDuration time.Duration) uint64 { | ||
return uint64(time.Now().UnixNano()) / uint64(slotDuration.Nanoseconds()) | ||
} | ||
|
||
func getSlotStartTime(slot uint64, slotDuration time.Duration) time.Time { | ||
return time.Unix(0, int64(slot)*slotDuration.Nanoseconds()) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -322,7 +322,7 @@ func (b *BlockBuilder) addToQueue(txs []*transaction.ValidTransaction) { | |
} | ||
|
||
func hasSlotEnded(slot Slot) bool { | ||
slotEnd := slot.start.Add(slot.duration) | ||
slotEnd := slot.start.Add(slot.duration * 2 / 3) // reserve last 1/3 of slot for block finalisation | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of 1/3, Can we allocate a fixed time for block finalization? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what do you mean exactly? 1/3 of the slot is a fixed slot duration, this is what substrate does afaik There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I meant There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no that would not be ideal, substrate uses 1/3 of the slot duration, you can see their implementation on the issue |
||
return time.Since(slotEnd) >= 0 | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The lock is not required.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove the error
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there might be a chance that
Pause()
is called when the b.pause
channel is already closed, thus causing a close-of-closed-channel panic