Skip to content

Commit

Permalink
Improve stability of system tests (#6486)
Browse files Browse the repository at this point in the history
## Motivation

This PR tries to improve the stability of the flaky system tests.
  • Loading branch information
fasmat committed Nov 28, 2024
1 parent de4e49e commit 290edd2
Show file tree
Hide file tree
Showing 18 changed files with 237 additions and 197 deletions.
2 changes: 1 addition & 1 deletion api/grpcserver/globalstate_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ func (s *GlobalStateService) GlobalStateStream(
root, err := s.conState.GetLayerStateRoot(layer.LayerID)
if err != nil {
ctxzap.Warn(stream.Context(), "error retrieving layer data", zap.Error(err))
root = types.Hash32{}
root = types.EmptyHash32
}
resp := &pb.GlobalStateStreamResponse{Datum: &pb.GlobalStateData{Datum: &pb.GlobalStateData_GlobalState{
GlobalState: &pb.GlobalStateHash{
Expand Down
22 changes: 21 additions & 1 deletion fetch/mesh_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/spacemeshos/go-scale"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"golang.org/x/sync/errgroup"

"github.com/spacemeshos/go-spacemesh/codec"
Expand Down Expand Up @@ -177,20 +178,39 @@ func (f *Fetch) GetBlocks(ctx context.Context, ids []types.BlockID) error {

// GetProposalTxs fetches the transactions with the given IDs and validates them.
// It returns an error if any of the transactions could not be fetched.
func (f *Fetch) GetProposalTxs(ctx context.Context, ids []types.TransactionID) error {
	// Marshaler that appends the short form of every requested ID to the log entry.
	shortIDs := zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) error {
		for i := range ids {
			enc.AppendString(ids[i].ShortString())
		}
		return nil
	})

	f.logger.Debug(
		"requesting proposal txs from peer",
		log.ZContext(ctx),
		zap.Int("num_txs", len(ids)),
		zap.Array("txs", shortIDs),
	)

	// The proposal validator's HandleMessage is handed to getTxs as the receiver.
	return f.getTxs(ctx, ids, f.validators.txProposal.HandleMessage)
}

// GetBlockTxs fetches the transactions with the given IDs and saves them.
// Validation is deferred: they will be validated before the block is applied.
func (f *Fetch) GetBlockTxs(ctx context.Context, ids []types.TransactionID) error {
	// Marshaler that appends the short form of every requested ID to the log entry.
	shortIDs := zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) error {
		for i := range ids {
			enc.AppendString(ids[i].ShortString())
		}
		return nil
	})

	f.logger.Debug(
		"requesting block txs from peer",
		log.ZContext(ctx),
		zap.Int("num_txs", len(ids)),
		zap.Array("txs", shortIDs),
	)

	// The block validator's HandleMessage is handed to getTxs as the receiver.
	return f.getTxs(ctx, ids, f.validators.txBlock.HandleMessage)
}

// getTxs converts ids to hashes and requests them through getHashes with
// datastore.TXDB, forwarding receiver unchanged. It returns nil without
// logging or requesting anything when ids is empty; otherwise it returns the
// result of getHashes.
func (f *Fetch) getTxs(ctx context.Context, ids []types.TransactionID, receiver dataReceiver) error {
if len(ids) == 0 {
// Nothing to request.
return nil
}
// NOTE(review): both callers shown above already log the same request with the
// full ID list, and the diff header reports exactly one deletion for this file;
// this line may be the one the commit removed — confirm against the actual file.
f.logger.Debug("requesting txs from peer", log.ZContext(ctx), zap.Int("num_txs", len(ids)))
hashes := types.TransactionIDsToHashes(ids)
return f.getHashes(ctx, hashes, datastore.TXDB, receiver)
}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ require (
github.com/prometheus/client_model v0.6.1
github.com/prometheus/common v0.60.1
github.com/quic-go/quic-go v0.48.2
github.com/rqlite/sql v0.0.0-20241105143344-71b14bed566c
github.com/rqlite/sql v0.0.0-20241111133259-a4122fabb196
github.com/rs/cors v1.11.1
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/seehuhn/mt19937 v1.0.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -575,8 +575,8 @@ github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzG
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/rqlite/sql v0.0.0-20241105143344-71b14bed566c h1:2hHhSEvDfn6pkvLLUf3jUJcZQjfPL8aB9wG2NG7s2yU=
github.com/rqlite/sql v0.0.0-20241105143344-71b14bed566c/go.mod h1:ib9zVtNgRKiGuoMyUqqL5aNpk+r+++YlyiVIkclVqPg=
github.com/rqlite/sql v0.0.0-20241111133259-a4122fabb196 h1:SjRKMwKLTEE3STO6unJlz4VlMjMv5NZgIdI9HikBeAc=
github.com/rqlite/sql v0.0.0-20241111133259-a4122fabb196/go.mod h1:ib9zVtNgRKiGuoMyUqqL5aNpk+r+++YlyiVIkclVqPg=
github.com/rs/cors v1.11.1 h1:eU3gRzXLRK57F5rKMGMZURNdIG4EoAmX8k94r9wXWHA=
github.com/rs/cors v1.11.1/go.mod h1:XyqrcTp5zjWr1wsJ8PIRZssZ8b/WMcMf71DJnit4EMU=
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
Expand Down
4 changes: 3 additions & 1 deletion miner/proposal_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,9 @@ func (pb *ProposalBuilder) initSharedData(ctx context.Context, current types.Lay
//
// Additionally all activesets that are older than 2 epochs are deleted at the beginning of an epoch anyway, but
// maybe we should revisit this when activesets are no longer bootstrapped.
return pb.db.WithTx(ctx, func(tx sql.Transaction) error {
//
// TODO(mafa): SQLITE_BUSY errors still show up in the logs, so for now this is changed back to WithTxImmediate.
return pb.db.WithTxImmediate(ctx, func(tx sql.Transaction) error {
yes, err := activesets.Has(tx, pb.shared.active.id)
if err != nil {
return err
Expand Down
4 changes: 2 additions & 2 deletions sql/layers/layers.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func GetLatestStateHash(db sql.Executor) (rst types.Hash32, err error) {
}); err != nil {
return rst, fmt.Errorf("failed to load latest state root %w", err)
} else if rows == 0 {
return rst, fmt.Errorf("%w: state root doesnt exist", sql.ErrNotFound)
return rst, fmt.Errorf("%w: state root does not exist", sql.ErrNotFound)
}
return rst, err
}
Expand All @@ -117,7 +117,7 @@ func GetStateHash(db sql.Executor, lid types.LayerID) (rst types.Hash32, err err
}); err != nil {
return rst, fmt.Errorf("failed to load state root for %v: %w", lid, err)
} else if rows == 0 {
return rst, fmt.Errorf("%w: %s doesnt exist", sql.ErrNotFound, lid)
return rst, fmt.Errorf("%w: %s does not exist", sql.ErrNotFound, lid)
}
return rst, err
}
Expand Down
13 changes: 8 additions & 5 deletions systest/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@ image_name ?= $(org)/systest:$(version_info)
certifier_image ?= $(org)/certifier-service:v0.8.4
poet_image ?= $(org)/poet:v0.10.10
post_service_image ?= $(org)/post-service:v0.8.4
post_init_image ?= $(org)/postcli:v0.12.5
post_init_image ?= $(org)/postcli:v0.12.10
smesher_image ?= $(org)/go-spacemesh-dev:$(version_info)
old_smesher_image ?= $(org)/go-spacemesh-dev:7b9337a # Update this when new version is released
old_smesher_image ?= $(org)/go-spacemesh-dev:v1.7.7
bs_image ?= $(org)/go-spacemesh-dev-bs:$(version_info)

test_id ?= systest-$(version_info)
test_job_name ?= systest-$(version_info)-$(date)
keep ?= false
clusters ?= 1
size ?= 10
poet_size ?= 3
poet_size ?= 2
level ?= debug
bootstrap ?= 5m
storage ?= standard=1Gi
Expand All @@ -38,8 +38,11 @@ ifeq ($(configname),$(test_job_name))
run_deps = config
endif

command := gotestsum --raw-command -- test2json -t -p systest \
/bin/tests -test.v -test.count=$(count) -test.timeout=60m -test.run=$(test_name) -test.parallel=$(clusters) \
# command := gotestsum --raw-command -- test2json -t -p systest \
/bin/tests -test.v -test.count=$(count) -test.timeout=60m -test.run=$(test_name) -test.parallel=$(clusters) \
-test.failfast=$(failfast) -clusters=$(clusters) -level=$(level) -configname=$(configname)

command := /bin/tests -test.v -test.count=$(count) -test.timeout=60m -test.run=$(test_name) -test.parallel=$(clusters) \
-test.failfast=$(failfast) -clusters=$(clusters) -level=$(level) -configname=$(configname)

.PHONY: docker
Expand Down
3 changes: 1 addition & 2 deletions systest/cluster/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -878,8 +878,7 @@ func deployNode(
corev1.Volume().WithName("config").
WithConfigMap(corev1.ConfigMapVolumeSource().WithName(spacemeshConfigMapName)),
corev1.Volume().WithName("data").
WithEmptyDir(corev1.EmptyDirVolumeSource().
WithSizeLimit(resource.MustParse(ctx.Storage.Size))),
WithEmptyDir(corev1.EmptyDirVolumeSource().WithSizeLimit(resource.MustParse(ctx.Storage.Size))),
).
WithDNSConfig(corev1.PodDNSConfig().WithOptions(
corev1.PodDNSConfigOption().WithName("timeout").WithValue("1"),
Expand Down
12 changes: 7 additions & 5 deletions systest/testcontext/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ var (
10,
)
poetSize = parameters.Int(
"poet-size", "size of the poet servers", 1,
"poet-size", "size of the poet servers", 2,
)
bsSize = parameters.Int(
"bs-size", "size of bootstrappers", 1,
Expand Down Expand Up @@ -250,7 +250,8 @@ func updateContext(ctx *Context) error {
keep, err := strconv.ParseBool(keepval)
if err != nil {
ctx.Log.Panicw("invalid state. keep label should be parsable as a boolean",
"keepval", keepval)
"keepval", keepval,
)
}
ctx.Keep = ctx.Keep || keep

Expand All @@ -261,7 +262,8 @@ func updateContext(ctx *Context) error {
psize, err := strconv.Atoi(psizeval)
if err != nil {
ctx.Log.Panicw("invalid state. poet size label should be parsable as an integer",
"psizeval", psizeval)
"psizeval", psizeval,
)
}
ctx.PoetSize = psize
return nil
Expand Down Expand Up @@ -360,9 +362,9 @@ func New(t *testing.T, opts ...Opt) *Context {
Keep: keep.Get(p),
ClusterSize: clSize,
BootnodeSize: max(2, (clSize/1000)*2),
RemoteSize: 0,
RemoteSize: clSize / 2, // 50% of smeshers are remote
PoetSize: poetSize.Get(p),
OldSize: 0,
OldSize: clSize / 4, // 25% of smeshers are old (use previous version of go-spacemesh)
BootstrapperSize: bsSize.Get(p),
Image: imageFlag.Get(p),
OldImage: oldImageFlag.Get(p),
Expand Down
25 changes: 12 additions & 13 deletions systest/tests/checkpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,9 @@ func TestCheckpoint(t *testing.T) {

tctx := testcontext.New(t)
addedLater := 2
size := min(tctx.ClusterSize, 30)
oldSize := size - addedLater
if tctx.ClusterSize > oldSize {
tctx.Log.Info("cluster size changed to ", oldSize)
tctx.ClusterSize = oldSize
}
oldSize := tctx.ClusterSize - addedLater
tctx.Log.Info("cluster size changed to ", oldSize)
tctx.ClusterSize = oldSize

// at the last layer of epoch 3, in the beginning of poet round 2.
// it is important to avoid check-pointing in the middle of cycle gap
Expand All @@ -63,14 +60,15 @@ func TestCheckpoint(t *testing.T) {
require.EqualValues(t, 4, layersPerEpoch, "checkpoint layer require tuning as layersPerEpoch is changed")
layerDuration := testcontext.LayerDuration.Get(tctx.Parameters)

eg, ctx := errgroup.WithContext(tctx)
first := layersPerEpoch * 2
stop := first + 2
stop := first + 5
receiver := types.GenerateAddress([]byte{11, 1, 1})
tctx.Log.Infow("sending transactions", "from", first, "to", stop-1)
require.NoError(t, sendTransactions(ctx, eg, tctx.Log, cl, first, stop, receiver, 1, 100))
require.NoError(t, eg.Wait())

deadline := cl.Genesis().Add(time.Duration(stop+1) * layerDuration)
ctx, cancel := context.WithDeadline(tctx, deadline)
defer cancel()
require.NoError(t, sendTransactions(ctx, tctx.Log.Desugar(), cl, first, stop, receiver, 1, 100))
require.NoError(t, waitLayer(tctx, cl.Client(0), snapshotLayer))

tctx.Log.Debugw("getting account balances")
Expand Down Expand Up @@ -100,7 +98,8 @@ func TestCheckpoint(t *testing.T) {
diffs = append(diffs, cl.Client(i).Name)
tctx.Log.Errorw("diff checkpoint data",
fmt.Sprintf("reference %v", cl.Client(0).Name), string(checkpoints[0]),
fmt.Sprintf("client %v", cl.Client(i).Name), string(checkpoints[i]))
fmt.Sprintf("client %v", cl.Client(i).Name), string(checkpoints[i]),
)
}
}
require.Empty(t, diffs)
Expand Down Expand Up @@ -173,8 +172,8 @@ func TestCheckpoint(t *testing.T) {
ensureSmeshing(t, tctx, cl, checkpointEpoch)

// increase the cluster size to the original test size
tctx.Log.Info("cluster size changed to ", size)
tctx.ClusterSize = size
tctx.ClusterSize += addedLater
tctx.Log.Info("cluster size changed to ", tctx.ClusterSize)
require.NoError(t, cl.AddSmeshers(tctx, addedLater))

tctx.Log.Infow("waiting for all miners to be smeshing", "last epoch", lastEpoch)
Expand Down
Loading

0 comments on commit 290edd2

Please sign in to comment.