From f0e1f0827951702bf21a139b9e64d004b334912f Mon Sep 17 00:00:00 2001
From: Adin Schmahmann
Date: Fri, 14 May 2021 01:04:19 -0700
Subject: [PATCH] Bulk Provide/Reproviding System (#34)

* batched: added a batched providing and reproviding system that takes
advantage of an underlying provide-emitting system that can operate on
many items at a time. The implementation is experimental and likely to
change.

* queue: modified documentation to indicate that its durability is best
effort and determined by the underlying datastore

This commit was moved from ipfs/go-ipfs-provider@d391dae4a595473f6797eb5d5b803a529a7bbdbc
---
 provider/batched/system.go      | 415 ++++++++++++++++++++++++++++++++
 provider/batched/system_test.go | 117 +++++++++
 provider/queue/queue.go         |   8 +-
 3 files changed, 536 insertions(+), 4 deletions(-)
 create mode 100644 provider/batched/system.go
 create mode 100644 provider/batched/system_test.go

diff --git a/provider/batched/system.go b/provider/batched/system.go
new file mode 100644
index 000000000..5637e55b1
--- /dev/null
+++ b/provider/batched/system.go
@@ -0,0 +1,415 @@
+package batched
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strconv"
+	"sync"
+	"time"
+
+	"github.com/ipfs/go-cid"
+	"github.com/ipfs/go-datastore"
+	provider "github.com/ipfs/go-ipfs-provider"
+	"github.com/ipfs/go-ipfs-provider/queue"
+	"github.com/ipfs/go-ipfs-provider/simple"
+	logging "github.com/ipfs/go-log"
+	"github.com/ipfs/go-verifcid"
+	"github.com/multiformats/go-multihash"
+)
+
+var log = logging.Logger("provider.batched")
+
+type BatchProvidingSystem struct {
+	ctx     context.Context
+	close   context.CancelFunc
+	closewg sync.WaitGroup
+
+	reprovideInterval        time.Duration
+	initialReprovideDelay    time.Duration
+	initialReprovideDelaySet bool
+
+	rsys        provideMany
+	keyProvider simple.KeyChanFunc
+
+	q  *queue.Queue
+	ds datastore.Batching
+
+	reprovideCh chan cid.Cid
+
+	totalProvides, lastReprovideBatchSize     int
+	avgProvideDuration, lastReprovideDuration time.Duration
+}
+
+var _ provider.System = (*BatchProvidingSystem)(nil)
+
+type provideMany interface {
+	ProvideMany(ctx context.Context, keys []multihash.Multihash) error
+	Ready() bool
+}
+
+// Option defines the functional option type that can be used to configure
+// BatchProvidingSystem instances
+type Option func(system *BatchProvidingSystem) error
+
+var lastReprovideKey = datastore.NewKey("/provider/reprovide/lastreprovide")
+
+func New(provider provideMany, q *queue.Queue, opts ...Option) (*BatchProvidingSystem, error) {
+	s := &BatchProvidingSystem{
+		reprovideInterval: time.Hour * 24,
+		rsys:              provider,
+		keyProvider:       nil,
+		q:                 q,
+		ds:                datastore.NewMapDatastore(),
+		reprovideCh:       make(chan cid.Cid),
+	}
+
+	for _, o := range opts {
+		if err := o(s); err != nil {
+			return nil, err
+		}
+	}
+
+	// Set up the default behavior for the initial reprovide delay
+	//
+	// If the reprovide ticker is larger than a minute (likely),
+	// provide once after we've been up a minute.
+	//
+	// Don't provide _immediately_ as we might be just about to stop.
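+	//
+	// This default can be overridden with the (currently unexported)
+	// initialReprovideDelay option, as the tests do, e.g.:
+	//
+	//	sys, err := New(r, q, initialReprovideDelay(5*time.Second))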
+	if !s.initialReprovideDelaySet && s.reprovideInterval > time.Minute {
+		s.initialReprovideDelay = time.Minute
+		s.initialReprovideDelaySet = true
+	}
+
+	if s.keyProvider == nil {
+		s.keyProvider = func(ctx context.Context) (<-chan cid.Cid, error) {
+			ch := make(chan cid.Cid)
+			close(ch)
+			return ch, nil
+		}
+	}
+
+	// This is after the options processing so we do not have to worry about
+	// leaking a context if there is an error while processing the options
+	ctx, cancel := context.WithCancel(context.Background())
+	s.ctx = ctx
+	s.close = cancel
+
+	return s, nil
+}
+
+func Datastore(batching datastore.Batching) Option {
+	return func(system *BatchProvidingSystem) error {
+		system.ds = batching
+		return nil
+	}
+}
+
+func ReproviderInterval(duration time.Duration) Option {
+	return func(system *BatchProvidingSystem) error {
+		system.reprovideInterval = duration
+		return nil
+	}
+}
+
+func KeyProvider(fn simple.KeyChanFunc) Option {
+	return func(system *BatchProvidingSystem) error {
+		system.keyProvider = fn
+		return nil
+	}
+}
+
+func initialReprovideDelay(duration time.Duration) Option {
+	return func(system *BatchProvidingSystem) error {
+		system.initialReprovideDelaySet = true
+		system.initialReprovideDelay = duration
+		return nil
+	}
+}
+
+func (s *BatchProvidingSystem) Run() {
+	// how long we wait between hearing about the first provide and batching up the provides to send out
+	const pauseDetectionThreshold = time.Millisecond * 500
+	// how long we are willing to collect provides for the batch after we receive the first one
+	const maxCollectionDuration = time.Minute * 10
+
+	provCh := s.q.Dequeue()
+
+	s.closewg.Add(1)
+	go func() {
+		defer s.closewg.Done()
+
+		m := make(map[cid.Cid]struct{})
+
+		// set up stopped timers
+		maxCollectionDurationTimer := time.NewTimer(time.Hour)
+		pauseDetectTimer := time.NewTimer(time.Hour)
+		stopAndEmptyTimer(maxCollectionDurationTimer)
+		stopAndEmptyTimer(pauseDetectTimer)
+
+		// make sure timers are cleaned up
+		defer maxCollectionDurationTimer.Stop()
+		defer pauseDetectTimer.Stop()
+
+		resetTimersAfterReceivingProvide := func() {
+			firstProvide := len(m) == 0
+			if firstProvide {
+				// after receiving the first provide, start up the timers
+				maxCollectionDurationTimer.Reset(maxCollectionDuration)
+				pauseDetectTimer.Reset(pauseDetectionThreshold)
+			} else {
+				// otherwise just do a full restart of the pause timer
+				stopAndEmptyTimer(pauseDetectTimer)
+				pauseDetectTimer.Reset(pauseDetectionThreshold)
+			}
+		}
+
+		for {
+			performedReprovide := false
+
+			// at the start of every loop the maxCollectionDurationTimer and pauseDetectTimer should already be
+			// stopped and have empty channels
+		loop:
+			for {
+				select {
+				case <-maxCollectionDurationTimer.C:
+					// if this timer has fired then the pause timer has started so let's stop and empty it
+					stopAndEmptyTimer(pauseDetectTimer)
+					break loop
+				default:
+				}
+
+				select {
+				case c := <-provCh:
+					resetTimersAfterReceivingProvide()
+					m[c] = struct{}{}
+					continue
+				default:
+				}
+
+				select {
+				case c := <-provCh:
+					resetTimersAfterReceivingProvide()
+					m[c] = struct{}{}
+				case c := <-s.reprovideCh:
+					resetTimersAfterReceivingProvide()
+					m[c] = struct{}{}
+					performedReprovide = true
+				case <-pauseDetectTimer.C:
+					// if this timer has fired then the max collection timer has started so let's stop and empty it
+					stopAndEmptyTimer(maxCollectionDurationTimer)
+					break loop
+				case <-maxCollectionDurationTimer.C:
+					// if this timer has fired then the pause timer has started so let's stop and empty it
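+					// (the stop-and-drain dance in stopAndEmptyTimer exists
+					// because of time.Timer semantics: Stop reports false if
+					// the timer already fired, and the stale value must then
+					// be drained from C before the timer is Reset and reused)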
+					stopAndEmptyTimer(pauseDetectTimer)
+					break loop
+				case <-s.ctx.Done():
+					return
+				}
+			}
+
+			if len(m) == 0 {
+				continue
+			}
+
+			keys := make([]multihash.Multihash, 0, len(m))
+			for c := range m {
+				delete(m, c)
+
+				// hash security
+				if err := verifcid.ValidateCid(c); err != nil {
+					log.Errorf("insecure hash in reprovider, %s (%s)", c, err)
+					continue
+				}
+
+				keys = append(keys, c.Hash())
+			}
+
+			for !s.rsys.Ready() {
+				log.Debugf("reprovider system not ready")
+				select {
+				case <-time.After(time.Minute):
+				case <-s.ctx.Done():
+					return
+				}
+			}
+
+			log.Debugf("starting provide of %d keys", len(keys))
+			start := time.Now()
+			err := s.rsys.ProvideMany(s.ctx, keys)
+			if err != nil {
+				log.Debugf("providing failed: %v", err)
+				continue
+			}
+			dur := time.Since(start)
+
+			totalProvideTime := int64(s.totalProvides) * int64(s.avgProvideDuration)
+			recentAvgProvideDuration := time.Duration(int64(dur) / int64(len(keys)))
+			s.avgProvideDuration = time.Duration((totalProvideTime + int64(dur)) / int64(s.totalProvides+len(keys)))
+			s.totalProvides += len(keys)
+
+			log.Debugf("finished providing %d keys. It took %v with an average of %v per provide", len(keys), dur, recentAvgProvideDuration)
+
+			if performedReprovide {
+				s.lastReprovideBatchSize = len(keys)
+				s.lastReprovideDuration = dur
+
+				if err := s.ds.Put(lastReprovideKey, storeTime(time.Now())); err != nil {
+					log.Errorf("could not store last reprovide time: %v", err)
+				}
+				if err := s.ds.Sync(lastReprovideKey); err != nil {
+					log.Errorf("could not perform sync of last reprovide time: %v", err)
+				}
+			}
+		}
+	}()
+
+	s.closewg.Add(1)
+	go func() {
+		defer s.closewg.Done()
+
+		var initialReprovideCh, reprovideCh <-chan time.Time
+
+		// If reproviding is enabled (non-zero)
+		if s.reprovideInterval > 0 {
+			reprovideTicker := time.NewTicker(s.reprovideInterval)
+			defer reprovideTicker.Stop()
+			reprovideCh = reprovideTicker.C
+
+			// if there is a non-zero initial reprovide time that was set in the initializer or if the fallback has been set
+			if s.initialReprovideDelaySet {
+				initialReprovideTimer := time.NewTimer(s.initialReprovideDelay)
+				defer initialReprovideTimer.Stop()
+
+				initialReprovideCh = initialReprovideTimer.C
+			}
+		}
+
+		for s.ctx.Err() == nil {
+			select {
+			case <-initialReprovideCh:
+			case <-reprovideCh:
+			case <-s.ctx.Done():
+				return
+			}
+
+			err := s.reprovide(s.ctx, false)
+
+			// only log if we've hit an actual error; otherwise we are just shutting down
+			if s.ctx.Err() == nil && err != nil {
+				log.Errorf("failed to reprovide: %s", err)
+			}
+		}
+	}()
+}
+
+func stopAndEmptyTimer(t *time.Timer) {
+	if !t.Stop() {
+		<-t.C
+	}
+}
+
+func storeTime(t time.Time) []byte {
+	val := []byte(fmt.Sprintf("%d", t.UnixNano()))
+	return val
+}
+
+func parseTime(b []byte) (time.Time, error) {
+	tns, err := strconv.ParseInt(string(b), 10, 64)
+	if err != nil {
+		return time.Time{}, err
+	}
+	return time.Unix(0, tns), nil
+}
+
+func (s *BatchProvidingSystem) Close() error {
+	s.close()
+	err := s.q.Close()
+	s.closewg.Wait()
+	return err
+}
+
+func (s *BatchProvidingSystem) Provide(cid cid.Cid) error {
+	return s.q.Enqueue(cid)
+}
+
+func (s *BatchProvidingSystem) Reprovide(ctx context.Context) error {
+	return s.reprovide(ctx, true)
+}
+
+func (s *BatchProvidingSystem) reprovide(ctx context.Context, force bool) error {
+	if !s.shouldReprovide() && !force {
+		return nil
+	}
+
+	kch, err := s.keyProvider(ctx)
+	if err != nil {
+		return err
+	}
+
+reprovideCidLoop:
+	for {
+		select {
+		case c, ok := <-kch:
+			if !ok {
+				break reprovideCidLoop
+			}
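+
+			// hand the cid to the batching worker: reprovides flow through
+			// s.reprovideCh rather than the durable queue, so the worker can
+			// batch them with any queued provides and flag the batch as a
+			// reprovide for the stats above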
+			select {
+			case s.reprovideCh <- c:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		case <-ctx.Done():
+			return ctx.Err()
+		}
+	}
+
+	return nil
+}
+
+func (s *BatchProvidingSystem) getLastReprovideTime() (time.Time, error) {
+	val, err := s.ds.Get(lastReprovideKey)
+	if errors.Is(err, datastore.ErrNotFound) {
+		return time.Time{}, nil
+	}
+	if err != nil {
+		return time.Time{}, fmt.Errorf("could not get last reprovide time: %w", err)
+	}
+
+	t, err := parseTime(val)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("could not decode last reprovide time, got %q", string(val))
+	}
+
+	return t, nil
+}
+
+func (s *BatchProvidingSystem) shouldReprovide() bool {
+	t, err := s.getLastReprovideTime()
+	if err != nil {
+		log.Debugf("getting last reprovide time failed: %s", err)
+		return false
+	}
+
+	if time.Since(t) < time.Duration(float64(s.reprovideInterval)*0.5) {
+		return false
+	}
+	return true
+}
+
+type BatchedProviderStats struct {
+	TotalProvides, LastReprovideBatchSize     int
+	AvgProvideDuration, LastReprovideDuration time.Duration
+}
+
+// Stat returns various stats about this provider system
+func (s *BatchProvidingSystem) Stat(ctx context.Context) (BatchedProviderStats, error) {
+	// TODO: Does it matter that there is no locking around the total+average values?
+	return BatchedProviderStats{
+		TotalProvides:          s.totalProvides,
+		LastReprovideBatchSize: s.lastReprovideBatchSize,
+		AvgProvideDuration:     s.avgProvideDuration,
+		LastReprovideDuration:  s.lastReprovideDuration,
+	}, nil
+}
diff --git a/provider/batched/system_test.go b/provider/batched/system_test.go
new file mode 100644
index 000000000..b2b312020
--- /dev/null
+++ b/provider/batched/system_test.go
@@ -0,0 +1,117 @@
+package batched
+
+import (
+	"context"
+	"strconv"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/ipfs/go-cid"
+	"github.com/ipfs/go-datastore"
+	dssync "github.com/ipfs/go-datastore/sync"
+	mh "github.com/multiformats/go-multihash"
+
+	q "github.com/ipfs/go-ipfs-provider/queue"
+)
+
+type mockProvideMany struct {
+	lk   sync.Mutex
+	keys []mh.Multihash
+}
+
+func (m *mockProvideMany) ProvideMany(ctx context.Context, keys []mh.Multihash) error {
+	m.lk.Lock()
+	defer m.lk.Unlock()
+	m.keys = keys
+	return nil
+}
+
+func (m *mockProvideMany) Ready() bool {
+	return true
+}
+
+func (m *mockProvideMany) GetKeys() []mh.Multihash {
+	m.lk.Lock()
+	defer m.lk.Unlock()
+	return m.keys[:]
+}
+
+var _ provideMany = (*mockProvideMany)(nil)
+
+func TestBatched(t *testing.T) {
+	ctx := context.Background()
+
+	ds := dssync.MutexWrap(datastore.NewMapDatastore())
+	queue, err := q.NewQueue(ctx, "test", ds)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	provider := &mockProvideMany{}
+
+	ctx, cancel := context.WithTimeout(ctx, time.Second*10)
+	defer cancel()
+
+	const numProvides = 100
+	keysToProvide := make(map[cid.Cid]int)
+	for i := 0; i < numProvides; i++ {
+		h, err := mh.Sum([]byte(strconv.Itoa(i)), mh.SHA2_256, -1)
+		if err != nil {
+			panic(err)
+		}
+		c := cid.NewCidV1(cid.Raw, h)
+		keysToProvide[c] = i
+	}
+
+	batchSystem, err := New(provider, queue, KeyProvider(func(ctx context.Context) (<-chan cid.Cid, error) {
+		ch := make(chan cid.Cid)
+		go func() {
+			for k := range keysToProvide {
+				select {
+				case ch <- k:
+				case <-ctx.Done():
+					return
+				}
+			}
+		}()
+		return ch, nil
+	}), initialReprovideDelay(0))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	batchSystem.Run()
+
+	var keys []mh.Multihash
+	for {
+		if ctx.Err() != nil {
+			t.Fatal("test hung")
+		}
+		keys = provider.GetKeys()
+		if len(keys) != 0 {
+			break
+		}
+		time.Sleep(time.Millisecond * 100)
+	}
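+
+	// note: the mock keeps only the most recent ProvideMany call, so this
+	// poll effectively assumes all 100 cids arrive in a single batch; the
+	// 500ms pause-detection window makes that very likely within the 10s
+	// deadline, but it is a timing assumption rather than a guarantee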
+	if len(keys) != numProvides {
+		t.Fatalf("expected %d provider keys, got %d", numProvides, len(keys))
+	}
+
+	provMap := make(map[string]struct{})
+	for _, k := range keys {
+		provMap[string(k)] = struct{}{}
+	}
+
+	for i := 0; i < numProvides; i++ {
+		h, err := mh.Sum([]byte(strconv.Itoa(i)), mh.SHA2_256, -1)
+		if err != nil {
+			panic(err)
+		}
+		if _, found := provMap[string(h)]; !found {
+			t.Fatalf("could not find provider with value %d", i)
+		}
+	}
+}
diff --git a/provider/queue/queue.go b/provider/queue/queue.go
index 2c3350256..e81e341f6 100644
--- a/provider/queue/queue.go
+++ b/provider/queue/queue.go
@@ -14,11 +14,11 @@ import (
 var log = logging.Logger("provider.queue")
 
-// Queue provides a durable, FIFO interface to the datastore for storing cids
+// Queue provides a best-effort durable, FIFO interface to the datastore for storing cids
 //
-// Durability just means that cids in the process of being provided when a
-// crash or shutdown occurs will still be in the queue when the node is
-// brought back online.
+// Best-effort durability means that cids in the process of being provided when a
+// crash or shutdown occurs may still be in the queue when the node is brought back
+// online, depending on whether the underlying datastore has synchronous or asynchronous writes.
 type Queue struct {
 	// used to differentiate queues in datastore
 	// e.g. provider vs reprovider
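
Editor's note: the sketch below (not part of the patch) shows how the pieces
added here wire together. The noopRouter type is a hypothetical stand-in for a
real bulk-capable router such as a DHT client; any value with ProvideMany and
Ready methods can be passed to batched.New. The queue name and reprovide
interval are arbitrary example values; everything else uses only the APIs
introduced above.

	package main

	import (
		"context"
		"time"

		"github.com/ipfs/go-cid"
		"github.com/ipfs/go-datastore"
		dssync "github.com/ipfs/go-datastore/sync"
		"github.com/multiformats/go-multihash"

		"github.com/ipfs/go-ipfs-provider/batched"
		q "github.com/ipfs/go-ipfs-provider/queue"
	)

	// noopRouter stands in for a real router that supports bulk provides.
	type noopRouter struct{}

	func (noopRouter) ProvideMany(ctx context.Context, keys []multihash.Multihash) error {
		return nil
	}

	func (noopRouter) Ready() bool { return true }

	func main() {
		ctx := context.Background()

		// the queue persists enqueued cids with best-effort durability
		queue, err := q.NewQueue(ctx, "provider-example", dssync.MutexWrap(datastore.NewMapDatastore()))
		if err != nil {
			panic(err)
		}

		sys, err := batched.New(noopRouter{}, queue,
			batched.Datastore(datastore.NewMapDatastore()),
			batched.ReproviderInterval(12*time.Hour),
			batched.KeyProvider(func(ctx context.Context) (<-chan cid.Cid, error) {
				// a real node would stream every cid it wants reprovided
				// here; a closed, empty channel means "nothing to reprovide"
				ch := make(chan cid.Cid)
				close(ch)
				return ch, nil
			}),
		)
		if err != nil {
			panic(err)
		}

		sys.Run()
		defer sys.Close()

		// individual cids are enqueued with sys.Provide(c) and sent out in
		// one ProvideMany call once the worker detects a pause in incoming
		// provides
	}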