Skip to content

Commit

Permalink
Merge pull request #2885 from ipfs/feature/bloom-cache
Browse files Browse the repository at this point in the history
Add ARC caching and bloom filter for blockstorage
  • Loading branch information
whyrusleeping authored Jul 4, 2016
2 parents 50a7df6 + f13506c commit ad5730d
Show file tree
Hide file tree
Showing 6 changed files with 254 additions and 92 deletions.
181 changes: 181 additions & 0 deletions blocks/blockstore/bloom_cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package blockstore

import (
"github.com/ipfs/go-ipfs/blocks"
key "github.com/ipfs/go-ipfs/blocks/key"
lru "gx/ipfs/QmVYxfoJQiZijTgPNHCHgHELvQpbsJNTg6Crmc3dQkj3yy/golang-lru"
bloom "gx/ipfs/QmWQ2SJisXwcCLsUXLwYCKSfyExXjFRW2WbBH5sqCUnwX5/bbloom"
context "gx/ipfs/QmZy2y8t9zQH2a1b8q2ZSLKp17ATuJoCNxxyMFG5qFExpt/go-net/context"
ds "gx/ipfs/QmfQzVugPq1w5shWRcLWSeiHF4a2meBX7yVD8Vw7GWJM9o/go-datastore"

"sync/atomic"
)

// BloomCached returns Blockstore that caches Has requests using Bloom filter
// Size is size of bloom filter in bytes

This comment has been minimized.

Copy link
@jbenet

jbenet Jul 7, 2016

Member

nice! is there a benchmark for this?

This comment has been minimized.

Copy link
@Kubuxu

Kubuxu Jul 7, 2016

Member

Bloom filter I used itself requires 700ns (1200ns with mutex lock) to respond.
With sizes we are using we should get <0.1% false positive rate up to about 50k blocks in the repo.

It was created to reduce Has requests hitting disk for bitswap responses. We are getting wantlists which need to be checked against our repo, in 98% of cases we don't have given block.

With optimal working of bloom filter we should be able to reduce hits to disk form Has request by 97% using only about 64KiB of memory used.
What is currently missing is dynamic scaling (if blockstore is much bigger than expected) and rebuilding in case of a lot of deletions. Those mechanics are quite complex and we decided to collect concrete metrics (on which I am working now) before proceeding.

func BloomCached(bs Blockstore, bloomSize, lruSize int) (*bloomcache, error) {
bl, err := bloom.New(float64(bloomSize), float64(7))
if err != nil {
return nil, err
}
arc, err := lru.NewARC(lruSize)
if err != nil {
return nil, err
}
bc := &bloomcache{blockstore: bs, bloom: bl, arc: arc}
bc.Invalidate()
go bc.Rebuild()

return bc, nil
}

type bloomcache struct {
bloom *bloom.Bloom
active int32

arc *lru.ARCCache
// This chan is only used for testing to wait for bloom to enable
rebuildChan chan struct{}
blockstore Blockstore

// Statistics
hits uint64
misses uint64
}

func (b *bloomcache) Invalidate() {
b.rebuildChan = make(chan struct{})
atomic.StoreInt32(&b.active, 0)
}

func (b *bloomcache) BloomActive() bool {
return atomic.LoadInt32(&b.active) != 0
}

func (b *bloomcache) Rebuild() {
ctx := context.TODO()
evt := log.EventBegin(ctx, "bloomcache.Rebuild")
defer evt.Done()

ch, err := b.blockstore.AllKeysChan(ctx)
if err != nil {
log.Errorf("AllKeysChan failed in bloomcache rebuild with: %v", err)
return
}
for key := range ch {
b.bloom.AddTS([]byte(key)) // Use binary key, the more compact the better
}
close(b.rebuildChan)
atomic.StoreInt32(&b.active, 1)
}

func (b *bloomcache) DeleteBlock(k key.Key) error {
if has, ok := b.hasCached(k); ok && !has {
return ErrNotFound
}

b.arc.Remove(k) // Invalidate cache before deleting.
err := b.blockstore.DeleteBlock(k)
switch err {
case nil:
b.arc.Add(k, false)
case ds.ErrNotFound, ErrNotFound:
b.arc.Add(k, false)
default:
return err
}
return nil
}

// if ok == false has is inconclusive
// if ok == true then has respons to question: is it contained
func (b *bloomcache) hasCached(k key.Key) (has bool, ok bool) {
if k == "" {
// Return cache invalid so call to blockstore
// in case of invalid key is forwarded deeper
return false, false
}
if b.BloomActive() {
blr := b.bloom.HasTS([]byte(k))
if blr == false { // not contained in bloom is only conclusive answer bloom gives
return false, true
}
}
h, ok := b.arc.Get(k)
if ok {
return h.(bool), ok
} else {
return false, false
}
}

func (b *bloomcache) Has(k key.Key) (bool, error) {
if has, ok := b.hasCached(k); ok {
return has, nil
}

res, err := b.blockstore.Has(k)
if err == nil {
b.arc.Add(k, res)
}
return res, err
}

func (b *bloomcache) Get(k key.Key) (blocks.Block, error) {
if has, ok := b.hasCached(k); ok && !has {
return nil, ErrNotFound
}

bl, err := b.blockstore.Get(k)
if bl == nil && err == ErrNotFound {
b.arc.Add(k, false)
} else if bl != nil {
b.arc.Add(k, true)
}
return bl, err
}

func (b *bloomcache) Put(bl blocks.Block) error {
if has, ok := b.hasCached(bl.Key()); ok && has {
return nil
}

err := b.blockstore.Put(bl)
if err == nil {
b.bloom.AddTS([]byte(bl.Key()))
b.arc.Add(bl.Key(), true)
}
return err
}

func (b *bloomcache) PutMany(bs []blocks.Block) error {
var good []blocks.Block
for _, block := range bs {
if has, ok := b.hasCached(block.Key()); !ok || (ok && !has) {
good = append(good, block)
}
}
err := b.blockstore.PutMany(bs)
if err == nil {
for _, block := range bs {
b.bloom.AddTS([]byte(block.Key()))
}
}
return err
}

func (b *bloomcache) AllKeysChan(ctx context.Context) (<-chan key.Key, error) {
return b.blockstore.AllKeysChan(ctx)
}

func (b *bloomcache) GCLock() Unlocker {
return b.blockstore.(GCBlockstore).GCLock()
}

func (b *bloomcache) PinLock() Unlocker {
return b.blockstore.(GCBlockstore).PinLock()
}

func (b *bloomcache) GCRequested() bool {
return b.blockstore.(GCBlockstore).GCRequested()
}
Original file line number Diff line number Diff line change
@@ -1,34 +1,44 @@
package blockstore

import (
"fmt"
"sync"
"testing"
"time"

"github.com/ipfs/go-ipfs/blocks"

ds "gx/ipfs/QmfQzVugPq1w5shWRcLWSeiHF4a2meBX7yVD8Vw7GWJM9o/go-datastore"
dsq "gx/ipfs/QmfQzVugPq1w5shWRcLWSeiHF4a2meBX7yVD8Vw7GWJM9o/go-datastore/query"
syncds "gx/ipfs/QmfQzVugPq1w5shWRcLWSeiHF4a2meBX7yVD8Vw7GWJM9o/go-datastore/sync"
)

func TestReturnsErrorWhenSizeNegative(t *testing.T) {
bs := NewBlockstore(syncds.MutexWrap(ds.NewMapDatastore()))
_, err := WriteCached(bs, -1)
if err != nil {
return
_, err := BloomCached(bs, 100, -1)
if err == nil {
t.Fail()
}
_, err = BloomCached(bs, -1, 100)
if err == nil {
t.Fail()
}
t.Fail()
}

func TestRemoveCacheEntryOnDelete(t *testing.T) {
b := blocks.NewBlock([]byte("foo"))
cd := &callbackDatastore{f: func() {}, ds: ds.NewMapDatastore()}
bs := NewBlockstore(syncds.MutexWrap(cd))
cachedbs, err := WriteCached(bs, 1)
cachedbs, err := BloomCached(bs, 1, 1)
if err != nil {
t.Fatal(err)
}
cachedbs.Put(b)

cd.Lock()
writeHitTheDatastore := false
cd.Unlock()

cd.SetFunc(func() {
writeHitTheDatastore = true
})
Expand All @@ -43,7 +53,7 @@ func TestRemoveCacheEntryOnDelete(t *testing.T) {
func TestElideDuplicateWrite(t *testing.T) {
cd := &callbackDatastore{f: func() {}, ds: ds.NewMapDatastore()}
bs := NewBlockstore(syncds.MutexWrap(cd))
cachedbs, err := WriteCached(bs, 1)
cachedbs, err := BloomCached(bs, 1, 1)
if err != nil {
t.Fatal(err)
}
Expand All @@ -56,36 +66,78 @@ func TestElideDuplicateWrite(t *testing.T) {
})
cachedbs.Put(b1)
}
func TestHasIsBloomCached(t *testing.T) {
cd := &callbackDatastore{f: func() {}, ds: ds.NewMapDatastore()}
bs := NewBlockstore(syncds.MutexWrap(cd))

for i := 0; i < 1000; i++ {
bs.Put(blocks.NewBlock([]byte(fmt.Sprintf("data: %d", i))))
}
cachedbs, err := BloomCached(bs, 256*1024, 128)
if err != nil {
t.Fatal(err)
}

select {
case <-cachedbs.rebuildChan:
case <-time.After(1 * time.Second):
t.Fatalf("Timeout wating for rebuild: %d", cachedbs.bloom.ElementsAdded())
}

cacheFails := 0
cd.SetFunc(func() {
cacheFails++
})

for i := 0; i < 1000; i++ {
cachedbs.Has(blocks.NewBlock([]byte(fmt.Sprintf("data: %d", i+2000))).Key())
}

if float64(cacheFails)/float64(1000) > float64(0.05) {
t.Fatal("Bloom filter has cache miss rate of more than 5%")
}
}

type callbackDatastore struct {
sync.Mutex
f func()
ds ds.Datastore
}

func (c *callbackDatastore) SetFunc(f func()) { c.f = f }
func (c *callbackDatastore) SetFunc(f func()) {
c.Lock()
defer c.Unlock()
c.f = f
}

func (c *callbackDatastore) CallF() {
c.Lock()
defer c.Unlock()
c.f()
}

func (c *callbackDatastore) Put(key ds.Key, value interface{}) (err error) {
c.f()
c.CallF()
return c.ds.Put(key, value)
}

func (c *callbackDatastore) Get(key ds.Key) (value interface{}, err error) {
c.f()
c.CallF()
return c.ds.Get(key)
}

func (c *callbackDatastore) Has(key ds.Key) (exists bool, err error) {
c.f()
c.CallF()
return c.ds.Has(key)
}

func (c *callbackDatastore) Delete(key ds.Key) (err error) {
c.f()
c.CallF()
return c.ds.Delete(key)
}

func (c *callbackDatastore) Query(q dsq.Query) (dsq.Results, error) {
c.f()
c.CallF()
return c.ds.Query(q)
}

Expand Down
Loading

0 comments on commit ad5730d

Please sign in to comment.