diff --git a/memiavl/benchmark_test.go b/memiavl/benchmark_test.go index 4dc68886b8..9f2938e495 100644 --- a/memiavl/benchmark_test.go +++ b/memiavl/benchmark_test.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/binary" "math/rand" + "sort" "testing" iavlcache "github.com/cosmos/iavl/cache" @@ -146,6 +147,22 @@ func BenchmarkRandomGet(b *testing.B) { _ = m[string(targetKey)] } }) + + b.Run("binary-search", func(b *testing.B) { + // the last benchmark sort the items in place + sort.Slice(items, func(i, j int) bool { + return bytes.Compare(items[i].key, items[j].key) < 0 + }) + cmp := func(i int) bool { return bytes.Compare(items[i].key, targetKey) != -1 } + i := sort.Search(len(items), cmp) + require.Equal(b, targetValue, items[i].value) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + n := sort.Search(len(items), cmp) + _ = items[n].value + } + }) } func BenchmarkRandomSet(b *testing.B) { diff --git a/memiavl/export.go b/memiavl/export.go index a2a1952393..7ee20a67c6 100644 --- a/memiavl/export.go +++ b/memiavl/export.go @@ -1,6 +1,7 @@ package memiavl import ( + "context" "fmt" "math" @@ -10,6 +11,11 @@ import ( protoio "github.com/gogo/protobuf/io" ) +// exportBufferSize is the number of nodes to buffer in the exporter. It improves throughput by +// processing multiple nodes per context switch, but take care to avoid excessive memory usage, +// especially since callers may export several IAVL stores in parallel (e.g. the Cosmos SDK). +const exportBufferSize = 32 + func (db *DB) Snapshot(height uint64, protoWriter protoio.Writer) error { if height > math.MaxUint32 { return fmt.Errorf("height overflows uint32: %d", height) @@ -59,26 +65,89 @@ func (db *DB) Snapshot(height uint64, protoWriter protoio.Writer) error { type Exporter struct { snapshot *Snapshot - i uint32 - count int + ch chan *iavl.ExportNode + cancel context.CancelFunc +} + +func newExporter(snapshot *Snapshot) *Exporter { + ctx, cancel := context.WithCancel(context.Background()) + exporter := &Exporter{ + snapshot: snapshot, + ch: make(chan *iavl.ExportNode, exportBufferSize), + cancel: cancel, + } + go exporter.export(ctx) + return exporter +} + +func (e *Exporter) export(ctx context.Context) { + defer close(e.ch) + + if e.snapshot.leavesLen() == 0 { + return + } + + if e.snapshot.leavesLen() == 1 { + leaf := e.snapshot.Leaf(0) + e.ch <- &iavl.ExportNode{ + Height: 0, + Version: int64(leaf.Version()), + Key: leaf.Key(), + Value: leaf.Value(), + } + return + } + + var pendingTrees int + var i, j uint32 + for ; i < uint32(e.snapshot.nodesLen()); i++ { + // pending branch node + node := e.snapshot.nodesLayout.Node(i) + for pendingTrees < int(node.PreTrees())+2 { + // add more leaf nodes + leaf := e.snapshot.leavesLayout.Leaf(j) + key, value := e.snapshot.KeyValue(leaf.KeyOffset()) + enode := &iavl.ExportNode{ + Height: 0, + Version: int64(leaf.Version()), + Key: key, + Value: value, + } + j++ + pendingTrees++ + + select { + case e.ch <- enode: + case <-ctx.Done(): + return + } + } + enode := &iavl.ExportNode{ + Height: int8(node.Height()), + Version: int64(node.Version()), + Key: e.snapshot.LeafKey(node.KeyLeaf()), + } + pendingTrees-- + + select { + case e.ch <- enode: + case <-ctx.Done(): + return + } + } } func (e *Exporter) Next() (*iavl.ExportNode, error) { - if int(e.i) >= e.count { - return nil, iavl.ExportDone + if exportNode, ok := <-e.ch; ok { + return exportNode, nil } - node := e.snapshot.Node(e.i) - e.i++ + return nil, iavl.ExportDone +} - height := node.Height() - var value []byte - if height == 0 { - value = node.Value() +// Close closes the exporter. It is safe to call multiple times. +func (e *Exporter) Close() { + e.cancel() + for range e.ch { // drain channel } - return &iavl.ExportNode{ - Height: int8(height), - Version: int64(node.Version()), - Key: node.Key(), - Value: value, - }, nil + e.snapshot = nil } diff --git a/memiavl/import.go b/memiavl/import.go index 26a048bc74..191f127a95 100644 --- a/memiavl/import.go +++ b/memiavl/import.go @@ -144,13 +144,13 @@ func doImport(dir string, version int64, nodes <-chan *iavl.ExportNode, writeHas } } - switch len(i.indexStack) { + switch len(i.leavesStack) { case 0: - return EmptyRootNodeIndex, nil + return 0, nil case 1: - return i.indexStack[0], nil + return i.leafCounter, nil default: - return 0, fmt.Errorf("invalid node structure, found stack size %v after imported", len(i.indexStack)) + return 0, fmt.Errorf("invalid node structure, found stack size %v after imported", len(i.leavesStack)) } }) } @@ -158,8 +158,10 @@ func doImport(dir string, version int64, nodes <-chan *iavl.ExportNode, writeHas type importer struct { snapshotWriter - indexStack []uint32 - nodeStack []*MemNode + // keep track of how many leaves has been written before the pending nodes + leavesStack []uint32 + // keep track of the pending nodes + nodeStack []*MemNode } func (i *importer) Add(n *iavl.ExportNode) error { @@ -169,24 +171,23 @@ func (i *importer) Add(n *iavl.ExportNode) error { if n.Height == 0 { node := &MemNode{ - height: uint8(n.Height), + height: 0, size: 1, version: uint32(n.Version), key: n.Key, value: n.Value, } nodeHash := node.Hash() - idx, err := i.writeLeaf(node.version, node.key, node.value, nodeHash) - if err != nil { + if err := i.writeLeaf(node.version, node.key, node.value, nodeHash); err != nil { return err } - i.indexStack = append(i.indexStack, idx) + i.leavesStack = append(i.leavesStack, i.leafCounter) i.nodeStack = append(i.nodeStack, node) return nil } // branch node - leftIndex := i.indexStack[len(i.indexStack)-2] + keyLeaf := i.leavesStack[len(i.leavesStack)-2] leftNode := i.nodeStack[len(i.nodeStack)-2] rightNode := i.nodeStack[len(i.nodeStack)-1] @@ -199,13 +200,13 @@ func (i *importer) Add(n *iavl.ExportNode) error { right: rightNode, } nodeHash := node.Hash() - idx, err := i.writeBranch(node.version, uint32(node.size), node.height, leftIndex+1, nodeHash) - if err != nil { + preTrees := uint8(len(i.nodeStack) - 2) + if err := i.writeBranch(node.version, uint32(node.size), node.height, preTrees, keyLeaf, nodeHash); err != nil { return err } - i.indexStack = i.indexStack[:len(i.indexStack)-2] - i.indexStack = append(i.indexStack, idx) + i.leavesStack = i.leavesStack[:len(i.leavesStack)-2] + i.leavesStack = append(i.leavesStack, i.leafCounter) i.nodeStack = i.nodeStack[:len(i.nodeStack)-2] i.nodeStack = append(i.nodeStack, node) diff --git a/memiavl/iterator.go b/memiavl/iterator.go index 9e2e6208c0..99098bbc88 100644 --- a/memiavl/iterator.go +++ b/memiavl/iterator.go @@ -82,7 +82,7 @@ func (iter *Iterator) Next() { afterStart := iter.start == nil || startCmp < 0 beforeEnd := iter.end == nil || bytes.Compare(key, iter.end) < 0 - if isLeaf(node) { + if node.IsLeaf() { startOrAfter := afterStart || startCmp == 0 if startOrAfter && beforeEnd { iter.key = key diff --git a/memiavl/layout_little_endian.go b/memiavl/layout_little_endian.go index 5d04707eee..51c0f3e233 100644 --- a/memiavl/layout_little_endian.go +++ b/memiavl/layout_little_endian.go @@ -30,6 +30,10 @@ func (node NodeLayout) Height() uint8 { return node.data[OffsetHeight] } +func (node NodeLayout) PreTrees() uint8 { + return node.data[OffsetPreTrees] +} + func (node NodeLayout) Version() uint32 { return binary.LittleEndian.Uint32(node.data[OffsetVersion : OffsetVersion+4]) } @@ -38,14 +42,44 @@ func (node NodeLayout) Size() uint32 { return binary.LittleEndian.Uint32(node.data[OffsetSize : OffsetSize+4]) } -func (node NodeLayout) KeyOffset() uint64 { - return binary.LittleEndian.Uint64(node.data[OffsetKeyOffset : OffsetKeyOffset+8]) -} - -func (node NodeLayout) KeyNode() uint32 { - return binary.LittleEndian.Uint32(node.data[OffsetKeyNode : OffsetKeyNode+4]) +func (node NodeLayout) KeyLeaf() uint32 { + return binary.LittleEndian.Uint32(node.data[OffsetKeyLeaf : OffsetKeyLeaf+4]) } func (node NodeLayout) Hash() []byte { return node.data[OffsetHash : OffsetHash+SizeHash] } + +// Leaves is a continuously stored IAVL nodes +type Leaves struct { + data []byte +} + +func NewLeaves(data []byte) (Leaves, error) { + return Leaves{data}, nil +} + +func (leaves Leaves) Leaf(i uint32) LeafLayout { + offset := int(i) * SizeLeaf + return LeafLayout{data: (*[SizeLeaf]byte)(leaves.data[offset : offset+SizeLeaf])} +} + +type LeafLayout struct { + data *[SizeLeaf]byte +} + +func (leaf LeafLayout) Version() uint32 { + return binary.LittleEndian.Uint32(leaf.data[OffsetLeafVersion : OffsetLeafVersion+4]) +} + +func (leaf LeafLayout) KeyLength() uint32 { + return binary.LittleEndian.Uint32(leaf.data[OffsetLeafKeyLen : OffsetLeafKeyLen+4]) +} + +func (leaf LeafLayout) KeyOffset() uint64 { + return binary.LittleEndian.Uint64(leaf.data[OffsetLeafKeyOffset : OffsetLeafKeyOffset+8]) +} + +func (leaf LeafLayout) Hash() []byte { + return leaf.data[OffsetLeafHash : OffsetLeafHash+32] +} diff --git a/memiavl/layout_native.go b/memiavl/layout_native.go index 7f2567943a..b14161aeef 100644 --- a/memiavl/layout_native.go +++ b/memiavl/layout_native.go @@ -43,6 +43,10 @@ func (node *nodeLayout) Height() uint8 { return uint8(node.data[0]) } +func (node NodeLayout) PreTrees() uint8 { + return uint8(node.data[0] >> 8) +} + func (node *nodeLayout) Version() uint32 { return node.data[1] } @@ -51,7 +55,7 @@ func (node *nodeLayout) Size() uint32 { return node.data[2] } -func (node *nodeLayout) KeyNode() uint32 { +func (node *nodeLayout) KeyLeaf() uint32 { return node.data[3] } @@ -62,3 +66,51 @@ func (node *nodeLayout) KeyOffset() uint64 { func (node *nodeLayout) Hash() []byte { return node.hash[:] } + +type LeafLayout = *leafLayout + +// Nodes is a continuously stored IAVL nodes +type Leaves struct { + leaves []leafLayout +} + +func NewLeaves(buf []byte) (Leaves, error) { + // check alignment and size of the buffer + p := unsafe.Pointer(unsafe.SliceData(buf)) + if uintptr(p)%unsafe.Alignof(leafLayout{}) != 0 { + return Leaves{}, errors.New("input buffer is not aligned") + } + size := int(unsafe.Sizeof(leafLayout{})) + if len(buf)%size != 0 { + return Leaves{}, errors.New("input buffer length is not correct") + } + leaves := unsafe.Slice((*leafLayout)(p), len(buf)/size) + return Leaves{leaves}, nil +} + +func (leaves Leaves) Leaf(i uint32) LeafLayout { + return &leaves.leaves[i] +} + +type leafLayout struct { + version uint32 + keyLen uint32 + keyOffset uint64 + hash [32]byte +} + +func (leaf *leafLayout) Version() uint32 { + return leaf.version +} + +func (leaf *leafLayout) KeyLength() uint32 { + return leaf.keyLen +} + +func (leaf *leafLayout) KeyOffset() uint64 { + return leaf.keyOffset +} + +func (leaf *leafLayout) Hash() []byte { + return leaf.hash[:] +} diff --git a/memiavl/mem_node.go b/memiavl/mem_node.go index 68e76baa58..6f8eb375bf 100644 --- a/memiavl/mem_node.go +++ b/memiavl/mem_node.go @@ -26,14 +26,14 @@ func newLeafNode(key, value []byte, version uint32) *MemNode { } } -func (node *MemNode) isLeaf() bool { - return node.height == 0 -} - func (node *MemNode) Height() uint8 { return node.height } +func (node *MemNode) IsLeaf() bool { + return node.height == 0 +} + func (node *MemNode) Size() int64 { return node.size } @@ -157,7 +157,7 @@ func (node *MemNode) reBalance(version, cowVersion uint32) *MemNode { } func (node *MemNode) Get(key []byte) ([]byte, uint32) { - if node.isLeaf() { + if node.IsLeaf() { switch bytes.Compare(node.key, key) { case -1: return nil, 1 @@ -177,7 +177,7 @@ func (node *MemNode) Get(key []byte) ([]byte, uint32) { } func (node *MemNode) GetByIndex(index uint32) ([]byte, []byte) { - if node.isLeaf() { + if node.IsLeaf() { if index == 0 { return node.key, node.value } diff --git a/memiavl/mmap.go b/memiavl/mmap.go index 4cf4464d3f..c4dbb2621c 100644 --- a/memiavl/mmap.go +++ b/memiavl/mmap.go @@ -38,7 +38,11 @@ func NewMmap(path string) (*MmapFile, error) { // Close closes the file and mmap handles func (m *MmapFile) Close() error { - return errors.Join(mmap.Munmap(m.data, m.handle), m.file.Close()) + var err error + if m.handle != nil { + err = mmap.Munmap(m.data, m.handle) + } + return errors.Join(err, m.file.Close()) } // Data returns the mmap-ed buffer @@ -51,5 +55,8 @@ func Mmap(f *os.File) ([]byte, *[mmap.MaxMapSize]byte, error) { if err != nil { return nil, nil, err } + if fi.Size() == 0 { + return nil, nil, nil + } return mmap.Mmap(f, int(fi.Size())) } diff --git a/memiavl/node.go b/memiavl/node.go index d4a7c428c1..c756ecdbeb 100644 --- a/memiavl/node.go +++ b/memiavl/node.go @@ -11,6 +11,7 @@ import ( // Node interface encapsulate the interface of both PersistedNode and MemNode. type Node interface { Height() uint8 + IsLeaf() bool Size() int64 Version() uint32 Key() []byte @@ -27,10 +28,6 @@ type Node interface { GetByIndex(uint32) ([]byte, []byte) } -func isLeaf(node Node) bool { - return node.Height() == 0 -} - // setRecursive do set operation. // it always do modification and return new `MemNode`, even if the value is the same. // also returns if it's an update or insertion, if update, the tree height and balance is not changed. @@ -40,7 +37,7 @@ func setRecursive(node Node, key, value []byte, version, cowVersion uint32) (*Me } nodeKey := node.Key() - if isLeaf(node) { + if node.IsLeaf() { switch bytes.Compare(key, nodeKey) { case -1: return &MemNode{ @@ -98,7 +95,7 @@ func removeRecursive(node Node, key []byte, version, cowVersion uint32) ([]byte, return nil, nil, nil } - if isLeaf(node) { + if node.IsLeaf() { if bytes.Equal(node.Key(), key) { return node.Value(), nil, nil } @@ -159,7 +156,7 @@ func writeHashBytes(node Node, w io.Writer) error { // Key is not written for inner nodes, unlike writeBytes. - if isLeaf(node) { + if node.IsLeaf() { if err := EncodeBytes(w, node.Key()); err != nil { return fmt.Errorf("writing key, %w", err) } diff --git a/memiavl/persisted_node.go b/memiavl/persisted_node.go index 9ed497d519..f6338c96e2 100644 --- a/memiavl/persisted_node.go +++ b/memiavl/persisted_node.go @@ -3,21 +3,27 @@ package memiavl import ( "bytes" "crypto/sha256" + "sort" ) const ( - OffsetHeight = 0 - OffsetVersion = OffsetHeight + 4 - OffsetSize = OffsetVersion + 4 - OffsetKeyNode = OffsetSize + 4 + OffsetHeight = 0 + OffsetPreTrees = OffsetHeight + 1 + OffsetVersion = OffsetHeight + 4 + OffsetSize = OffsetVersion + 4 + OffsetKeyLeaf = OffsetSize + 4 - // leaf node repurpose two uint32 to store the offset in kv file. - OffsetKeyOffset = OffsetSize - - OffsetHash = OffsetKeyNode + 4 + OffsetHash = OffsetKeyLeaf + 4 SizeHash = sha256.Size SizeNodeWithoutHash = OffsetHash SizeNode = SizeNodeWithoutHash + SizeHash + + OffsetLeafVersion = 0 + OffsetLeafKeyLen = OffsetLeafVersion + 4 + OffsetLeafKeyOffset = OffsetLeafKeyLen + 4 + OffsetLeafHash = OffsetLeafKeyOffset + 8 + SizeLeafWithoutHash = OffsetLeafHash + SizeLeaf = SizeLeafWithoutHash + SizeHash ) // PersistedNode is backed by serialized byte array, usually mmap-ed from disk file. @@ -25,161 +31,214 @@ const ( // // Branch node: // - height : 1 -// - _padding : 3 +// - preTrees : 1 +// - _padding : 2 // - version : 4 // - size : 4 // - key node : 4 // node index of the smallest leaf in right branch // - hash : 32 // Leaf node: -// - height : 1 -// - _padding : 3 // - version : 4 +// - key len : 4 // - key offset : 8 // - hash : 32 type PersistedNode struct { snapshot *Snapshot + isLeaf bool index uint32 } var _ Node = PersistedNode{} -func (node PersistedNode) data() NodeLayout { +func (node PersistedNode) branchNode() NodeLayout { return node.snapshot.nodesLayout.Node(node.index) } +func (node PersistedNode) leafNode() LeafLayout { + return node.snapshot.leavesLayout.Leaf(node.index) +} + func (node PersistedNode) Height() uint8 { - return node.data().Height() + if node.isLeaf { + return 0 + } + return node.branchNode().Height() +} + +func (node PersistedNode) IsLeaf() bool { + return node.isLeaf } func (node PersistedNode) Version() uint32 { - return node.data().Version() + if node.isLeaf { + return node.leafNode().Version() + } + return node.branchNode().Version() } func (node PersistedNode) Size() int64 { - data := node.data() - if data.Height() == 0 { + if node.isLeaf { return 1 } - return int64(data.Size()) + return int64(node.branchNode().Size()) } func (node PersistedNode) Key() []byte { - data := node.data() - if data.Height() != 0 { - data = node.snapshot.nodesLayout.Node(data.KeyNode()) + if node.isLeaf { + return node.snapshot.LeafKey(node.index) } - return node.snapshot.Key(data.KeyOffset()) + index := node.branchNode().KeyLeaf() + return node.snapshot.LeafKey(index) } // Value result is not defined for non-leaf node. func (node PersistedNode) Value() []byte { - _, value := node.snapshot.KeyValue(node.data().KeyOffset()) + if !node.isLeaf { + panic("can't call Value on branch node") + } + _, value := node.snapshot.LeafKeyValue(node.index) return value } // Left result is not defined for leaf nodes. func (node PersistedNode) Left() Node { - return PersistedNode{snapshot: node.snapshot, index: node.data().KeyNode() - 1} + if node.isLeaf { + panic("can't call Left on leaf node") + } + + data := node.branchNode() + preTrees := uint32(data.PreTrees()) + startLeaf := getStartLeaf(node.index, data.Size(), preTrees) + keyLeaf := data.KeyLeaf() + if startLeaf+1 == keyLeaf { + return PersistedNode{snapshot: node.snapshot, index: startLeaf, isLeaf: true} + } + return PersistedNode{snapshot: node.snapshot, index: getLeftBranch(keyLeaf, preTrees)} } // Right result is not defined for leaf nodes. func (node PersistedNode) Right() Node { + if node.isLeaf { + panic("can't call Right on leaf node") + } + + data := node.branchNode() + keyLeaf := data.KeyLeaf() + preTrees := uint32(data.PreTrees()) + if keyLeaf == getEndLeaf(node.index, preTrees) { + return PersistedNode{snapshot: node.snapshot, index: keyLeaf, isLeaf: true} + } return PersistedNode{snapshot: node.snapshot, index: node.index - 1} } func (node PersistedNode) Hash() []byte { - return node.data().Hash() + if node.isLeaf { + return node.leafNode().Hash() + } + return node.branchNode().Hash() } func (node PersistedNode) Mutate(version, _ uint32) *MemNode { - data := node.data() - mnode := &MemNode{ + if node.isLeaf { + key, value := node.snapshot.LeafKeyValue(node.index) + return &MemNode{ + height: 0, + size: 1, + version: version, + key: key, + value: value, + } + } + data := node.branchNode() + return &MemNode{ height: data.Height(), - size: node.Size(), + size: int64(data.Size()), version: version, key: node.Key(), + left: node.Left(), + right: node.Right(), } - if mnode.isLeaf() { - mnode.value = node.Value() - } else { - mnode.left = node.Left() - mnode.right = node.Right() - } - return mnode } func (node PersistedNode) Get(key []byte) ([]byte, uint32) { - return getPersistedNode(node.snapshot, node.index, key) -} + var start, count uint32 + if node.isLeaf { + start = node.index + count = 1 + } else { + data := node.branchNode() + preTrees := uint32(data.PreTrees()) + count = data.Size() + start = getStartLeaf(node.index, count, preTrees) + } -func (node PersistedNode) GetByIndex(leafIndex uint32) ([]byte, []byte) { - return getPersistedNodeByIndex(node.snapshot, node.index, leafIndex) -} - -// getPersistedNode specialize the get function for `PersistedNode`. -// returns the value and the leaf index -func getPersistedNode(snapshot *Snapshot, root uint32, key []byte) ([]byte, uint32) { - nodes := snapshot.nodesLayout - index := root - var leafIndex uint32 - - for { - node := nodes.Node(index) - if node.Height() == 0 { - nodeKey, value := snapshot.KeyValue(node.KeyOffset()) - switch bytes.Compare(nodeKey, key) { - case -1: - return nil, leafIndex + 1 - case 1: - return nil, leafIndex - default: - return value, leafIndex - } - } + // binary search in the leaf node array + i := uint32(sort.Search(int(count), func(i int) bool { + leafKey := node.snapshot.LeafKey(start + uint32(i)) + return bytes.Compare(leafKey, key) >= 0 + })) - keyNode := node.KeyNode() - nodeKey := snapshot.Key(nodes.Node(keyNode).KeyOffset()) - if bytes.Compare(key, nodeKey) == -1 { - // left child - index = keyNode - 1 - } else { - size := node.Size() - // calculate the leaf size using formula `N=2L-1`. - rightSize := (index - keyNode + 1) / 2 - leafIndex += (size - rightSize) - - // right child - index-- - } + leaf := i + start + if leaf >= start+count { + // return the next index if the key is greater than all keys in the node + return nil, i } -} -func getPersistedNodeByIndex(snapshot *Snapshot, root uint32, leafIndex uint32) ([]byte, []byte) { - nodes := snapshot.nodesLayout - index := root + nodeKey, value := node.snapshot.LeafKeyValue(leaf) + if !bytes.Equal(nodeKey, key) { + return nil, i + } + + return value, i +} - for { - node := nodes.Node(index) - if node.Height() == 0 { - if leafIndex == 0 { - return snapshot.KeyValue(node.KeyOffset()) - } +func (node PersistedNode) GetByIndex(leafIndex uint32) ([]byte, []byte) { + if node.isLeaf { + if leafIndex != 0 { return nil, nil } + return node.snapshot.LeafKeyValue(node.index) + } + data := node.branchNode() + preTrees := uint32(data.PreTrees()) + startLeaf := getStartLeaf(node.index, data.Size(), preTrees) + endLeaf := getEndLeaf(node.index, preTrees) + + i := startLeaf + leafIndex + if i > endLeaf { + return nil, nil + } + return node.snapshot.LeafKeyValue(i) +} - keyNode := node.KeyNode() - size := node.Size() - // calculate the leaf size using formula `N=2L-1`. - rightSize := (index - keyNode + 1) / 2 - leftSize := size - rightSize - if leafIndex < leftSize { - // left child - index = keyNode - 1 - continue - } +// getStartLeaf returns the index of the first leaf in the node. +// +// > start leaf = pre leaves +// > = pre branches + pre trees +// > = total branches - sub branches + pre trees +// > = (index + 1) - (size - 1) + preTrees +// > = index + 2 - size + preTrees +func getStartLeaf(index, size, preTrees uint32) uint32 { + return index + 2 - size + preTrees +} - // right child - index-- - leafIndex -= leftSize - } +// getEndLeaf returns the index of the last leaf in the node. +// +// > end leaf = start leaf + size - 1 +// > = (index + 2 - size + preTrees) + size - 1 +// > = index + 1 + preTrees +func getEndLeaf(index, preTrees uint32) uint32 { + return index + preTrees + 1 +} + +// getLeftBranch returns the index of the left branch of the node. +// +// > left branch = pre branches + left branches - 1 +// > = (total branches - sub branches) + (left leaves - 1) - 1 +// > = (total branches - sub branches) + (key leaf - start leaf - 1) - 1 +// > = (index+1 - (size-1)) + (key leaf - (index + 2 - size + preTrees) - 1) - 1 +// > = (index - size + 2) + key leaf - index - 2 + size - preTrees - 2 +// > = key leaf - preTrees - 2 +func getLeftBranch(keyLeaf, preTrees uint32) uint32 { + return keyLeaf - preTrees - 2 } diff --git a/memiavl/snapshot.go b/memiavl/snapshot.go index 01f2745996..c97d45aead 100644 --- a/memiavl/snapshot.go +++ b/memiavl/snapshot.go @@ -7,7 +7,6 @@ import ( "errors" "fmt" "io" - "math" "os" "path/filepath" @@ -22,13 +21,11 @@ const ( // the initial snapshot format SnapshotFormat = 0 - // magic: uint32, format: uint32, version: uint32, root node index: uint32 - SizeMetadata = 16 - - // EmptyRootNodeIndex is a special value of root node index to represent empty tree - EmptyRootNodeIndex = math.MaxUint32 + // magic: uint32, format: uint32, version: uint32 + SizeMetadata = 12 FileNameNodes = "nodes" + FileNameLeaves = "leaves" FileNameKVs = "kvs" FileNameKVIndex = "kvs.index" FileNameMetadata = "metadata" @@ -37,28 +34,32 @@ const ( // Snapshot manage the lifecycle of mmap-ed files for the snapshot, // it must out live the objects that derived from it. type Snapshot struct { - nodesMap *MmapFile - kvsMap *MmapFile + nodesMap *MmapFile + leavesMap *MmapFile + kvsMap *MmapFile - nodes []byte - kvs []byte + nodes []byte + leaves []byte + kvs []byte // hash index of kvs index *recsplit.Index indexReader *recsplit.IndexReader // reader for the index // parsed from metadata file - version uint32 - rootIndex uint32 + version uint32 // wrapping the raw nodes buffer - nodesLayout Nodes + nodesLayout Nodes + leavesLayout Leaves + + // nil means empty snapshot + root *PersistedNode } func NewEmptySnapshot(version uint32) *Snapshot { return &Snapshot{ - version: version, - rootIndex: EmptyRootNodeIndex, + version: version, } } @@ -83,19 +84,16 @@ func OpenSnapshot(snapshotDir string) (*Snapshot, error) { return nil, fmt.Errorf("unknown snapshot format: %d", format) } version := binary.LittleEndian.Uint32(bz[8:]) - rootIndex := binary.LittleEndian.Uint32(bz[12:]) - if rootIndex == EmptyRootNodeIndex { - // we can't mmap empty files, so have to return early - return NewEmptySnapshot(version), nil - } - - var nodesMap, kvsMap *MmapFile + var nodesMap, leavesMap, kvsMap *MmapFile cleanupHandles := func(err error) error { errs := []error{err} if nodesMap != nil { errs = append(errs, nodesMap.Close()) } + if leavesMap != nil { + errs = append(errs, leavesMap.Close()) + } if kvsMap != nil { errs = append(errs, kvsMap.Close()) } @@ -105,22 +103,34 @@ func OpenSnapshot(snapshotDir string) (*Snapshot, error) { if nodesMap, err = NewMmap(filepath.Join(snapshotDir, FileNameNodes)); err != nil { return nil, cleanupHandles(err) } + if leavesMap, err = NewMmap(filepath.Join(snapshotDir, FileNameLeaves)); err != nil { + return nil, cleanupHandles(err) + } if kvsMap, err = NewMmap(filepath.Join(snapshotDir, FileNameKVs)); err != nil { return nil, cleanupHandles(err) } nodes := nodesMap.Data() + leaves := leavesMap.Data() kvs := kvsMap.Data() - if len(nodes) == 0 && rootIndex != 0 { + // validate nodes length + if len(nodes)%SizeNode != 0 { + return nil, cleanupHandles( + fmt.Errorf("corrupted snapshot, nodes file size %d is not a multiple of %d", len(nodes), SizeNode), + ) + } + if len(leaves)%SizeLeaf != 0 { return nil, cleanupHandles( - fmt.Errorf("corrupted snapshot, nodes are empty but rootIndex is not zero: %d", rootIndex), + fmt.Errorf("corrupted snapshot, leaves file size %d is not a multiple of %d", len(leaves), SizeLeaf), ) } - if len(nodes) > 0 && uint64(len(nodes)) != (uint64(rootIndex)+1)*SizeNode { + nodesLen := len(nodes) / SizeNode + leavesLen := len(leaves) / SizeLeaf + if (leavesLen > 0 && nodesLen+1 != leavesLen) || (leavesLen == 0 && nodesLen != 0) { return nil, cleanupHandles( - fmt.Errorf("nodes file size %d don't match root node index %d", len(nodes), rootIndex), + fmt.Errorf("corrupted snapshot, branch nodes size %d don't match leaves size %d", nodesLen, leavesLen), ) } @@ -145,22 +155,45 @@ func OpenSnapshot(snapshotDir string) (*Snapshot, error) { return nil, err } - return &Snapshot{ - nodesMap: nodesMap, - kvsMap: kvsMap, + leavesData, err := NewLeaves(leaves) + if err != nil { + return nil, err + } + + snapshot := &Snapshot{ + nodesMap: nodesMap, + leavesMap: leavesMap, + kvsMap: kvsMap, // cache the pointers - nodes: nodes, - kvs: kvs, + nodes: nodes, + leaves: leaves, + kvs: kvs, index: index, indexReader: indexReader, - version: version, - rootIndex: rootIndex, + version: version, - nodesLayout: nodesData, - }, nil + nodesLayout: nodesData, + leavesLayout: leavesData, + } + + if nodesLen > 0 { + snapshot.root = &PersistedNode{ + snapshot: snapshot, + isLeaf: false, + index: uint32(nodesLen - 1), + } + } else if leavesLen > 0 { + snapshot.root = &PersistedNode{ + snapshot: snapshot, + isLeaf: true, + index: 0, + } + } + + return snapshot, nil } // Close closes the file and mmap handles, clears the buffers. @@ -170,6 +203,9 @@ func (snapshot *Snapshot) Close() error { if snapshot.nodesMap != nil { errs = append(errs, snapshot.nodesMap.Close()) } + if snapshot.leavesMap != nil { + errs = append(errs, snapshot.leavesMap.Close()) + } if snapshot.kvsMap != nil { errs = append(errs, snapshot.kvsMap.Close()) } @@ -185,14 +221,24 @@ func (snapshot *Snapshot) Close() error { // IsEmpty returns if the snapshot is an empty tree. func (snapshot *Snapshot) IsEmpty() bool { - return snapshot.rootIndex == EmptyRootNodeIndex + return snapshot.root == nil } -// Node returns the node by index +// Node returns the branch node by index func (snapshot *Snapshot) Node(index uint32) PersistedNode { return PersistedNode{ snapshot: snapshot, index: index, + isLeaf: false, + } +} + +// Leaf returns the leaf node by index +func (snapshot *Snapshot) Leaf(index uint32) PersistedNode { + return PersistedNode{ + snapshot: snapshot, + index: index, + isLeaf: true, } } @@ -206,7 +252,7 @@ func (snapshot *Snapshot) RootNode() PersistedNode { if snapshot.IsEmpty() { panic("RootNode not supported on an empty snapshot") } - return snapshot.Node(snapshot.rootIndex) + return *snapshot.root } func (snapshot *Snapshot) RootHash() []byte { @@ -221,8 +267,18 @@ func (snapshot *Snapshot) nodesLen() int { return len(snapshot.nodes) / SizeNode } -// ScanNodes iterate over the nodes in the snapshot order (depth-first post-order) +// leavesLen returns the number of nodes in the snapshot +func (snapshot *Snapshot) leavesLen() int { + return len(snapshot.leaves) / SizeLeaf +} + +// ScanNodes iterate over the nodes in the snapshot order (depth-first post-order, leaf nodes before branch nodes) func (snapshot *Snapshot) ScanNodes(callback func(node PersistedNode) error) error { + for i := 0; i < snapshot.leavesLen(); i++ { + if err := callback(snapshot.Leaf(uint32(i))); err != nil { + return err + } + } for i := 0; i < snapshot.nodesLen(); i++ { if err := callback(snapshot.Node(uint32(i))); err != nil { return err @@ -260,18 +316,38 @@ func (snapshot *Snapshot) KeyValue(offset uint64) ([]byte, []byte) { return key, value } +func (snapshot *Snapshot) LeafKey(index uint32) []byte { + leaf := snapshot.leavesLayout.Leaf(index) + offset := leaf.KeyOffset() + 4 + return snapshot.kvs[offset : offset+uint64(leaf.KeyLength())] +} + +func (snapshot *Snapshot) LeafKeyValue(index uint32) ([]byte, []byte) { + leaf := snapshot.leavesLayout.Leaf(index) + offset := leaf.KeyOffset() + 4 + length := uint64(leaf.KeyLength()) + key := snapshot.kvs[offset : offset+length] + offset += length + length = uint64(binary.LittleEndian.Uint32(snapshot.kvs[offset:])) + offset += 4 + return key, snapshot.kvs[offset : offset+length] +} + // Export exports the nodes in DFS post-order, resemble the API of existing iavl library func (snapshot *Snapshot) Export() *Exporter { - return &Exporter{snapshot: snapshot, count: snapshot.nodesLen()} + return newExporter(snapshot) } // WriteSnapshot save the IAVL tree to a new snapshot directory. func (t *Tree) WriteSnapshot(snapshotDir string, writeHashIndex bool) error { return writeSnapshot(snapshotDir, t.version, writeHashIndex, func(w *snapshotWriter) (uint32, error) { if t.root == nil { - return EmptyRootNodeIndex, nil + return 0, nil } else { - return w.writeRecursive(t.root) + if err := w.writeRecursive(t.root); err != nil { + return 0, err + } + return w.leafCounter, nil } }) } @@ -285,6 +361,7 @@ func writeSnapshot( } nodesFile := filepath.Join(dir, FileNameNodes) + leavesFile := filepath.Join(dir, FileNameLeaves) kvsFile := filepath.Join(dir, FileNameKVs) kvsIndexFile := filepath.Join(dir, FileNameKVIndex) @@ -298,6 +375,16 @@ func writeSnapshot( } }() + fpLeaves, err := createFile(leavesFile) + if err != nil { + return err + } + defer func() { + if err := fpLeaves.Close(); returnErr == nil { + returnErr = err + } + }() + fpKVs, err := createFile(kvsFile) if err != nil { return err @@ -309,18 +396,22 @@ func writeSnapshot( }() nodesWriter := bufio.NewWriter(fpNodes) + leavesWriter := bufio.NewWriter(fpLeaves) kvsWriter := bufio.NewWriter(fpKVs) - w := newSnapshotWriter(nodesWriter, kvsWriter) - rootIndex, err := doWrite(w) + w := newSnapshotWriter(nodesWriter, leavesWriter, kvsWriter) + leaves, err := doWrite(w) if err != nil { return err } - if rootIndex != EmptyRootNodeIndex { + if leaves > 0 { if err := nodesWriter.Flush(); err != nil { return err } + if err := leavesWriter.Flush(); err != nil { + return err + } if err := kvsWriter.Flush(); err != nil { return err } @@ -328,6 +419,9 @@ func writeSnapshot( if err := fpKVs.Sync(); err != nil { return err } + if err := fpLeaves.Sync(); err != nil { + return err + } if err := fpNodes.Sync(); err != nil { return err } @@ -343,8 +437,6 @@ func writeSnapshot( returnErr = err } }() - // N = 2L-1 - leaves := (rootIndex + 2) / 2 if err := buildIndex(input, kvsIndexFile, dir, int(leaves)); err != nil { return fmt.Errorf("build MPHF index failed: %w", err) } @@ -356,7 +448,6 @@ func writeSnapshot( binary.LittleEndian.PutUint32(metadataBuf[:], SnapshotFileMagic) binary.LittleEndian.PutUint32(metadataBuf[4:], SnapshotFormat) binary.LittleEndian.PutUint32(metadataBuf[8:], version) - binary.LittleEndian.PutUint32(metadataBuf[12:], rootIndex) metadataFile := filepath.Join(dir, FileNameMetadata) fpMetadata, err := createFile(metadataFile) @@ -377,19 +468,20 @@ func writeSnapshot( } type snapshotWriter struct { - nodesWriter, kvWriter io.Writer + nodesWriter, leavesWriter, kvWriter io.Writer - // record the current node index - nodeIndex uint32 + // count how many nodes have been written + branchCounter, leafCounter uint32 // record the current writing offset in kvs file kvsOffset uint64 } -func newSnapshotWriter(nodesWriter, kvsWriter io.Writer) *snapshotWriter { +func newSnapshotWriter(nodesWriter, leavesWriter, kvsWriter io.Writer) *snapshotWriter { return &snapshotWriter{ - nodesWriter: nodesWriter, - kvWriter: kvsWriter, + nodesWriter: nodesWriter, + leavesWriter: leavesWriter, + kvWriter: kvsWriter, } } @@ -417,56 +509,66 @@ func (w *snapshotWriter) writeKeyValue(key, value []byte) error { return nil } -func (w *snapshotWriter) writeNode(node, hash []byte) (uint32, error) { - if _, err := w.nodesWriter.Write(node); err != nil { - return 0, err - } - if _, err := w.nodesWriter.Write(hash); err != nil { - return 0, err - } +func (w *snapshotWriter) writeLeaf(version uint32, key, value, hash []byte) error { + var buf [SizeLeafWithoutHash]byte + binary.LittleEndian.PutUint32(buf[OffsetLeafVersion:], version) + binary.LittleEndian.PutUint32(buf[OffsetLeafKeyLen:], uint32(len(key))) + binary.LittleEndian.PutUint64(buf[OffsetLeafKeyOffset:], w.kvsOffset) - i := w.nodeIndex - w.nodeIndex++ - return i, nil -} - -func (w *snapshotWriter) writeLeaf(version uint32, key, value, hash []byte) (uint32, error) { - var buf [SizeNodeWithoutHash]byte - binary.LittleEndian.PutUint32(buf[OffsetVersion:], version) - keyOffset := w.kvsOffset if err := w.writeKeyValue(key, value); err != nil { - return 0, err + return err + } + + if _, err := w.leavesWriter.Write(buf[:]); err != nil { + return err + } + if _, err := w.leavesWriter.Write(hash); err != nil { + return err } - binary.LittleEndian.PutUint64(buf[OffsetKeyOffset:], keyOffset) - return w.writeNode(buf[:], hash) + w.leafCounter++ + return nil } -func (w *snapshotWriter) writeBranch(version, size uint32, height uint8, keyNode uint32, hash []byte) (uint32, error) { +func (w *snapshotWriter) writeBranch(version, size uint32, height, preTrees uint8, keyLeaf uint32, hash []byte) error { var buf [SizeNodeWithoutHash]byte buf[OffsetHeight] = height + buf[OffsetPreTrees] = preTrees binary.LittleEndian.PutUint32(buf[OffsetVersion:], version) binary.LittleEndian.PutUint32(buf[OffsetSize:], size) - binary.LittleEndian.PutUint32(buf[OffsetKeyNode:], keyNode) + binary.LittleEndian.PutUint32(buf[OffsetKeyLeaf:], keyLeaf) - return w.writeNode(buf[:], hash) + if _, err := w.nodesWriter.Write(buf[:]); err != nil { + return err + } + if _, err := w.nodesWriter.Write(hash); err != nil { + return err + } + + w.branchCounter++ + return nil } // writeRecursive write the node recursively in depth-first post-order, // returns `(nodeIndex, err)`. -func (w *snapshotWriter) writeRecursive(node Node) (uint32, error) { - if isLeaf(node) { +func (w *snapshotWriter) writeRecursive(node Node) error { + if node.IsLeaf() { return w.writeLeaf(node.Version(), node.Key(), node.Value(), node.Hash()) } - leftIndex, err := w.writeRecursive(node.Left()) - if err != nil { - return 0, err + // record the number of pending subtrees before the current one, + // it's always positive and won't exceed the tree height, so we can use an uint8 to store it. + preTrees := uint8(w.leafCounter - w.branchCounter) + + if err := w.writeRecursive(node.Left()); err != nil { + return err } - if _, err := w.writeRecursive(node.Right()); err != nil { - return 0, err + keyLeaf := w.leafCounter + if err := w.writeRecursive(node.Right()); err != nil { + return err } - return w.writeBranch(node.Version(), uint32(node.Size()), node.Height(), leftIndex+1, node.Hash()) + + return w.writeBranch(node.Version(), uint32(node.Size()), node.Height(), preTrees, keyLeaf, node.Hash()) } // buildIndex build MPHF index for the kvs file.