Skip to content

Commit

Permalink
Problem: memiavl doesn't recover corrupted wal tail (#1073)
Browse files Browse the repository at this point in the history
* Problem: memiavl don't recover corrupted wal tail

Solution:
- fix in wal and update dependencies

* Update CHANGELOG.md

Signed-off-by: yihuang <huang@crypto.com>

* fix truncate wal index

* don't patch upstream

* Update CHANGELOG.md

Signed-off-by: yihuang <huang@crypto.com>

* no dep change

* fix lint

* fix lint

* fix lint

---------

Signed-off-by: yihuang <huang@crypto.com>
Co-authored-by: mmsqe <mavis@crypto.com>
  • Loading branch information
yihuang and mmsqe authored Jun 21, 2023
1 parent 06a0b5d commit f83c74b
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
- [#1058](https://github.com/crypto-org-chain/cronos/pull/1058) Fix decode log for multi topics in websocket subscribe ([ethermint commit](https://github.com/crypto-org-chain/ethermint/commit/2136ad029860c819942ad1836dd3f42585002233)).
- [#1062](https://github.com/crypto-org-chain/cronos/pull/1062) Update cometbft `v0.34.29` with several minor bug fixes and low-severity security-fixes.
- [#1075](https://github.com/crypto-org-chain/cronos/pull/1075) Add missing close in memiavl to avoid resource leaks.
- [#1073](https://github.com/crypto-org-chain/cronos/pull/1073) memiavl automatically truncates corrupted wal tail.

### Features

Expand Down
4 changes: 2 additions & 2 deletions memiavl/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func Load(dir string, opts Options) (*DB, error) {
}
}

wal, err := wal.Open(walPath(dir), &wal.Options{NoCopy: true, NoSync: true})
wal, err := OpenWAL(walPath(dir), &wal.Options{NoCopy: true, NoSync: true})
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -316,7 +316,7 @@ func (db *DB) pruneSnapshots() {
db.logger.Error("failed to find first snapshot", "err", err)
}

if err := db.wal.TruncateFront(uint64(earliestVersion + 1)); err != nil {
if err := db.wal.TruncateFront(walIndex(earliestVersion+1, db.initialVersion)); err != nil {
db.logger.Error("failed to truncate wal", "err", err, "version", earliestVersion+1)
}
}()
Expand Down
100 changes: 100 additions & 0 deletions memiavl/wal.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package memiavl

import (
"bytes"
"encoding/binary"
"fmt"
"os"
"path/filepath"
"unsafe"

"github.com/tidwall/gjson"
"github.com/tidwall/wal"
)

// OpenWAL opens the write-ahead log stored in dir. When wal.Open reports
// wal.ErrCorrupt, it truncates the unparsable tail of the last segment file
// and then opens the log a second time.
//
// opts may be nil, exactly as for wal.Open; the recovery scan then assumes
// the binary log format (the wal package's default format).
//
// Errors other than wal.ErrCorrupt from the first open are returned as-is.
// If no segment file can be found to repair, wal.ErrCorrupt is returned.
//
// TODO fix in upstream: https://github.com/tidwall/wal/pull/22
func OpenWAL(dir string, opts *wal.Options) (*wal.Log, error) {
	log, err := wal.Open(dir, opts)
	if err != wal.ErrCorrupt {
		return log, err
	}

	// The log is corrupted: locate the last segment and drop its bad tail.
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, fmt.Errorf("read wal dir fail: %w", err)
	}

	// os.ReadDir returns entries sorted by filename, so the last matching
	// name is the segment with the highest starting index.
	var lastSeg string
	for _, entry := range entries {
		if entry.IsDir() || !isWALSegmentName(entry.Name()) {
			continue
		}
		lastSeg = entry.Name()
	}
	if lastSeg == "" {
		// Nothing to repair; report the original corruption instead of
		// returning a nil log together with a nil error.
		return nil, wal.ErrCorrupt
	}

	// A nil opts is valid for wal.Open, so it must not be dereferenced here.
	format := wal.Binary
	if opts != nil {
		format = opts.LogFormat
	}
	if err := truncateCorruptedTail(filepath.Join(dir, lastSeg), format); err != nil {
		return nil, fmt.Errorf("truncate corrupted tail fail: %w", err)
	}

	// try again
	return wal.Open(dir, opts)
}

// isWALSegmentName reports whether name consists of exactly 20 decimal
// digits, which keeps unrelated files in the directory from being truncated.
func isWALSegmentName(name string) bool {
	if len(name) != 20 {
		return false
	}
	for i := 0; i < len(name); i++ {
		if name[i] < '0' || name[i] > '9' {
			return false
		}
	}
	return true
}

// truncateCorruptedTail reads the segment file at path, walks its entries
// from the start using the decoder selected by format (JSON when format is
// wal.JSON, binary otherwise), and truncates the file right after the last
// entry that decodes cleanly.
//
// The file is left untouched when every byte belongs to a well-formed entry.
// Read, decode (other than wal.ErrCorrupt) and truncate errors are returned
// unchanged.
func truncateCorruptedTail(path string, format wal.LogFormat) error {
	data, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	// valid is the byte length of the leading run of well-formed entries;
	// rest is the still-unscanned remainder. data itself is kept intact so
	// the final comparison is made against the full file size.
	var valid int
	rest := data
	for len(rest) > 0 {
		var n int
		if format == wal.JSON {
			n, err = loadNextJSONEntry(rest)
		} else {
			n, err = loadNextBinaryEntry(rest)
		}
		if err == wal.ErrCorrupt {
			// first bad entry found: everything from here on is dropped
			break
		}
		if err != nil {
			return err
		}
		rest = rest[n:]
		valid += n
	}

	if valid == len(data) {
		return nil
	}
	return os.Truncate(path, int64(valid))
}

// loadNextJSONEntry inspects the first newline-terminated line of data and
// returns the number of bytes it occupies, including the newline.
//
// It returns wal.ErrCorrupt when data contains no newline or when the line
// has no "data" field of JSON string type.
func loadNextJSONEntry(data []byte) (n int, err error) {
	// expected line layout: {"index":number,"data":string}
	end := bytes.IndexByte(data, '\n')
	if end < 0 {
		return 0, wal.ErrCorrupt
	}

	raw := data[:end]
	// Reinterpret the byte slice as a string without copying it.
	text := *(*string)(unsafe.Pointer(&raw))
	if field := gjson.Get(text, "data"); field.Type != gjson.String {
		return 0, wal.ErrCorrupt
	}
	return end + 1, nil
}

// loadNextBinaryEntry decodes the uvarint length prefix at the start of data
// and returns the total size of the entry: prefix bytes plus payload bytes.
//
// It returns wal.ErrCorrupt when the prefix cannot be decoded or when data
// holds fewer payload bytes than the prefix announces.
func loadNextBinaryEntry(data []byte) (n int, err error) {
	// entry layout: uvarint payload length, then the payload itself
	payloadLen, headerLen := binary.Uvarint(data)
	switch {
	case headerLen <= 0:
		// the length prefix could not be decoded
		return 0, wal.ErrCorrupt
	case payloadLen > uint64(len(data)-headerLen):
		// the announced payload runs past the end of the buffer
		return 0, wal.ErrCorrupt
	}
	return headerLen + int(payloadLen), nil
}
45 changes: 45 additions & 0 deletions memiavl/wal_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package memiavl

import (
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/require"
"github.com/tidwall/wal"
)

// TestCorruptedTail writes a single JSON-format segment file with a broken
// tail, verifies that plain wal.Open rejects it with wal.ErrCorrupt, and then
// verifies that OpenWAL repairs it and reports the expected last index.
func TestCorruptedTail(t *testing.T) {
	opts := &wal.Options{
		LogFormat: wal.JSON,
	}

	testCases := []struct {
		name      string
		logs      []byte
		lastIndex uint64
	}{
		{"failure-1", []byte("\n"), 0},
		{"failure-2", []byte(`{}` + "\n"), 0},
		{"failure-3", []byte(`{"index":"1"}` + "\n"), 0},
		{"failure-4", []byte(`{"index":"1","data":"?"}`), 0},
		{"failure-5", []byte(`{"index":1,"data":"?"}` + "\n" + `{"index":"1","data":"?"}`), 1},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// A fresh directory per case keeps the cases independent of
			// one another.
			dir := t.TempDir()
			segment := filepath.Join(dir, "00000000000000000001")
			require.NoError(t, os.WriteFile(segment, tc.logs, 0o600))

			_, err := wal.Open(dir, opts)
			require.Equal(t, wal.ErrCorrupt, err)

			log, err := OpenWAL(dir, opts)
			require.NoError(t, err)
			// Close the log so its file handle is released before the
			// temporary directory is removed.
			t.Cleanup(func() {
				require.NoError(t, log.Close())
			})

			lastIndex, err := log.LastIndex()
			require.NoError(t, err)
			require.Equal(t, tc.lastIndex, lastIndex)
		})
	}
}

0 comments on commit f83c74b

Please sign in to comment.