Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raw_restore: fix the issue that raw restore rewrites keys with the t prefix (#35641) #35692

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 171 additions & 0 deletions br/pkg/task/restore_raw.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

package task

import (
"context"

"github.com/pingcap/errors"
"github.com/pingcap/log"
"github.com/pingcap/tidb/br/pkg/conn"
berrors "github.com/pingcap/tidb/br/pkg/errors"
"github.com/pingcap/tidb/br/pkg/glue"
"github.com/pingcap/tidb/br/pkg/httputil"
"github.com/pingcap/tidb/br/pkg/metautil"
"github.com/pingcap/tidb/br/pkg/restore"
"github.com/pingcap/tidb/br/pkg/summary"
"github.com/spf13/cobra"
"github.com/spf13/pflag"
)

// RestoreRawConfig is the configuration specific for raw kv restore tasks.
//
// It declares no fields of its own: everything is promoted from the two
// embedded configs below. Code in this file reads, among others, Online,
// Concurrency, StartKey, EndKey and CF through a *RestoreRawConfig; which
// embedded struct declares each of them is not visible here.
type RestoreRawConfig struct {
// RawKvConfig is parsed last by ParseFromFlags; presumably it carries the raw
// kv specific options (key range, column family) — confirm at its definition.
RawKvConfig
// RestoreCommonConfig is parsed by ParseFromFlags and adjusted by adjust;
// presumably the options shared by all restore commands — confirm at its
// definition.
RestoreCommonConfig
}

// DefineRawRestoreFlags registers the flags of the raw kv restore command.
//
// The key format, TiKV column family and start/end key flags are added to the
// command's own flag set; the common restore flags are added to its
// persistent flag set.
func DefineRawRestoreFlags(command *cobra.Command) {
	// None of these flags has a shorthand, so the plain String variant is
	// equivalent to StringP with an empty shorthand.
	rawFlags := command.Flags()
	rawFlags.String(flagKeyFormat, "hex", "start/end key format, support raw|escaped|hex")
	rawFlags.String(flagTiKVColumnFamily, "default", "restore specify cf, correspond to tikv cf")
	rawFlags.String(flagStartKey, "", "restore raw kv start key, key is inclusive")
	rawFlags.String(flagEndKey, "", "restore raw kv end key, key is exclusive")

	DefineRestoreCommonFlags(command.PersistentFlags())
}

// ParseFromFlags parses the raw-restore-related flags from the flag set.
//
// It fills cfg.Online from the online flag, then lets the embedded
// RestoreCommonConfig and RawKvConfig parse their own flags, in that order.
// The first error encountered is returned.
func (cfg *RestoreRawConfig) ParseFromFlags(flags *pflag.FlagSet) error {
	var err error
	if cfg.Online, err = flags.GetBool(flagOnline); err != nil {
		return errors.Trace(err)
	}
	if err = cfg.RestoreCommonConfig.ParseFromFlags(flags); err != nil {
		return errors.Trace(err)
	}
	return cfg.RawKvConfig.ParseFromFlags(flags)
}

// adjust runs the adjust step of the embedded Config and RestoreCommonConfig,
// then falls back to defaultRestoreConcurrency when no concurrency is set.
func (cfg *RestoreRawConfig) adjust() {
	cfg.Config.adjust()
	cfg.RestoreCommonConfig.adjust()

	if cfg.Concurrency != 0 {
		// An explicit concurrency is kept as is.
		return
	}
	cfg.Concurrency = defaultRestoreConcurrency
}

// RunRestoreRaw starts a raw kv restore task inside the current goroutine.
//
// It adjusts cfg, reads the backup meta, selects the backed-up files that
// belong to the range cfg.StartKey (inclusive) .. cfg.EndKey (exclusive) of
// column family cfg.CF, calls restore.SplitRanges on the merged file ranges
// and then restores the selected files.
//
// It returns nil without restoring anything when no file is selected, and an
// annotated berrors.ErrRestoreModeMismatch when the backup was not taken in
// raw kv mode. Every other error is returned traced. The task summary for
// cmdName is emitted when the function returns; the success status is only
// set after the restore finished without error.
func RunRestoreRaw(c context.Context, g glue.Glue, cmdName string, cfg *RestoreRawConfig) (err error) {
	cfg.adjust()

	defer summary.Summary(cmdName)
	// Everything below works on ctx, so that returning from this function
	// cancels whatever the task started.
	ctx, cancel := context.WithCancel(c)
	defer cancel()

	// Restore raw does not need domain.
	needDomain := false
	mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, GetKeepalive(&cfg.Config), cfg.CheckRequirements, needDomain)
	if err != nil {
		return errors.Trace(err)
	}
	defer mgr.Close()

	mergeRegionSize := cfg.MergeSmallRegionSizeBytes
	mergeRegionCount := cfg.MergeSmallRegionKeyCount
	// Only when both values are still the defaults do we ask the cluster for
	// the real ones; explicitly configured values are used untouched.
	if mergeRegionSize == conn.DefaultMergeRegionSizeBytes &&
		mergeRegionCount == conn.DefaultMergeRegionKeyCount {
		// according to https://github.com/pingcap/tidb/issues/34167.
		// we should get the real config from tikv to adapt the dynamic region.
		httpCli := httputil.NewClient(mgr.GetTLSConfig())
		mergeRegionSize, mergeRegionCount, err = mgr.GetMergeRegionSizeAndCount(ctx, httpCli)
		if err != nil {
			return errors.Trace(err)
		}
	}

	keepaliveCfg := GetKeepalive(&cfg.Config)
	// sometimes we have pooled the connections.
	// sending heartbeats in idle times is useful.
	keepaliveCfg.PermitWithoutStream = true
	// NOTE(review): the trailing `true` presumably selects raw kv mode — confirm
	// against restore.NewRestoreClient.
	client := restore.NewRestoreClient(mgr.GetPDClient(), mgr.GetTLSConfig(), keepaliveCfg, true)
	client.SetRateLimit(cfg.RateLimit)
	client.SetCrypter(&cfg.CipherInfo)
	client.SetConcurrency(uint(cfg.Concurrency))
	if cfg.Online {
		client.EnableOnline()
	}
	client.SetSwitchModeInterval(cfg.SwitchModeInterval)
	err = client.Init(g, mgr.GetStorage())
	// The Close is registered before the error check, so the client is closed
	// even when Init failed.
	defer client.Close()
	if err != nil {
		return errors.Trace(err)
	}

	u, s, backupMeta, err := ReadBackupMeta(ctx, metautil.MetaFile, &cfg.Config)
	if err != nil {
		return errors.Trace(err)
	}
	reader := metautil.NewMetaReader(backupMeta, s, &cfg.CipherInfo)
	// Use the cancellable ctx like every other call in this function, not the
	// parent context c.
	if err = client.InitBackupMeta(ctx, backupMeta, u, reader); err != nil {
		return errors.Trace(err)
	}

	if !client.IsRawKvMode() {
		return errors.Annotate(berrors.ErrRestoreModeMismatch, "cannot do raw restore from transactional data")
	}

	files, err := client.GetFilesInRawRange(cfg.StartKey, cfg.EndKey, cfg.CF)
	if err != nil {
		return errors.Trace(err)
	}
	// The data size is recorded before the empty check, so it is reported even
	// when there is nothing to restore.
	archiveSize := reader.ArchiveSize(ctx, files)
	g.Record(summary.RestoreDataSize, archiveSize)

	if len(files) == 0 {
		log.Info("all files are filtered out from the backup archive, nothing to restore")
		return nil
	}
	summary.CollectInt("restore files", len(files))

	ranges, _, err := restore.MergeFileRanges(
		files, mergeRegionSize, mergeRegionCount)
	if err != nil {
		return errors.Trace(err)
	}

	// Redirect to log if there is no log file to avoid unreadable output.
	// TODO: How to show progress?
	updateCh := g.StartProgress(
		ctx,
		"Raw Restore",
		// Split/Scatter + Download/Ingest
		int64(len(ranges)+len(files)),
		!cfg.LogProgress)

	// RawKV restore does not need to rewrite keys.
	err = restore.SplitRanges(ctx, client, ranges, nil, updateCh, true)
	if err != nil {
		return errors.Trace(err)
	}

	restoreSchedulers, err := restorePreWork(ctx, client, mgr, true)
	if err != nil {
		return errors.Trace(err)
	}
	// restorePostWork receives what restorePreWork returned and runs on every
	// exit path from here on.
	defer restorePostWork(ctx, client, restoreSchedulers)

	err = client.RestoreRaw(ctx, cfg.StartKey, cfg.EndKey, files, updateCh)
	if err != nil {
		return errors.Trace(err)
	}

	// Restore has finished.
	updateCh.Close()

	// Set task summary to success status.
	summary.SetSuccessStatus(true)
	return nil
}
189 changes: 189 additions & 0 deletions br/tests/br_rawkv/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#!/bin/bash
#
# Copyright 2019 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script relies on bash features (`source`, BASH_SOURCE), hence the bash
# shebang.
set -eux

# restart service without tiflash
source "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/../_utils/run_services"
start_services --no-tiflash

# Backup destinations: BACKUP_DIR is used by run_test, BACKUP_FULL by
# test_full_rawkv.
BACKUP_DIR="$TEST_DIR/raw_backup"
BACKUP_FULL="$TEST_DIR/rawkv-full"

# checksum START_KEY END_KEY
# Runs bin/rawkv in checksum mode over the given key range and prints the third
# field of every output line containing "result".
checksum() {
    bin/rawkv --pd "$PD_ADDR" \
        --ca "$TEST_DIR/certs/ca.pem" \
        --cert "$TEST_DIR/certs/br.pem" \
        --key "$TEST_DIR/certs/br.key" \
        --mode checksum --start-key "$1" --end-key "$2" | awk '/result/ {print $3}'
}

# fail_and_exit
# Reports the failure of the current test ($TEST_NAME) and aborts the script
# with exit status 1.
fail_and_exit() {
    printf '%s\n' "TEST: [$TEST_NAME] failed!"
    exit 1
}

# clean START_KEY END_KEY
# Runs bin/rawkv in delete mode over the given key range.
clean() {
    bin/rawkv --pd "$PD_ADDR" \
        --ca "$TEST_DIR/certs/ca.pem" \
        --cert "$TEST_DIR/certs/br.pem" \
        --key "$TEST_DIR/certs/br.key" \
        --mode delete --start-key "$1" --end-key "$2"
}

# test_full_rawkv
# Backs up everything starting from key 00 (no end key is given), deletes the
# range [00, ff), restores the backup and verifies that the checksum of the
# range is the same as before the backup.
test_full_rawkv() {
    check_range_start=00
    check_range_end=ff

    rm -rf "$BACKUP_FULL"

    checksum_full=$(checksum "$check_range_start" "$check_range_end")
    # backup current state of key-values
    # raw backup is not working with range [nil, nil]. TODO: fix it.
    run_br --pd "$PD_ADDR" backup raw -s "local://$BACKUP_FULL" --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" --start "$check_range_start" --format hex

    clean "$check_range_start" "$check_range_end"
    # Ensure the data is deleted
    checksum_new=$(checksum "$check_range_start" "$check_range_end")
    if [ "$checksum_new" = "$checksum_full" ];then
        echo "failed to delete data in range"
        fail_and_exit
    fi

    run_br --pd "$PD_ADDR" restore raw -s "local://$BACKUP_FULL" --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" --start "$check_range_start" --format hex
    checksum_new=$(checksum "$check_range_start" "$check_range_end")
    if [ "$checksum_new" != "$checksum_full" ];then
        echo "failed to restore"
        fail_and_exit
    fi
}

# Reference checksum of the test range, taken before any test data is written.
# run_test compares against it to make sure a delete really emptied the range.
checksum_empty="$(checksum 31 3130303030303030)"

# run_test FAILPOINTS
# Runs the raw kv backup/restore scenario once. When FAILPOINTS is non-empty it
# is exported as GO_FAILPOINTS for the duration of the run; GO_FAILPOINTS is
# reset to the empty string at the end.
#
# Steps: generate data, back it up, then check three restores (whole test
# range, a partial range, the range of the keys starting with "t"), with
# test_full_rawkv executed after the first one. The data is deleted before
# every restore so that each checksum comparison really depends on what the
# restore wrote.
run_test() {
    if [ -z "$1" ];then
        echo "run test"
    else
        export GO_FAILPOINTS="$1"
        echo "run test with failpoints: $GO_FAILPOINTS"
    fi

    rm -rf "$BACKUP_DIR"
    clean 31 3130303030303030

    # generate raw kv randomly in range[start-key, end-key) in 10s
    bin/rawkv --pd "$PD_ADDR" \
        --ca "$TEST_DIR/certs/ca.pem" \
        --cert "$TEST_DIR/certs/br.pem" \
        --key "$TEST_DIR/certs/br.key" \
        --mode rand-gen --start-key 31 --end-key 3130303030303030 --duration 10

    # put some keys around 311122 to check the correctness of endKey of restoring
    bin/rawkv --pd "$PD_ADDR" \
        --ca "$TEST_DIR/certs/ca.pem" \
        --cert "$TEST_DIR/certs/br.pem" \
        --key "$TEST_DIR/certs/br.key" \
        --mode put --put-data "311121:31, 31112100:32, 311122:33, 31112200:34, 3111220000:35, 311123:36"


    # put some keys starting with t. https://github.com/pingcap/tidb/issues/35279
    # t_128_r_12 --<hex encode>--> 745f3132385f725f3132
    # t_128_r_13 --<hex encode>--> 745f3132385f725f3133
    bin/rawkv --pd "$PD_ADDR" \
        --ca "$TEST_DIR/certs/ca.pem" \
        --cert "$TEST_DIR/certs/br.pem" \
        --key "$TEST_DIR/certs/br.key" \
        --mode put --put-data "745f3132385f725f3132:31, 745f3132385f725f3133:32"

    checksum_ori=$(checksum 31 3130303030303030)
    checksum_partial=$(checksum 311111 311122)
    checksum_t_prefix=$(checksum 745f3132385f725f3131 745f3132385f725f3134)

    # backup rawkv
    echo "backup start..."
    run_br --pd "$PD_ADDR" backup raw -s "local://$BACKUP_DIR" --start 31 --end 745f3132385f725f3134 --format hex --concurrency 4 --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef"

    # delete data in range[start-key, end-key)
    clean 31 3130303030303030
    # Ensure the data is deleted
    checksum_new=$(checksum 31 3130303030303030)

    if [ "$checksum_new" != "$checksum_empty" ];then
        echo "failed to delete data in range"
        fail_and_exit
    fi

    # restore rawkv
    echo "restore start..."
    run_br --pd "$PD_ADDR" restore raw -s "local://$BACKUP_DIR" --start 31 --end 3130303030303030 --format hex --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef"

    checksum_new=$(checksum 31 3130303030303030)

    if [ "$checksum_new" != "$checksum_ori" ];then
        echo "checksum failed after restore"
        fail_and_exit
    fi

    test_full_rawkv

    # delete data in range[start-key, end-key)
    clean 31 3130303030303030
    # Ensure the data is deleted
    checksum_new=$(checksum 31 3130303030303030)

    if [ "$checksum_new" != "$checksum_empty" ];then
        echo "failed to delete data in range"
        fail_and_exit
    fi

    echo "partial restore start..."
    run_br --pd "$PD_ADDR" restore raw -s "local://$BACKUP_DIR" --start 311111 --end 311122 --format hex --concurrency 4 --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef"
    bin/rawkv --pd "$PD_ADDR" \
        --ca "$TEST_DIR/certs/ca.pem" \
        --cert "$TEST_DIR/certs/br.pem" \
        --key "$TEST_DIR/certs/br.key" \
        --mode scan --start-key 311121 --end-key 33

    checksum_new=$(checksum 31 3130303030303030)

    if [ "$checksum_new" != "$checksum_partial" ];then
        echo "checksum failed after restore"
        fail_and_exit
    fi

    # The keys starting with t lie outside of the range cleaned above, so they
    # are still present at this point. Delete them first; otherwise the checksum
    # comparison after the restore passes even when the restore writes nothing
    # or writes the keys somewhere else.
    clean 745f3132385f725f3131 745f3132385f725f3134
    # Ensure the data is deleted
    checksum_new=$(checksum 745f3132385f725f3131 745f3132385f725f3134)

    if [ "$checksum_new" = "$checksum_t_prefix" ];then
        echo "failed to delete data in range"
        fail_and_exit
    fi

    echo "t prefix restore start..."
    run_br --pd "$PD_ADDR" restore raw -s "local://$BACKUP_DIR" --start "745f3132385f725f3131" --end "745f3132385f725f3134" --format hex --concurrency 4 --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef"
    bin/rawkv --pd "$PD_ADDR" \
        --ca "$TEST_DIR/certs/ca.pem" \
        --cert "$TEST_DIR/certs/br.pem" \
        --key "$TEST_DIR/certs/br.key" \
        --mode scan --start-key 745f3132385f725f3131 --end-key 745f3132385f725f3134

    checksum_new=$(checksum 745f3132385f725f3131 745f3132385f725f3134)

    if [ "$checksum_new" != "$checksum_t_prefix" ];then
        echo "checksum failed after restore"
        fail_and_exit
    fi

    export GO_FAILPOINTS=""
}


# First pass: no failpoints.
run_test ''

# Second pass: inject "region error" to trigger fineGrainedBackup
run_test 'github.com/pingcap/tidb/br/pkg/backup/tikv-region-error=return("region error")'