diff --git a/br/pkg/task/restore_raw.go b/br/pkg/task/restore_raw.go new file mode 100644 index 0000000000000..452cccfad8c42 --- /dev/null +++ b/br/pkg/task/restore_raw.go @@ -0,0 +1,171 @@ +// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0. + +package task + +import ( + "context" + + "github.com/pingcap/errors" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/conn" + berrors "github.com/pingcap/tidb/br/pkg/errors" + "github.com/pingcap/tidb/br/pkg/glue" + "github.com/pingcap/tidb/br/pkg/httputil" + "github.com/pingcap/tidb/br/pkg/metautil" + "github.com/pingcap/tidb/br/pkg/restore" + "github.com/pingcap/tidb/br/pkg/summary" + "github.com/spf13/cobra" + "github.com/spf13/pflag" +) + +// RestoreRawConfig is the configuration specific for raw kv restore tasks. +type RestoreRawConfig struct { + RawKvConfig + RestoreCommonConfig +} + +// DefineRawRestoreFlags defines the flags for the raw kv restore command. +func DefineRawRestoreFlags(command *cobra.Command) { + command.Flags().StringP(flagKeyFormat, "", "hex", "start/end key format, support raw|escaped|hex") + command.Flags().StringP(flagTiKVColumnFamily, "", "default", "restore specify cf, correspond to tikv cf") + command.Flags().StringP(flagStartKey, "", "", "restore raw kv start key, key is inclusive") + command.Flags().StringP(flagEndKey, "", "", "restore raw kv end key, key is exclusive") + + DefineRestoreCommonFlags(command.PersistentFlags()) +} + +// ParseFromFlags parses the raw-kv-restore-related flags from the flag set. 
+func (cfg *RestoreRawConfig) ParseFromFlags(flags *pflag.FlagSet) error { + var err error + cfg.Online, err = flags.GetBool(flagOnline) + if err != nil { + return errors.Trace(err) + } + err = cfg.RestoreCommonConfig.ParseFromFlags(flags) + if err != nil { + return errors.Trace(err) + } + return cfg.RawKvConfig.ParseFromFlags(flags) +} + +func (cfg *RestoreRawConfig) adjust() { + cfg.Config.adjust() + cfg.RestoreCommonConfig.adjust() + + if cfg.Concurrency == 0 { + cfg.Concurrency = defaultRestoreConcurrency + } +} + +// RunRestoreRaw starts a raw kv restore task inside the current goroutine. +func RunRestoreRaw(c context.Context, g glue.Glue, cmdName string, cfg *RestoreRawConfig) (err error) { + cfg.adjust() + + defer summary.Summary(cmdName) + ctx, cancel := context.WithCancel(c) + defer cancel() + + // Restore raw does not need domain. + needDomain := false + mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, GetKeepalive(&cfg.Config), cfg.CheckRequirements, needDomain) + if err != nil { + return errors.Trace(err) + } + defer mgr.Close() + + mergeRegionSize := cfg.MergeSmallRegionSizeBytes + mergeRegionCount := cfg.MergeSmallRegionKeyCount + if mergeRegionSize == conn.DefaultMergeRegionSizeBytes && + mergeRegionCount == conn.DefaultMergeRegionKeyCount { + // according to https://github.com/pingcap/tidb/issues/34167. + // we should get the real config from tikv to adapt the dynamic region. + httpCli := httputil.NewClient(mgr.GetTLSConfig()) + mergeRegionSize, mergeRegionCount, err = mgr.GetMergeRegionSizeAndCount(ctx, httpCli) + if err != nil { + return errors.Trace(err) + } + } + + keepaliveCfg := GetKeepalive(&cfg.Config) + // sometimes we have pooled the connections. + // sending heartbeats in idle times is useful. 
+ keepaliveCfg.PermitWithoutStream = true + client := restore.NewRestoreClient(mgr.GetPDClient(), mgr.GetTLSConfig(), keepaliveCfg, true) + client.SetRateLimit(cfg.RateLimit) + client.SetCrypter(&cfg.CipherInfo) + client.SetConcurrency(uint(cfg.Concurrency)) + if cfg.Online { + client.EnableOnline() + } + client.SetSwitchModeInterval(cfg.SwitchModeInterval) + err = client.Init(g, mgr.GetStorage()) + defer client.Close() + if err != nil { + return errors.Trace(err) + } + + u, s, backupMeta, err := ReadBackupMeta(ctx, metautil.MetaFile, &cfg.Config) + if err != nil { + return errors.Trace(err) + } + reader := metautil.NewMetaReader(backupMeta, s, &cfg.CipherInfo) + if err = client.InitBackupMeta(c, backupMeta, u, reader); err != nil { + return errors.Trace(err) + } + + if !client.IsRawKvMode() { + return errors.Annotate(berrors.ErrRestoreModeMismatch, "cannot do raw restore from transactional data") + } + + files, err := client.GetFilesInRawRange(cfg.StartKey, cfg.EndKey, cfg.CF) + if err != nil { + return errors.Trace(err) + } + archiveSize := reader.ArchiveSize(ctx, files) + g.Record(summary.RestoreDataSize, archiveSize) + + if len(files) == 0 { + log.Info("all files are filtered out from the backup archive, nothing to restore") + return nil + } + summary.CollectInt("restore files", len(files)) + + ranges, _, err := restore.MergeFileRanges( + files, mergeRegionSize, mergeRegionCount) + if err != nil { + return errors.Trace(err) + } + + // Redirect to log if there is no log file to avoid unreadable output. + // TODO: How to show progress? + updateCh := g.StartProgress( + ctx, + "Raw Restore", + // Split/Scatter + Download/Ingest + int64(len(ranges)+len(files)), + !cfg.LogProgress) + + // RawKV restore does not need to rewrite keys. 
+ err = restore.SplitRanges(ctx, client, ranges, nil, updateCh, true) + if err != nil { + return errors.Trace(err) + } + + restoreSchedulers, err := restorePreWork(ctx, client, mgr, true) + if err != nil { + return errors.Trace(err) + } + defer restorePostWork(ctx, client, restoreSchedulers) + + err = client.RestoreRaw(ctx, cfg.StartKey, cfg.EndKey, files, updateCh) + if err != nil { + return errors.Trace(err) + } + + // Restore has finished. + updateCh.Close() + + // Set task summary to success status. + summary.SetSuccessStatus(true) + return nil +} diff --git a/br/tests/br_rawkv/run.sh b/br/tests/br_rawkv/run.sh new file mode 100755 index 0000000000000..b32cca0f8e41f --- /dev/null +++ b/br/tests/br_rawkv/run.sh @@ -0,0 +1,189 @@ +#!/bin/sh +# +# Copyright 2019 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eux + +# restart service without tiflash +source $( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/../_utils/run_services +start_services --no-tiflash + +BACKUP_DIR=$TEST_DIR/"raw_backup" +BACKUP_FULL=$TEST_DIR/"rawkv-full" + +checksum() { + bin/rawkv --pd $PD_ADDR \ + --ca "$TEST_DIR/certs/ca.pem" \ + --cert "$TEST_DIR/certs/br.pem" \ + --key "$TEST_DIR/certs/br.key" \ + --mode checksum --start-key $1 --end-key $2 | grep result | awk '{print $3}' +} + +fail_and_exit() { + echo "TEST: [$TEST_NAME] failed!" 
+ exit 1 +} + +clean() { + bin/rawkv --pd $PD_ADDR \ + --ca "$TEST_DIR/certs/ca.pem" \ + --cert "$TEST_DIR/certs/br.pem" \ + --key "$TEST_DIR/certs/br.key" \ + --mode delete --start-key $1 --end-key $2 +} + +test_full_rawkv() { + check_range_start=00 + check_range_end=ff + + rm -rf $BACKUP_FULL + + checksum_full=$(checksum $check_range_start $check_range_end) + # backup current state of key-values + # raw backup is not working with range [nil, nil]. TODO: fix it. + run_br --pd $PD_ADDR backup raw -s "local://$BACKUP_FULL" --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" --start $check_range_start --format hex + + clean $check_range_start $check_range_end + # Ensure the data is deleted + checksum_new=$(checksum $check_range_start $check_range_end) + if [ "$checksum_new" == "$checksum_full" ];then + echo "failed to delete data in range" + fail_and_exit + fi + + run_br --pd $PD_ADDR restore raw -s "local://$BACKUP_FULL" --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" --start $check_range_start --format hex + checksum_new=$(checksum $check_range_start $check_range_end) + if [ "$checksum_new" != "$checksum_full" ];then + echo "failed to restore" + fail_and_exit + fi +} + +checksum_empty=$(checksum 31 3130303030303030) + +run_test() { + if [ -z "$1" ];then + echo "run test" + else + export GO_FAILPOINTS="$1" + echo "run test with failpoints: $GO_FAILPOINTS" + fi + + rm -rf $BACKUP_DIR + clean 31 3130303030303030 + + # generate raw kv randomly in range[start-key, end-key) in 10s + bin/rawkv --pd $PD_ADDR \ + --ca "$TEST_DIR/certs/ca.pem" \ + --cert "$TEST_DIR/certs/br.pem" \ + --key "$TEST_DIR/certs/br.key" \ + --mode rand-gen --start-key 31 --end-key 3130303030303030 --duration 10 + + # put some keys around 311122 to check the correctness of endKey of restoring + bin/rawkv --pd $PD_ADDR \ + --ca "$TEST_DIR/certs/ca.pem" \ + --cert "$TEST_DIR/certs/br.pem" \ + --key "$TEST_DIR/certs/br.key" \ + --mode put 
--put-data "311121:31, 31112100:32, 311122:33, 31112200:34, 3111220000:35, 311123:36" + + + # put some keys starting with t. https://github.com/pingcap/tidb/issues/35279 + # t_128_r_12 ----> 745f3132385f725f3132 + # t_128_r_13 ----> 745f3132385f725f3133 + bin/rawkv --pd $PD_ADDR \ + --ca "$TEST_DIR/certs/ca.pem" \ + --cert "$TEST_DIR/certs/br.pem" \ + --key "$TEST_DIR/certs/br.key" \ + --mode put --put-data "745f3132385f725f3132:31, 745f3132385f725f3133:32" + + checksum_ori=$(checksum 31 3130303030303030) + checksum_partial=$(checksum 311111 311122) + checksum_t_prefix=$(checksum 745f3132385f725f3131 745f3132385f725f3134) + + # backup rawkv + echo "backup start..." + run_br --pd $PD_ADDR backup raw -s "local://$BACKUP_DIR" --start 31 --end 745f3132385f725f3134 --format hex --concurrency 4 --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" + + # delete data in range[start-key, end-key) + clean 31 3130303030303030 + # Ensure the data is deleted + checksum_new=$(checksum 31 3130303030303030) + + if [ "$checksum_new" != "$checksum_empty" ];then + echo "failed to delete data in range" + fail_and_exit + fi + + # restore rawkv + echo "restore start..." + run_br --pd $PD_ADDR restore raw -s "local://$BACKUP_DIR" --start 31 --end 3130303030303030 --format hex --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" + + checksum_new=$(checksum 31 3130303030303030) + + if [ "$checksum_new" != "$checksum_ori" ];then + echo "checksum failed after restore" + fail_and_exit + fi + + test_full_rawkv + + # delete data in range[start-key, end-key) + clean 31 3130303030303030 + # Ensure the data is deleted + checksum_new=$(checksum 31 3130303030303030) + + if [ "$checksum_new" != "$checksum_empty" ];then + echo "failed to delete data in range" + fail_and_exit + fi + + echo "partial restore start..." 
+ run_br --pd $PD_ADDR restore raw -s "local://$BACKUP_DIR" --start 311111 --end 311122 --format hex --concurrency 4 --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" + bin/rawkv --pd $PD_ADDR \ + --ca "$TEST_DIR/certs/ca.pem" \ + --cert "$TEST_DIR/certs/br.pem" \ + --key "$TEST_DIR/certs/br.key" \ + --mode scan --start-key 311121 --end-key 33 + + checksum_new=$(checksum 31 3130303030303030) + + if [ "$checksum_new" != "$checksum_partial" ];then + echo "checksum failed after restore" + fail_and_exit + fi + + echo "t prefix restore start..." + run_br --pd $PD_ADDR restore raw -s "local://$BACKUP_DIR" --start "745f3132385f725f3131" --end "745f3132385f725f3134" --format hex --concurrency 4 --crypter.method "aes128-ctr" --crypter.key "0123456789abcdef0123456789abcdef" + bin/rawkv --pd $PD_ADDR \ + --ca "$TEST_DIR/certs/ca.pem" \ + --cert "$TEST_DIR/certs/br.pem" \ + --key "$TEST_DIR/certs/br.key" \ + --mode scan --start-key 745f3132385f725f3131 --end-key 745f3132385f725f3134 + + checksum_new=$(checksum 745f3132385f725f3131 745f3132385f725f3134) + + if [ "$checksum_new" != "$checksum_t_prefix" ];then + echo "checksum failed after restore" + fail_and_exit + fi + + export GO_FAILPOINTS="" +} + + +run_test "" + +# inject "region error" to trigger fineGrainedBackup +run_test "github.com/pingcap/tidb/br/pkg/backup/tikv-region-error=return(\"region error\")"