Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dm/worker: don't exit when failed to read checkpoint in relay (#3345) #4005

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dm/dm/worker/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ func NewServer(cfg *Config) *Server {
}

// Start starts serving.
// This function should only exit when it can't dial DM-master; for other errors it should not exit.
func (s *Server) Start() error {
log.L().Info("starting dm-worker server")
RegistryMetrics()
Expand Down
4 changes: 2 additions & 2 deletions dm/dm/worker/source_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,11 +301,11 @@ func (w *SourceWorker) EnableRelay() (err error) {
defer dcancel()
minLoc, err1 := getMinLocInAllSubTasks(dctx, subTaskCfgs)
if err1 != nil {
return err1
w.l.Error("meet error when EnableRelay", zap.Error(err1))
}

if minLoc != nil {
log.L().Info("get min location in all subtasks", zap.Stringer("location", *minLoc))
w.l.Info("get min location in all subtasks", zap.Stringer("location", *minLoc))
w.cfg.RelayBinLogName = binlog.AdjustPosition(minLoc.Position).Name
w.cfg.RelayBinlogGTID = minLoc.GTIDSetStr()
// set UUIDSuffix when bound to a source
Expand Down
41 changes: 41 additions & 0 deletions dm/tests/new_relay/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,46 @@ function test_cant_dail_upstream() {
cleanup_data $TEST_NAME
}

# test_cant_dail_downstream restarts DM-worker while the TiDB server on
# 127.0.0.1:4000 is down and checks that the worker still answers query-status,
# reporting the connection error instead of exiting.
# NOTE: "dail" in the name is a misspelling of "dial"; it is kept because run()
# invokes the function by this exact name.
function test_cant_dail_downstream() {
# start from a clean state: no leftover test data or processes
cleanup_data $TEST_NAME
cleanup_process

# bring up one DM-master and one DM-worker, checking each is online before moving on
run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml
check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT
run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT

# register the upstream source from a working copy of its config
cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml
dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1

# enable relay for the source on worker1, then start the task
# (the trailing 1 is presumably the expected match count — confirm against run_dm_ctl_with_retry)
run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
"start-relay -s $SOURCE_ID1 worker1" \
"\"result\": true" 1
dmctl_start_task_standalone $cur/conf/dm-task.yaml "--remove-meta"

# stop the worker first so that it is restarted while TiDB is unavailable
kill_dm_worker
# kill tidb: send SIGHUP to tidb-server; errors are ignored so the step
# does not fail when no such process is running
pkill -hup tidb-server 2>/dev/null || true
wait_process_exit tidb-server

# restart the worker with TiDB still down; it must come up and answer RPCs
run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT

# make sure DM-worker doesn't exit: after a short wait, query-status must still
# be served, showing the relay status together with the connection-refused error
# for 127.0.0.1:4000
sleep 2
run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
"query-status -s $SOURCE_ID1" \
"\"relayCatchUpMaster\": true" 1 \
"dial tcp 127.0.0.1:4000: connect: connection refused" 1

# restart tidb on port 4000 so that later test cases find it running again
run_tidb_server 4000 $TIDB_PASSWORD
sleep 2

# tear down everything this case started
cleanup_process
cleanup_data $TEST_NAME
}

function test_kill_dump_connection() {
cleanup_data $TEST_NAME
cleanup_process
Expand Down Expand Up @@ -83,6 +123,7 @@ function test_kill_dump_connection() {
}

function run() {
test_cant_dail_downstream
test_cant_dail_upstream

export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/relay/ReportRelayLogSpaceInBackground=return(1)"
Expand Down