Skip to content

Commit

Permalink
Solve the deadlock problem caused by queuing (#13191)
Browse files Browse the repository at this point in the history
* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing, move the event to the tail by throwing an exception

Co-authored-by: wfs <wangfushun@cdqcp.cpm>

(cherry picked from commit 7a0a2c2)
  • Loading branch information
dahai1996 authored and zhongjiajie committed Dec 28, 2022
1 parent 19771e5 commit bc1cf25
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.dolphinscheduler.server.master.event;

/**
 * Signals a recoverable failure while handling a state event.
 *
 * <p>When this exception is caught, the event is moved to the tail of the queue.
 */
public class StateEventHandleFailure extends Exception {

    /**
     * Creates a failure that carries only a detail message.
     *
     * @param message description of why handling the state event failed
     */
    public StateEventHandleFailure(final String message) {
        super(message);
    }

    /**
     * Creates a failure that carries a detail message and the underlying cause.
     *
     * @param message   description of why handling the state event failed
     * @param throwable the underlying cause, preserved for later inspection via {@link #getCause()}
     */
    public StateEventHandleFailure(final String message, final Throwable throwable) {
        super(message, throwable);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ public interface StateEventHandler {
* @param stateEvent given state event.
* @throws StateEventHandleException this exception means it can be recovered.
* @throws StateEventHandleError this exception means it cannot be recovered, so the event need to drop.
* @throws StateEventHandleFailure this means it can be recovered, and the event will be moved to the tail of the queue.
*/
boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent)
throws StateEventHandleException, StateEventHandleError;
boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
StateEvent stateEvent) throws StateEventHandleException, StateEventHandleError, StateEventHandleFailure;

StateEventType getEventType();
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,24 @@
import org.apache.dolphinscheduler.common.enums.StateEventType;
import org.apache.dolphinscheduler.server.master.runner.WorkflowExecuteRunnable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.auto.service.AutoService;

@AutoService(StateEventHandler.class)
public class TaskWaitTaskGroupStateHandler implements StateEventHandler {

private static final Logger logger = LoggerFactory.getLogger(TaskWaitTaskGroupStateHandler.class);

@Override
public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) {
return workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent);
public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
StateEvent stateEvent) throws StateEventHandleFailure {
logger.info("Handle task instance wait task group event, taskInstanceId: {}", stateEvent.getTaskInstanceId());
if (!workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent)) {
throw new StateEventHandleFailure("Task state event handle failed due to robing taskGroup resource failed");
}
return true;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
import org.apache.dolphinscheduler.server.master.event.StateEvent;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleError;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleException;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleFailure;
import org.apache.dolphinscheduler.server.master.event.StateEventHandler;
import org.apache.dolphinscheduler.server.master.event.StateEventHandlerManager;
import org.apache.dolphinscheduler.server.master.metrics.TaskMetrics;
Expand Down Expand Up @@ -279,6 +280,13 @@ public void handleEvents() {
stateEvent,
stateEventHandleException);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (StateEventHandleFailure stateEventHandleFailure) {
logger.error("State event handle failed, will move event to the tail: {}",
stateEvent,
stateEventHandleFailure);
this.stateEvents.remove(stateEvent);
this.stateEvents.offer(stateEvent);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (Exception e) {
// we catch the exception here, since if the state event handle failed, the state event will still keep in the stateEvents queue.
logger.error("State event handle error, get a unknown exception, will retry this event: {}",
Expand Down

0 comments on commit bc1cf25

Please sign in to comment.