Watcher: Fix race condition when reloading watches #33157

Merged
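As the diff suggests, the race is addressed in three coordinated places: the lifecycle service now sets its state to STARTING before kicking off the asynchronous start, so a second cluster-state event cannot observe STOPPED and start Watcher twice; the pauseExecution/clearExecutionsAndQueue step moves out of the unsynchronized reload() path into the synchronized, version-checked reloadInner(), so a stale reload can no longer clear work queued for a newer cluster state; and the ticker schedule engine's start() now replaces its schedule map instead of merging into it, so watches removed by a reload stop firing. The new testStartClearsExistingSchedules test covers that last point.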
@@ -110,6 +110,7 @@ public void clusterChanged(ClusterChangedEvent event) {
// if this is not a data node, we need to start it ourselves possibly
if (event.state().nodes().getLocalNode().isDataNode() == false &&
isWatcherStoppedManually == false && this.state.get() == WatcherState.STOPPED) {
this.state.set(WatcherState.STARTING);
watcherService.start(event.state(), () -> this.state.set(WatcherState.STARTED));
return;
}
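Below is a minimal, hypothetical sketch of the guard added in this hunk, assuming (as the diff implies) that state is an AtomicReference of WatcherState and that the surrounding class is Watcher's lifecycle service; the class and method names here are illustrative only, not the real ones.

import java.util.concurrent.atomic.AtomicReference;

enum WatcherState { STOPPED, STARTING, STARTED, STOPPING }

class LifecycleGuardSketch {
    private final AtomicReference<WatcherState> state =
            new AtomicReference<>(WatcherState.STOPPED);

    // Called for every cluster-state update on a non-data node.
    void onClusterChanged(Runnable asyncStart) {
        if (state.get() == WatcherState.STOPPED) {
            // Publish the intent immediately: without this, a second event that
            // arrives before the asynchronous start completes still sees STOPPED
            // and starts the service a second time.
            state.set(WatcherState.STARTING);
            asyncStart.run(); // its completion callback invokes markStarted()
        }
    }

    void markStarted() {
        state.set(WatcherState.STARTED);
    }
}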
@@ -183,9 +183,6 @@ void reload(ClusterState state, String reason) {
// by checking the cluster state version before and after loading the watches we can potentially just exit without applying the
// changes
processedClusterStateVersion.set(state.getVersion());
triggerService.pauseExecution();
int cancelledTaskCount = executionService.clearExecutionsAndQueue();
logger.info("reloading watcher, reason [{}], cancelled [{}] queued tasks", reason, cancelledTaskCount);

executor.execute(wrapWatcherService(() -> reloadInner(state, reason, false),
e -> logger.error("error reloading watcher", e)));
@@ -221,6 +218,7 @@ private synchronized boolean reloadInner(ClusterState state, String reason, bool
if (processedClusterStateVersion.get() != state.getVersion()) {
logger.debug("watch service has not been reloaded for state [{}], another reload for state [{}] in progress",
state.getVersion(), processedClusterStateVersion.get());
return false;
}

Collection<Watch> watches = loadWatches(state);
@@ -231,7 +229,13 @@ private synchronized boolean reloadInner(ClusterState state, String reason, bool

// if we had another state coming in the meantime, we will not start the trigger engines with these watches, but wait
// until the others are loaded
// also this is the place where we pause the trigger service execution and clear the current execution service, so that we make sure
// that existing executions finish, but no new ones are executed
if (processedClusterStateVersion.get() == state.getVersion()) {
triggerService.pauseExecution();
int cancelledTaskCount = executionService.clearExecutionsAndQueue();
logger.info("reloading watcher, reason [{}], cancelled [{}] queued tasks", reason, cancelledTaskCount);

executionService.unPause();
triggerService.start(watches);
if (triggeredWatches.isEmpty() == false) {
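The hunks above move the pause/clear into the synchronized, version-checked path. A condensed, hypothetical sketch of that ordering follows, assuming processedClusterStateVersion is an AtomicLong and reloadInner runs on a generic executor; the TriggerServiceSketch and ExecutionServiceSketch interfaces are stand-ins for the real services, not Watcher's actual API.

import java.util.Collection;
import java.util.Collections;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicLong;

// Hypothetical stand-ins for the real trigger and execution services.
interface TriggerServiceSketch {
    void pauseExecution();
    void start(Collection<String> watches);
}

interface ExecutionServiceSketch {
    int clearExecutionsAndQueue();
    void unPause();
}

class ReloadSketch {
    private final AtomicLong processedClusterStateVersion = new AtomicLong();
    private final ExecutorService executor;
    private final TriggerServiceSketch triggerService;
    private final ExecutionServiceSketch executionService;

    ReloadSketch(ExecutorService executor, TriggerServiceSketch triggerService,
                 ExecutionServiceSketch executionService) {
        this.executor = executor;
        this.triggerService = triggerService;
        this.executionService = executionService;
    }

    void reload(long stateVersion) {
        processedClusterStateVersion.set(stateVersion);
        // The pause/clear no longer happens here, on the unsynchronized path.
        executor.execute(() -> reloadInner(stateVersion));
    }

    synchronized boolean reloadInner(long stateVersion) {
        // A newer cluster state has already superseded this one: bail out early.
        if (processedClusterStateVersion.get() != stateVersion) {
            return false;
        }
        Collection<String> watches = loadWatches(stateVersion);
        // Pause and clear only while this reload still owns the latest version,
        // so a stale reload cannot wipe the queue a newer reload just refilled.
        if (processedClusterStateVersion.get() == stateVersion) {
            triggerService.pauseExecution();
            executionService.clearExecutionsAndQueue();
            executionService.unPause();
            triggerService.start(watches);
            return true;
        }
        return false;
    }

    private Collection<String> loadWatches(long stateVersion) {
        return Collections.singletonList("watch-for-state-" + stateVersion);
    }
}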
@@ -56,7 +56,7 @@ public synchronized void start(Collection<Watch> jobs) {
schedules.put(job.id(), new ActiveSchedule(job.id(), trigger.getSchedule(), startTime));
}
}
this.schedules.putAll(schedules);
this.schedules = schedules;
}

@Override
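The one-line change above swaps a putAll merge for a wholesale assignment. Here is a minimal sketch of that pattern, assuming the schedules field is safely published (for example volatile) and keyed by watch id; the value type is a placeholder for the engine's real ActiveSchedule bookkeeping.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class ScheduleMapSketch {
    // Read by the ticker thread on every tick; replaced wholesale on start().
    private volatile Map<String, Long> schedules = new ConcurrentHashMap<>();

    synchronized void start(Map<String, Long> jobs) {
        // Before the fix: this.schedules.putAll(jobs) kept entries from an
        // earlier start() alive, so watches removed by a reload kept firing.
        // After the fix: assign a fresh map so only the new jobs remain.
        this.schedules = new ConcurrentHashMap<>(jobs);
    }

    Map<String, Long> activeSchedules() {
        return schedules;
    }
}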
@@ -35,7 +35,9 @@
import static org.elasticsearch.xpack.watcher.trigger.schedule.Schedules.daily;
import static org.elasticsearch.xpack.watcher.trigger.schedule.Schedules.interval;
import static org.elasticsearch.xpack.watcher.trigger.schedule.Schedules.weekly;
import static org.hamcrest.Matchers.everyItem;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.startsWith;
import static org.joda.time.DateTimeZone.UTC;
import static org.mockito.Mockito.mock;

@@ -50,8 +52,12 @@ public void init() throws Exception {
}

private TriggerEngine createEngine() {
return new TickerScheduleTriggerEngine(Settings.EMPTY,
mock(ScheduleRegistry.class), clock);
Settings settings = Settings.EMPTY;
// having a low value here speeds up the tests tremendously, we still want to run with the defaults every now and then
if (usually()) {
settings = Settings.builder().put(TickerScheduleTriggerEngine.TICKER_INTERVAL_SETTING.getKey(), "10ms").build();
}
return new TickerScheduleTriggerEngine(settings, mock(ScheduleRegistry.class), clock);
}

private void advanceClockIfNeeded(DateTime newCurrentDateTime) {
@@ -104,6 +110,40 @@ public void accept(Iterable<TriggerEvent> events) {
assertThat(bits.cardinality(), is(count));
}

public void testStartClearsExistingSchedules() throws Exception {
final CountDownLatch latch = new CountDownLatch(1);
List<String> firedWatchIds = new ArrayList<>();
engine.register(new Consumer<Iterable<TriggerEvent>>() {
@Override
public void accept(Iterable<TriggerEvent> events) {
for (TriggerEvent event : events) {
firedWatchIds.add(event.jobName());
}
latch.countDown();
}
});

int count = randomIntBetween(2, 5);
List<Watch> watches = new ArrayList<>();
for (int i = 0; i < count; i++) {
watches.add(createWatch(String.valueOf(i), interval("1s")));
}
engine.start(watches);

watches.clear();
for (int i = 0; i < count; i++) {
watches.add(createWatch("another_id" + i, interval("1s")));
}
engine.start(watches);

advanceClockIfNeeded(new DateTime(clock.millis(), UTC).plusMillis(1100));
if (!latch.await(3 * count, TimeUnit.SECONDS)) {
fail("waiting too long for all watches to be triggered");
}

assertThat(firedWatchIds, everyItem(startsWith("another_id")));
}

public void testAddHourly() throws Exception {
final String name = "job_name";
final CountDownLatch latch = new CountDownLatch(1);