From 7ec5ed8e79f7d88bdfedd72d2f70d752b822ff73 Mon Sep 17 00:00:00 2001
From: Alexander Reelsen
Date: Mon, 27 Aug 2018 09:24:49 +0200
Subject: [PATCH] Watcher: Fix race condition when reloading watches

The current watcher implementation had two issues on reload that could
lead to existing watches not being properly cleared out.

The first fix ensures that when `TriggerService.start()` is called, the
trigger engine implementations remove their current watches instead of
adding to the existing ones in `TickerScheduleTriggerEngine.start()`.

The second fix is more subtle, as the underlying issue is rooted in
concurrent code. When `WatcherService.reload()` is called, it in turn
calls `WatcherService.reloadInner()`, which is synchronized. The reload
method cleared out existing watches and executions. However, there was
still a small window of time in which the clearing could happen in
relatively quick succession for two incoming cluster states, with the
second clearing happening before the trigger engine had been started for
the first. This could lead to `TriggerEngine.start(Collection<Watch> watches)`
being called twice with different sets of watches, without the existing
ones being cleared out, resulting in the execution of watches on this
node that should not be executed.

There were also two minor fixes:

1. If the node is not a data node, we forgot to set the status to
   STARTING when watcher was being started. This should not be a big
   issue, because a non-data node does not spend a lot of time loading,
   as there are no watches which need loading.
2. If a new cluster state came in during a reload, we had two checks in
   place to abort loading the current one: the first before we load all
   the watches of the local node, and the second before watcher starts
   with those new watches. It turned out that the first check was not
   returning, which meant we always tried to load all the watches and
   then failed on the second check. This has been fixed here.
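To make the concurrency reasoning above concrete, here is a minimal, self-contained sketch of the guarded reload pattern this patch arrives at — not Watcher's actual implementation. The cluster state version is recorded up front, and the pause/clear/start work only happens while the version check still holds inside the synchronized method. Only `processedClusterStateVersion` and the shape of `reload()`/`reloadInner()` mirror the patch; the executor, the string watch IDs, and the `main` driver are hypothetical scaffolding.

    import java.util.Collection;
    import java.util.List;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicLong;

    // Sketch of the race-free reload; stand-ins for Watcher's services.
    class ReloadSketch {
        // name taken from the patch; tracks the newest cluster state version seen
        private final AtomicLong processedClusterStateVersion = new AtomicLong(-1);
        private final ExecutorService executor = Executors.newSingleThreadExecutor();

        void reload(long stateVersion, Collection<String> watches) {
            // record the version first; do NOT pause/clear here -- that was the bug:
            // clearing outside the guarded section let two reloads interleave
            processedClusterStateVersion.set(stateVersion);
            executor.execute(() -> reloadInner(stateVersion, watches));
        }

        private synchronized void reloadInner(long stateVersion, Collection<String> watches) {
            if (processedClusterStateVersion.get() != stateVersion) {
                return; // a newer cluster state superseded this reload (the missing `return` fix)
            }
            // ... load watches here (elided) ...
            if (processedClusterStateVersion.get() == stateVersion) {
                // pause, clear the queue, and start atomically with respect to
                // other reloads, since we are still inside the synchronized method
                System.out.println("pause, clear queue, start " + watches + " for state " + stateVersion);
            }
        }

        public static void main(String[] args) throws InterruptedException {
            ReloadSketch sketch = new ReloadSketch();
            sketch.reload(1, List.of("watch_a"));
            sketch.reload(2, List.of("watch_b")); // supersedes version 1
            sketch.executor.shutdown();
            sketch.executor.awaitTermination(1, TimeUnit.SECONDS);
        }
    }

The single-threaded executor stands in for Watcher's generic thread pool; the property that matters is only that `reloadInner()` re-checks the version after any work that could have overlapped with a newer cluster state.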
---
 .../watcher/WatcherLifeCycleService.java      |  1 +
 .../xpack/watcher/WatcherService.java         | 10 +++--
 .../engine/TickerScheduleTriggerEngine.java   |  2 +-
 .../engine/TickerScheduleEngineTests.java     | 44 ++++++++++++++++++-
 4 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherLifeCycleService.java b/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherLifeCycleService.java
index 620d575fc802c..5a691d6101c33 100644
--- a/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherLifeCycleService.java
+++ b/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherLifeCycleService.java
@@ -110,6 +110,7 @@ public void clusterChanged(ClusterChangedEvent event) {
         // if this is not a data node, we need to start it ourselves possibly
         if (event.state().nodes().getLocalNode().isDataNode() == false &&
             isWatcherStoppedManually == false && this.state.get() == WatcherState.STOPPED) {
+            this.state.set(WatcherState.STARTING);
             watcherService.start(event.state(), () -> this.state.set(WatcherState.STARTED));
             return;
         }
diff --git a/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherService.java b/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherService.java
index 49915674fe9e2..599287bb50a76 100644
--- a/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherService.java
+++ b/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/WatcherService.java
@@ -183,9 +183,6 @@ void reload(ClusterState state, String reason) {
         // by checking the cluster state version before and after loading the watches we can potentially just exit without applying the
         // changes
         processedClusterStateVersion.set(state.getVersion());
-        triggerService.pauseExecution();
-        int cancelledTaskCount = executionService.clearExecutionsAndQueue();
-        logger.info("reloading watcher, reason [{}], cancelled [{}] queued tasks", reason, cancelledTaskCount);
 
         executor.execute(wrapWatcherService(() -> reloadInner(state, reason, false),
             e -> logger.error("error reloading watcher", e)));
@@ -221,6 +218,7 @@ private synchronized boolean reloadInner(ClusterState state, String reason, bool
         if (processedClusterStateVersion.get() != state.getVersion()) {
             logger.debug("watch service has not been reloaded for state [{}], another reload for state [{}] in progress",
                 state.getVersion(), processedClusterStateVersion.get());
+            return false;
         }
 
         Collection<Watch> watches = loadWatches(state);
@@ -231,7 +229,13 @@ private synchronized boolean reloadInner(ClusterState state, String reason, bool
 
         // if we had another state coming in the meantime, we will not start the trigger engines with these watches, but wait
         // until the others are loaded
+        // also this is the place where we pause the trigger service execution and clear the current execution service, so that we make sure
+        // that existing executions finish, but no new ones are executed
         if (processedClusterStateVersion.get() == state.getVersion()) {
+            triggerService.pauseExecution();
+            int cancelledTaskCount = executionService.clearExecutionsAndQueue();
+            logger.info("reloading watcher, reason [{}], cancelled [{}] queued tasks", reason, cancelledTaskCount);
+            executionService.unPause();
             triggerService.start(watches);
 
             if (triggeredWatches.isEmpty() == false) {
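A side note on the `WatcherLifeCycleService` hunk above: flipping the state to STARTING before the asynchronous start closes a similar window, because a second cluster-state event would otherwise still observe STOPPED and kick off a second start. A rough, hypothetical sketch of that idea follows; the real code checks `get()` and then calls `set()`, which is safe on the single cluster-state applier thread, while `compareAndSet` below just makes the claim on the transition explicit.

    import java.util.concurrent.CompletableFuture;
    import java.util.concurrent.atomic.AtomicReference;

    // Sketch only: shows why the state must flip to STARTING *before* the async start.
    class LifecycleSketch {
        enum WatcherState { STOPPED, STARTING, STARTED }

        private final AtomicReference<WatcherState> state = new AtomicReference<>(WatcherState.STOPPED);

        void onClusterChanged() {
            // claim the STOPPED -> STARTING transition; a second event arriving
            // before startup finishes now sees STARTING and does nothing
            if (state.compareAndSet(WatcherState.STOPPED, WatcherState.STARTING)) {
                startAsync(() -> state.set(WatcherState.STARTED));
            }
        }

        private void startAsync(Runnable onStarted) {
            // stand-in for watcherService.start(state, callback)
            CompletableFuture.runAsync(() -> { /* load watches, start trigger engines */ })
                .thenRun(onStarted);
        }

        public static void main(String[] args) throws Exception {
            LifecycleSketch sketch = new LifecycleSketch();
            sketch.onClusterChanged(); // starts
            sketch.onClusterChanged(); // no-op: state is already STARTING or STARTED
            Thread.sleep(100);
            System.out.println("final state: " + sketch.state.get());
        }
    }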
diff --git a/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleTriggerEngine.java b/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleTriggerEngine.java
index 05aa7cf302817..4c10f794880b9 100644
--- a/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleTriggerEngine.java
+++ b/x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleTriggerEngine.java
@@ -56,7 +56,7 @@ public synchronized void start(Collection<Watch> jobs) {
                 schedules.put(job.id(), new ActiveSchedule(job.id(), trigger.getSchedule(), startTime));
             }
         }
-        this.schedules.putAll(schedules);
+        this.schedules = schedules;
     }
 
     @Override
diff --git a/x-pack/plugin/watcher/src/test/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleEngineTests.java b/x-pack/plugin/watcher/src/test/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleEngineTests.java
index 7949998867b48..6680b38ab94b3 100644
--- a/x-pack/plugin/watcher/src/test/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleEngineTests.java
+++ b/x-pack/plugin/watcher/src/test/java/org/elasticsearch/xpack/watcher/trigger/schedule/engine/TickerScheduleEngineTests.java
@@ -35,7 +35,9 @@
 import static org.elasticsearch.xpack.watcher.trigger.schedule.Schedules.daily;
 import static org.elasticsearch.xpack.watcher.trigger.schedule.Schedules.interval;
 import static org.elasticsearch.xpack.watcher.trigger.schedule.Schedules.weekly;
+import static org.hamcrest.Matchers.everyItem;
 import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.startsWith;
 import static org.joda.time.DateTimeZone.UTC;
 import static org.mockito.Mockito.mock;
 
@@ -50,8 +52,12 @@ public void init() throws Exception {
     }
 
     private TriggerEngine createEngine() {
-        return new TickerScheduleTriggerEngine(Settings.EMPTY,
-            mock(ScheduleRegistry.class), clock);
+        Settings settings = Settings.EMPTY;
+        // having a low value here speeds up the tests tremendously, we still want to run with the defaults every now and then
+        if (usually()) {
+            settings = Settings.builder().put(TickerScheduleTriggerEngine.TICKER_INTERVAL_SETTING.getKey(), "10ms").build();
+        }
+        return new TickerScheduleTriggerEngine(settings, mock(ScheduleRegistry.class), clock);
     }
 
     private void advanceClockIfNeeded(DateTime newCurrentDateTime) {
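Returning to the `TickerScheduleTriggerEngine` hunk above: swapping the map reference, rather than calling `putAll`, is what guarantees that a restart drops schedules for watches no longer assigned to this node. A tiny standalone demonstration of the difference, using plain `HashMap`s for illustration — in the engine itself the field would also need to be `volatile` so the ticker thread sees the swap:

    import java.util.HashMap;
    import java.util.Map;

    // Demonstrates why `this.schedules = schedules` fixes the leak that putAll caused.
    class MapSwapSketch {
        public static void main(String[] args) {
            Map<String, String> current = new HashMap<>();
            current.put("old_watch", "every 1s"); // schedule from the previous start()

            Map<String, String> fresh = new HashMap<>();
            fresh.put("new_watch", "every 5s"); // schedules for the new start()

            // Before the fix: the merge keeps the stale schedule alive, so old_watch
            // keeps firing even though it is no longer assigned to this node.
            Map<String, String> merged = new HashMap<>(current);
            merged.putAll(fresh);
            System.out.println("putAll: " + merged.keySet()); // contains old_watch AND new_watch

            // After the fix: the reference swap discards everything from the previous start().
            current = fresh;
            System.out.println("swap:   " + current.keySet()); // contains only new_watch
        }
    }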
@@ -104,6 +110,40 @@ public void accept(Iterable<TriggerEvent> events) {
         assertThat(bits.cardinality(), is(count));
     }
 
+    public void testStartClearsExistingSchedules() throws Exception {
+        final CountDownLatch latch = new CountDownLatch(1);
+        List<String> firedWatchIds = new ArrayList<>();
+        engine.register(new Consumer<Iterable<TriggerEvent>>() {
+            @Override
+            public void accept(Iterable<TriggerEvent> events) {
+                for (TriggerEvent event : events) {
+                    firedWatchIds.add(event.jobName());
+                }
+                latch.countDown();
+            }
+        });
+
+        int count = randomIntBetween(2, 5);
+        List<Watch> watches = new ArrayList<>();
+        for (int i = 0; i < count; i++) {
+            watches.add(createWatch(String.valueOf(i), interval("1s")));
+        }
+        engine.start(watches);
+
+        watches.clear();
+        for (int i = 0; i < count; i++) {
+            watches.add(createWatch("another_id" + i, interval("1s")));
+        }
+        engine.start(watches);
+
+        advanceClockIfNeeded(new DateTime(clock.millis(), UTC).plusMillis(1100));
+        if (!latch.await(3 * count, TimeUnit.SECONDS)) {
+            fail("waiting too long for all watches to be triggered");
+        }
+
+        assertThat(firedWatchIds, everyItem(startsWith("another_id")));
+    }
+
     public void testAddHourly() throws Exception {
         final String name = "job_name";
         final CountDownLatch latch = new CountDownLatch(1);
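Finally, the latch pattern used in `testStartClearsExistingSchedules` is worth calling out on its own: register a consumer, let the engine fire on another thread, and bound the wait so a missed firing fails the test instead of hanging it. A condensed, framework-free version of the same idea — everything below is hypothetical stand-in code, not the Watcher test fixture:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.TimeUnit;
    import java.util.function.Consumer;

    // Condensed sketch of the latch-based async test pattern.
    class LatchPatternSketch {
        public static void main(String[] args) throws InterruptedException {
            CountDownLatch latch = new CountDownLatch(1);
            List<String> fired = new ArrayList<>();

            // the "consumer" a trigger engine would call back into
            Consumer<String> listener = id -> {
                fired.add(id);
                latch.countDown();
            };

            // simulate the engine firing on its own thread; countDown/await gives
            // the main thread a happens-before edge to read `fired` safely
            new Thread(() -> listener.accept("another_id0")).start();

            // bounded wait: a missing event fails fast instead of hanging the suite
            if (latch.await(3, TimeUnit.SECONDS) == false) {
                throw new AssertionError("waiting too long for the watch to be triggered");
            }
            System.out.println("fired: " + fired);
        }
    }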