From 6de8c6b79d206ecba097a23ecab85753010f551f Mon Sep 17 00:00:00 2001 From: Vladimir Dolzhenko Date: Fri, 31 Aug 2018 21:29:06 +0200 Subject: [PATCH] drop `index.shard.check_on_startup: fix` (#32279) Relates #31389 (cherry picked from commit 3d82a30fadd035228f29ccc00d6c5bab71e9adf6) --- docs/reference/index-modules.asciidoc | 4 +- .../elasticsearch/index/shard/IndexShard.java | 22 +-- .../org/elasticsearch/index/store/Store.java | 15 +- .../index/shard/IndexShardTests.java | 157 +++++++++++++++++- .../index/shard/IndexShardTestCase.java | 107 +++++++++--- 5 files changed, 249 insertions(+), 56 deletions(-) diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc index de57db1f89742..001996de398ce 100644 --- a/docs/reference/index-modules.asciidoc +++ b/docs/reference/index-modules.asciidoc @@ -65,9 +65,7 @@ corruption is detected, it will prevent the shard from being opened. Accepts: `fix`:: - Check for both physical and logical corruption. Segments that were reported - as corrupted will be automatically removed. This option *may result in data loss*. - Use with extreme caution! + The same as `false`. This option is deprecated and will be completely removed in 7.0. WARNING: Expert only. Checking shards may take a lot of time on large indices. -- diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java index 378df05208c6f..6db5c21f56626 100644 --- a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java @@ -297,6 +297,10 @@ public IndexShard( logger.debug("state: [CREATED]"); this.checkIndexOnStartup = indexSettings.getValue(IndexSettings.INDEX_CHECK_ON_STARTUP); + if ("fix".equals(checkIndexOnStartup)) { + deprecationLogger.deprecated("Setting [index.shard.check_on_startup] is set to deprecated value [fix], " + + "which has no effect and will not be accepted in future"); + } this.translogConfig = new TranslogConfig(shardId, shardPath().resolveTranslog(), indexSettings, bigArrays); final String aId = shardRouting.allocationId().getId(); this.globalCheckpointListeners = new GlobalCheckpointListeners(shardId, threadPool.executor(ThreadPool.Names.LISTENER), logger); @@ -1354,7 +1358,7 @@ private void innerOpenEngineAndTranslog() throws IOException { } recoveryState.setStage(RecoveryState.Stage.VERIFY_INDEX); // also check here, before we apply the translog - if (Booleans.isTrue(checkIndexOnStartup)) { + if (Booleans.isTrue(checkIndexOnStartup) || "checksum".equals(checkIndexOnStartup)) { try { checkIndex(); } catch (IOException ex) { @@ -1958,6 +1962,9 @@ void checkIndex() throws IOException { if (store.tryIncRef()) { try { doCheckIndex(); + } catch (IOException e) { + store.markStoreCorrupted(e); + throw e; } finally { store.decRef(); } @@ -2001,18 +2008,7 @@ private void doCheckIndex() throws IOException { return; } logger.warn("check index [failure]\n{}", os.bytes().utf8ToString()); - if ("fix".equals(checkIndexOnStartup)) { - if (logger.isDebugEnabled()) { - logger.debug("fixing index, writing new segments file ..."); - } - store.exorciseIndex(status); - if (logger.isDebugEnabled()) { - logger.debug("index fixed, wrote new segments file \"{}\"", status.segmentsFileName); - } - } else { - // only throw a failure if we are not going to fix the index - throw new IllegalStateException("index check failure but can't fix it"); - } + throw new IOException("index check failure"); } } diff --git a/server/src/main/java/org/elasticsearch/index/store/Store.java b/server/src/main/java/org/elasticsearch/index/store/Store.java index 03b641b69ce57..fa1534ef922c7 100644 --- a/server/src/main/java/org/elasticsearch/index/store/Store.java +++ b/server/src/main/java/org/elasticsearch/index/store/Store.java @@ -134,7 +134,8 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref static final int VERSION_STACK_TRACE = 1; // we write the stack trace too since 1.4.0 static final int VERSION_START = 0; static final int VERSION = VERSION_WRITE_THROWABLE; - static final String CORRUPTED = "corrupted_"; + // public is for test purposes + public static final String CORRUPTED = "corrupted_"; public static final Setting INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING = Setting.timeSetting("index.store.stats_refresh_interval", TimeValue.timeValueSeconds(10), Property.IndexScope); @@ -360,18 +361,6 @@ public CheckIndex.Status checkIndex(PrintStream out) throws IOException { } } - /** - * Repairs the index using the previous returned status from {@link #checkIndex(PrintStream)}. - */ - public void exorciseIndex(CheckIndex.Status status) throws IOException { - metadataLock.writeLock().lock(); - try (CheckIndex checkIndex = new CheckIndex(directory)) { - checkIndex.exorciseIndex(status); - } finally { - metadataLock.writeLock().unlock(); - } - } - public StoreStats stats() throws IOException { ensureOpen(); return new StoreStats(directory.estimateSize()); diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java index 1cb9cb7c3d47f..83d930c3fdba8 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexCommit; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; @@ -110,6 +111,7 @@ import org.elasticsearch.snapshots.SnapshotId; import org.elasticsearch.snapshots.SnapshotInfo; import org.elasticsearch.snapshots.SnapshotShardFailure; +import org.elasticsearch.test.CorruptionUtils; import org.elasticsearch.test.DummyShardLock; import org.elasticsearch.test.FieldMaskingReader; import org.elasticsearch.test.VersionUtils; @@ -118,7 +120,11 @@ import java.io.IOException; import java.nio.charset.Charset; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -1229,7 +1235,7 @@ public String[] listAll() throws IOException { }; try (Store store = createStore(shardId, new IndexSettings(metaData, Settings.EMPTY), directory)) { - IndexShard shard = newShard(shardRouting, shardPath, metaData, store, + IndexShard shard = newShard(shardRouting, shardPath, metaData, i -> store, null, new InternalEngineFactory(), () -> { }, EMPTY_EVENT_LISTENER); AtomicBoolean failureCallbackTriggered = new AtomicBoolean(false); @@ -2569,6 +2575,143 @@ public void testReadSnapshotConcurrently() throws IOException, InterruptedExcept closeShards(newShard); } + public void testIndexCheckOnStartup() throws Exception { + final IndexShard indexShard = newStartedShard(true); + + final long numDocs = between(10, 100); + for (long i = 0; i < numDocs; i++) { + indexDoc(indexShard, "_doc", Long.toString(i), "{}"); + } + indexShard.flush(new FlushRequest()); + closeShards(indexShard); + + final ShardPath shardPath = indexShard.shardPath(); + + final Path indexPath = corruptIndexFile(shardPath); + + final AtomicInteger corruptedMarkerCount = new AtomicInteger(); + final SimpleFileVisitor corruptedVisitor = new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) { + corruptedMarkerCount.incrementAndGet(); + } + return FileVisitResult.CONTINUE; + } + }; + Files.walkFileTree(indexPath, corruptedVisitor); + + assertThat("corruption marker should not be there", corruptedMarkerCount.get(), equalTo(0)); + + final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(), + RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE + ); + // start shard and perform index check on startup. It enforce shard to fail due to corrupted index files + final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData()) + .settings(Settings.builder() + .put(indexShard.indexSettings.getSettings()) + .put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("true", "checksum"))) + .build(); + + IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData, + null, null, indexShard.engineFactory, + indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER); + + final IndexShardRecoveryException indexShardRecoveryException = + expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true)); + assertThat(indexShardRecoveryException.getMessage(), equalTo("failed recovery")); + + // check that corrupt marker is there + Files.walkFileTree(indexPath, corruptedVisitor); + assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1)); + + try { + closeShards(corruptedShard); + } catch (RuntimeException e) { + assertThat(e.getMessage(), equalTo("CheckIndex failed")); + } + } + + public void testShardDoesNotStartIfCorruptedMarkerIsPresent() throws Exception { + final IndexShard indexShard = newStartedShard(true); + + final long numDocs = between(10, 100); + for (long i = 0; i < numDocs; i++) { + indexDoc(indexShard, "_doc", Long.toString(i), "{}"); + } + indexShard.flush(new FlushRequest()); + closeShards(indexShard); + + final ShardPath shardPath = indexShard.shardPath(); + + final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(), + RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE + ); + final IndexMetaData indexMetaData = indexShard.indexSettings().getIndexMetaData(); + + final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME); + + // create corrupted marker + final String corruptionMessage = "fake ioexception"; + try(Store store = createStore(indexShard.indexSettings(), shardPath)) { + store.markStoreCorrupted(new IOException(corruptionMessage)); + } + + // try to start shard on corrupted files + final IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData, + null, null, indexShard.engineFactory, + indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER); + + final IndexShardRecoveryException exception1 = expectThrows(IndexShardRecoveryException.class, + () -> newStartedShard(p -> corruptedShard, true)); + assertThat(exception1.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)")); + closeShards(corruptedShard); + + final AtomicInteger corruptedMarkerCount = new AtomicInteger(); + final SimpleFileVisitor corruptedVisitor = new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) { + corruptedMarkerCount.incrementAndGet(); + } + return FileVisitResult.CONTINUE; + } + }; + Files.walkFileTree(indexPath, corruptedVisitor); + assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1)); + + // try to start another time shard on corrupted files + final IndexShard corruptedShard2 = newShard(shardRouting, shardPath, indexMetaData, + null, null, indexShard.engineFactory, + indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER); + + final IndexShardRecoveryException exception2 = expectThrows(IndexShardRecoveryException.class, + () -> newStartedShard(p -> corruptedShard2, true)); + assertThat(exception2.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)")); + closeShards(corruptedShard2); + + // check that corrupt marker is there + corruptedMarkerCount.set(0); + Files.walkFileTree(indexPath, corruptedVisitor); + assertThat("store still has a single corrupt marker", corruptedMarkerCount.get(), equalTo(1)); + } + + private Path corruptIndexFile(ShardPath shardPath) throws IOException { + final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME); + final Path[] filesToCorrupt = + Files.walk(indexPath) + .filter(p -> { + final String name = p.getFileName().toString(); + return Files.isRegularFile(p) + && name.startsWith("extra") == false // Skip files added by Lucene's ExtrasFS + && IndexWriter.WRITE_LOCK_NAME.equals(name) == false + && name.startsWith("segments_") == false && name.endsWith(".si") == false; + }) + .toArray(Path[]::new); + CorruptionUtils.corruptFile(random(), filesToCorrupt); + return indexPath; + } + /** * Simulates a scenario that happens when we are async fetching snapshot metadata from GatewayService * and checking index concurrently. This should always be possible without any exception. @@ -2592,7 +2735,7 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception { final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData()) .settings(Settings.builder() .put(indexShard.indexSettings.getSettings()) - .put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum", "fix"))) + .put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum"))) .build(); final IndexShard newShard = newShard(shardRouting, indexShard.shardPath(), indexMetaData, null, null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER); @@ -2634,6 +2777,16 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception { closeShards(newShard); } + public void testCheckOnStartupDeprecatedValue() throws Exception { + final Settings settings = Settings.builder().put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), "fix").build(); + + final IndexShard newShard = newShard(true, settings); + closeShards(newShard); + + assertWarnings("Setting [index.shard.check_on_startup] is set to deprecated value [fix], " + + "which has no effect and will not be accepted in future"); + } + class Result { private final int localCheckpoint; private final int maxSeqNo; diff --git a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java index 82ee78cfee1c7..56c78930244e7 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java @@ -37,6 +37,7 @@ import org.elasticsearch.cluster.routing.ShardRoutingHelper; import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.routing.TestShardRouting; +import org.elasticsearch.common.CheckedFunction; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.lucene.uid.Versions; @@ -163,7 +164,6 @@ public Settings threadPoolSettings() { return Settings.EMPTY; } - protected Store createStore(IndexSettings indexSettings, ShardPath shardPath) throws IOException { return createStore(shardPath.getShardId(), indexSettings, newFSDirectory(shardPath.resolveIndex())); } @@ -176,7 +176,6 @@ public Directory newDirectory() throws IOException { } }; return new Store(shardId, indexSettings, directoryService, new DummyShardLock(shardId)); - } /** @@ -186,31 +185,64 @@ public Directory newDirectory() throws IOException { * (ready to recover from another shard) */ protected IndexShard newShard(boolean primary) throws IOException { - ShardRouting shardRouting = TestShardRouting.newShardRouting(new ShardId("index", "_na_", 0), randomAlphaOfLength(10), primary, - ShardRoutingState.INITIALIZING, - primary ? RecoverySource.StoreRecoverySource.EMPTY_STORE_INSTANCE : RecoverySource.PeerRecoverySource.INSTANCE); - return newShard(shardRouting); + return newShard(primary, Settings.EMPTY); } /** - * creates a new initializing shard. The shard will have its own unique data path. + * Creates a new initializing shard. The shard will have its own unique data path. * - * @param shardRouting the {@link ShardRouting} to use for this shard - * @param listeners an optional set of listeners to add to the shard + * @param primary indicates whether to a primary shard (ready to recover from an empty store) or a replica (ready to recover from + * another shard) + */ + protected IndexShard newShard(final boolean primary, final Settings settings) throws IOException { + return newShard(primary, settings, new InternalEngineFactory()); + } + + /** + * Creates a new initializing shard. The shard will have its own unique data path. + * + * @param primary indicates whether to a primary shard (ready to recover from an empty store) or a replica (ready to recover from + * another shard) + * @param settings the settings to use for this shard + * @param engineFactory the engine factory to use for this shard + */ + protected IndexShard newShard(boolean primary, Settings settings, EngineFactory engineFactory) throws IOException { + final RecoverySource recoverySource = + primary ? RecoverySource.StoreRecoverySource.EMPTY_STORE_INSTANCE : RecoverySource.PeerRecoverySource.INSTANCE; + final ShardRouting shardRouting = + TestShardRouting.newShardRouting( + new ShardId("index", "_na_", 0), randomAlphaOfLength(10), primary, ShardRoutingState.INITIALIZING, recoverySource); + return newShard(shardRouting, settings, engineFactory); + } + + protected IndexShard newShard(ShardRouting shardRouting, final IndexingOperationListener... listeners) throws IOException { + return newShard(shardRouting, Settings.EMPTY, new InternalEngineFactory(), listeners); + } + + /** + * Creates a new initializing shard. The shard will have its own unique data path. + * + * @param shardRouting the {@link ShardRouting} to use for this shard + * @param settings the settings to use for this shard + * @param engineFactory the engine factory to use for this shard + * @param listeners an optional set of listeners to add to the shard */ protected IndexShard newShard( final ShardRouting shardRouting, + final Settings settings, + final EngineFactory engineFactory, final IndexingOperationListener... listeners) throws IOException { assert shardRouting.initializing() : shardRouting; - Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) - .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) - .build(); + Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) + .put(settings) + .build(); IndexMetaData.Builder metaData = IndexMetaData.builder(shardRouting.getIndexName()) - .settings(settings) + .settings(indexSettings) .primaryTerm(0, primaryTerm) .putMapping("_doc", "{ \"properties\": {} }"); - return newShard(shardRouting, metaData.build(), listeners); + return newShard(shardRouting, metaData.build(), engineFactory, listeners); } /** @@ -225,7 +257,7 @@ protected IndexShard newShard(ShardId shardId, boolean primary, IndexingOperatio ShardRouting shardRouting = TestShardRouting.newShardRouting(shardId, randomAlphaOfLength(5), primary, ShardRoutingState.INITIALIZING, primary ? RecoverySource.StoreRecoverySource.EMPTY_STORE_INSTANCE : RecoverySource.PeerRecoverySource.INSTANCE); - return newShard(shardRouting, listeners); + return newShard(shardRouting, Settings.EMPTY, new InternalEngineFactory(), listeners); } /** @@ -265,9 +297,10 @@ protected IndexShard newShard(ShardId shardId, boolean primary, String nodeId, I * @param indexMetaData indexMetaData for the shard, including any mapping * @param listeners an optional set of listeners to add to the shard */ - protected IndexShard newShard(ShardRouting routing, IndexMetaData indexMetaData, IndexingOperationListener... listeners) + protected IndexShard newShard( + ShardRouting routing, IndexMetaData indexMetaData, EngineFactory engineFactory, IndexingOperationListener... listeners) throws IOException { - return newShard(routing, indexMetaData, null, new InternalEngineFactory(), () -> {}, listeners); + return newShard(routing, indexMetaData, null, engineFactory, () -> {}, listeners); } /** @@ -298,23 +331,25 @@ protected IndexShard newShard(ShardRouting routing, IndexMetaData indexMetaData, * @param routing shard routing to use * @param shardPath path to use for shard data * @param indexMetaData indexMetaData for the shard, including any mapping - * @param store an optional custom store to use. If null a default file based store will be created + * @param storeProvider an optional custom store provider to use. If null a default file based store will be created * @param indexSearcherWrapper an optional wrapper to be used during searchers * @param globalCheckpointSyncer callback for syncing global checkpoints * @param indexEventListener index event listener * @param listeners an optional set of listeners to add to the shard */ protected IndexShard newShard(ShardRouting routing, ShardPath shardPath, IndexMetaData indexMetaData, - @Nullable Store store, @Nullable IndexSearcherWrapper indexSearcherWrapper, + @Nullable CheckedFunction storeProvider, + @Nullable IndexSearcherWrapper indexSearcherWrapper, @Nullable EngineFactory engineFactory, Runnable globalCheckpointSyncer, IndexEventListener indexEventListener, IndexingOperationListener... listeners) throws IOException { final Settings nodeSettings = Settings.builder().put("node.name", routing.currentNodeId()).build(); final IndexSettings indexSettings = new IndexSettings(indexMetaData, nodeSettings); final IndexShard indexShard; - if (store == null) { - store = createStore(indexSettings, shardPath); + if (storeProvider == null) { + storeProvider = is -> createStore(is, shardPath); } + final Store store = storeProvider.apply(indexSettings); boolean success = false; try { IndexCache indexCache = new IndexCache(indexSettings, new DisabledQueryCache(indexSettings), null); @@ -372,7 +407,7 @@ protected IndexShard reinitShard(IndexShard current, ShardRouting routing, Index } /** - * creates a new empyu shard and starts it. The shard will be either a replica or a primary. + * Creates a new empty shard and starts it. The shard will randomly be a replica or a primary. */ protected IndexShard newStartedShard() throws IOException { return newStartedShard(randomBoolean()); @@ -383,8 +418,30 @@ protected IndexShard newStartedShard() throws IOException { * * @param primary controls whether the shard will be a primary or a replica. */ - protected IndexShard newStartedShard(boolean primary) throws IOException { - IndexShard shard = newShard(primary); + protected IndexShard newStartedShard(final boolean primary) throws IOException { + return newStartedShard(primary, Settings.EMPTY, new InternalEngineFactory()); + } + /** + * Creates a new empty shard with the specified settings and engine factory and starts it. + * + * @param primary controls whether the shard will be a primary or a replica. + * @param settings the settings to use for this shard + * @param engineFactory the engine factory to use for this shard + */ + protected IndexShard newStartedShard( + final boolean primary, final Settings settings, final EngineFactory engineFactory) throws IOException { + return newStartedShard(p -> newShard(p, settings, engineFactory), primary); + } + + /** + * creates a new empty shard and starts it. + * + * @param shardFunction shard factory function + * @param primary controls whether the shard will be a primary or a replica. + */ + protected IndexShard newStartedShard(CheckedFunction shardFunction, + boolean primary) throws IOException { + IndexShard shard = shardFunction.apply(primary); if (primary) { recoverShardFromStore(shard); } else {