drop index.shard.check_on_startup: fix (#32279)
Relates #31389

(cherry picked from commit 3d82a30)
vladimirdolzhenko committed Aug 31, 2018
1 parent cf28ba6 commit 6de8c6b
Showing 5 changed files with 249 additions and 56 deletions.
4 changes: 1 addition & 3 deletions docs/reference/index-modules.asciidoc
@@ -65,9 +65,7 @@ corruption is detected, it will prevent the shard from being opened. Accepts:

`fix`::

Check for both physical and logical corruption. Segments that were reported
as corrupted will be automatically removed. This option *may result in data loss*.
Use with extreme caution!
The same as `false`. This option is deprecated and will be completely removed in 7.0.

WARNING: Expert only. Checking shards may take a lot of time on large indices.
--
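For illustration only (not part of this commit), a minimal Java sketch of the behaviour described above, assuming the standard `Settings` builder; the class name is hypothetical. In 6.x the deprecated value is still accepted, but it behaves like `false` and only emits the deprecation warning added to `IndexShard` below.

import org.elasticsearch.common.settings.Settings;

public class CheckOnStartupSettingExample {
    public static void main(String[] args) {
        // Deprecated: still accepted in 6.x, behaves exactly like "false" and only
        // triggers the deprecation warning added to IndexShard in this commit.
        Settings deprecated = Settings.builder()
            .put("index.shard.check_on_startup", "fix")
            .build();

        // Supported values going forward: "false" (the default), "true", and "checksum".
        Settings supported = Settings.builder()
            .put("index.shard.check_on_startup", "checksum")
            .build();

        System.out.println(deprecated + " / " + supported);
    }
}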
22 changes: 9 additions & 13 deletions server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -297,6 +297,10 @@ public IndexShard(
logger.debug("state: [CREATED]");

this.checkIndexOnStartup = indexSettings.getValue(IndexSettings.INDEX_CHECK_ON_STARTUP);
if ("fix".equals(checkIndexOnStartup)) {
deprecationLogger.deprecated("Setting [index.shard.check_on_startup] is set to deprecated value [fix], "
+ "which has no effect and will not be accepted in future");
}
this.translogConfig = new TranslogConfig(shardId, shardPath().resolveTranslog(), indexSettings, bigArrays);
final String aId = shardRouting.allocationId().getId();
this.globalCheckpointListeners = new GlobalCheckpointListeners(shardId, threadPool.executor(ThreadPool.Names.LISTENER), logger);
@@ -1354,7 +1358,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
}
recoveryState.setStage(RecoveryState.Stage.VERIFY_INDEX);
// also check here, before we apply the translog
if (Booleans.isTrue(checkIndexOnStartup)) {
if (Booleans.isTrue(checkIndexOnStartup) || "checksum".equals(checkIndexOnStartup)) {
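// "checksum" now also triggers this pre-translog check; previously only a boolean "true" value did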
try {
checkIndex();
} catch (IOException ex) {
@@ -1958,6 +1962,9 @@ void checkIndex() throws IOException {
if (store.tryIncRef()) {
try {
doCheckIndex();
} catch (IOException e) {
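// record the failure as an on-disk corruption marker so that later recovery attempts
// fail fast (see the "(resource=preexisting_corruption)" assertions in the tests below)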
store.markStoreCorrupted(e);
throw e;
} finally {
store.decRef();
}
@@ -2001,18 +2008,7 @@ private void doCheckIndex() throws IOException {
return;
}
logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
if ("fix".equals(checkIndexOnStartup)) {
if (logger.isDebugEnabled()) {
logger.debug("fixing index, writing new segments file ...");
}
store.exorciseIndex(status);
if (logger.isDebugEnabled()) {
logger.debug("index fixed, wrote new segments file \"{}\"", status.segmentsFileName);
}
} else {
// only throw a failure if we are not going to fix the index
throw new IllegalStateException("index check failure but can't fix it");
}
throw new IOException("index check failure");
}
}

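The two hunks above change checkIndex() to mark the store corrupted when the check throws an IOException, and doCheckIndex() to fail instead of attempting a repair. As an illustration only (not part of the commit), a small sketch of how such a marker could be spotted afterwards, assuming the marker is a regular file whose name starts with the "corrupted_" prefix exposed as Store.CORRUPTED below; the class and method names are hypothetical.

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;

public final class CorruptionMarkerCheck {
    // Returns true if the shard's index directory contains a "corrupted_*" marker,
    // i.e. a previous recovery already failed the startup check and the store was
    // marked corrupted via Store#markStoreCorrupted.
    public static boolean hasCorruptionMarker(Path shardIndexPath) throws IOException {
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(shardIndexPath, "corrupted_*")) {
            return stream.iterator().hasNext();
        }
    }
}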
15 changes: 2 additions & 13 deletions server/src/main/java/org/elasticsearch/index/store/Store.java
@@ -134,7 +134,8 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
static final int VERSION_STACK_TRACE = 1; // we write the stack trace too since 1.4.0
static final int VERSION_START = 0;
static final int VERSION = VERSION_WRITE_THROWABLE;
static final String CORRUPTED = "corrupted_";
// public is for test purposes
public static final String CORRUPTED = "corrupted_";
public static final Setting<TimeValue> INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING =
Setting.timeSetting("index.store.stats_refresh_interval", TimeValue.timeValueSeconds(10), Property.IndexScope);

@@ -360,18 +361,6 @@ public CheckIndex.Status checkIndex(PrintStream out) throws IOException {
}
}

/**
* Repairs the index using the previous returned status from {@link #checkIndex(PrintStream)}.
*/
public void exorciseIndex(CheckIndex.Status status) throws IOException {
metadataLock.writeLock().lock();
try (CheckIndex checkIndex = new CheckIndex(directory)) {
checkIndex.exorciseIndex(status);
} finally {
metadataLock.writeLock().unlock();
}
}

public StoreStats stats() throws IOException {
ensureOpen();
return new StoreStats(directory.estimateSize());
server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java
@@ -22,6 +22,7 @@
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
@@ -110,6 +111,7 @@
import org.elasticsearch.snapshots.SnapshotId;
import org.elasticsearch.snapshots.SnapshotInfo;
import org.elasticsearch.snapshots.SnapshotShardFailure;
import org.elasticsearch.test.CorruptionUtils;
import org.elasticsearch.test.DummyShardLock;
import org.elasticsearch.test.FieldMaskingReader;
import org.elasticsearch.test.VersionUtils;
@@ -118,7 +120,11 @@

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -1229,7 +1235,7 @@ public String[] listAll() throws IOException {
};

try (Store store = createStore(shardId, new IndexSettings(metaData, Settings.EMPTY), directory)) {
IndexShard shard = newShard(shardRouting, shardPath, metaData, store,
IndexShard shard = newShard(shardRouting, shardPath, metaData, i -> store,
null, new InternalEngineFactory(), () -> {
}, EMPTY_EVENT_LISTENER);
AtomicBoolean failureCallbackTriggered = new AtomicBoolean(false);
@@ -2569,6 +2575,143 @@ public void testReadSnapshotConcurrently() throws IOException, InterruptedExcept
closeShards(newShard);
}

public void testIndexCheckOnStartup() throws Exception {
final IndexShard indexShard = newStartedShard(true);

final long numDocs = between(10, 100);
for (long i = 0; i < numDocs; i++) {
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
}
indexShard.flush(new FlushRequest());
closeShards(indexShard);

final ShardPath shardPath = indexShard.shardPath();

final Path indexPath = corruptIndexFile(shardPath);

final AtomicInteger corruptedMarkerCount = new AtomicInteger();
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
corruptedMarkerCount.incrementAndGet();
}
return FileVisitResult.CONTINUE;
}
};
Files.walkFileTree(indexPath, corruptedVisitor);

assertThat("corruption marker should not be there", corruptedMarkerCount.get(), equalTo(0));

final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE
);
// start the shard and perform the index check on startup; this forces the shard to fail due to the corrupted index files
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
.settings(Settings.builder()
.put(indexShard.indexSettings.getSettings())
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("true", "checksum")))
.build();

IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
null, null, indexShard.engineFactory,
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);

final IndexShardRecoveryException indexShardRecoveryException =
expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true));
assertThat(indexShardRecoveryException.getMessage(), equalTo("failed recovery"));

// check that corrupt marker is there
Files.walkFileTree(indexPath, corruptedVisitor);
assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1));

try {
closeShards(corruptedShard);
} catch (RuntimeException e) {
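// the test directory wrappers may run their own CheckIndex when the store is closed;
// that check is expected to fail on the corrupted shard (assumption about the mock directory behaviour)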
assertThat(e.getMessage(), equalTo("CheckIndex failed"));
}
}

public void testShardDoesNotStartIfCorruptedMarkerIsPresent() throws Exception {
final IndexShard indexShard = newStartedShard(true);

final long numDocs = between(10, 100);
for (long i = 0; i < numDocs; i++) {
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
}
indexShard.flush(new FlushRequest());
closeShards(indexShard);

final ShardPath shardPath = indexShard.shardPath();

final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE
);
final IndexMetaData indexMetaData = indexShard.indexSettings().getIndexMetaData();

final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME);

// create corrupted marker
final String corruptionMessage = "fake ioexception";
try (Store store = createStore(indexShard.indexSettings(), shardPath)) {
store.markStoreCorrupted(new IOException(corruptionMessage));
}

// try to start the shard on the corrupted files
final IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
null, null, indexShard.engineFactory,
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);

final IndexShardRecoveryException exception1 = expectThrows(IndexShardRecoveryException.class,
() -> newStartedShard(p -> corruptedShard, true));
assertThat(exception1.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)"));
closeShards(corruptedShard);

final AtomicInteger corruptedMarkerCount = new AtomicInteger();
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
corruptedMarkerCount.incrementAndGet();
}
return FileVisitResult.CONTINUE;
}
};
Files.walkFileTree(indexPath, corruptedVisitor);
assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1));

// try to start the shard again on the corrupted files
final IndexShard corruptedShard2 = newShard(shardRouting, shardPath, indexMetaData,
null, null, indexShard.engineFactory,
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);

final IndexShardRecoveryException exception2 = expectThrows(IndexShardRecoveryException.class,
() -> newStartedShard(p -> corruptedShard2, true));
assertThat(exception2.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)"));
closeShards(corruptedShard2);

// check that corrupt marker is there
corruptedMarkerCount.set(0);
Files.walkFileTree(indexPath, corruptedVisitor);
assertThat("store still has a single corrupt marker", corruptedMarkerCount.get(), equalTo(1));
}

private Path corruptIndexFile(ShardPath shardPath) throws IOException {
final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME);
final Path[] filesToCorrupt =
Files.walk(indexPath)
.filter(p -> {
final String name = p.getFileName().toString();
return Files.isRegularFile(p)
&& name.startsWith("extra") == false // Skip files added by Lucene's ExtrasFS
&& IndexWriter.WRITE_LOCK_NAME.equals(name) == false
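// segments_N and *.si are left intact, presumably so the shard can still be opened
// and the corruption is caught by the startup check itself rather than earlier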
&& name.startsWith("segments_") == false && name.endsWith(".si") == false;
})
.toArray(Path[]::new);
CorruptionUtils.corruptFile(random(), filesToCorrupt);
return indexPath;
}

/**
* Simulates a scenario that happens when we are async fetching snapshot metadata from GatewayService
* and checking index concurrently. This should always be possible without any exception.
@@ -2592,7 +2735,7 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
.settings(Settings.builder()
.put(indexShard.indexSettings.getSettings())
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum", "fix")))
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum")))
.build();
final IndexShard newShard = newShard(shardRouting, indexShard.shardPath(), indexMetaData,
null, null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
@@ -2634,6 +2777,16 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
closeShards(newShard);
}

public void testCheckOnStartupDeprecatedValue() throws Exception {
final Settings settings = Settings.builder().put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), "fix").build();

final IndexShard newShard = newShard(true, settings);
closeShards(newShard);

assertWarnings("Setting [index.shard.check_on_startup] is set to deprecated value [fix], "
+ "which has no effect and will not be accepted in future");
}

class Result {
private final int localCheckpoint;
private final int maxSeqNo;
