Skip to content

Commit

Permalink
Dedup translog operations by reading in reverse (#27268)
Browse files Browse the repository at this point in the history
Currently, translog operations are read and processed one by one. This
may be a problem as stale operations in translogs may suddenly reappear
in recoveries. To make sure that stale operations won't be processed, we
read the translog files in a reverse order (eg. from the most recent
file to the oldest file) and only process an operation if its sequence
number was not seen before.

Relates to #10708
  • Loading branch information
dnhatn committed Nov 26, 2017
1 parent ff87944 commit 5f138e5
Show file tree
Hide file tree
Showing 7 changed files with 454 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@

package org.elasticsearch.index.translog;

import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongObjectHashMap;
import com.carrotsearch.hppc.LongSet;
import org.apache.lucene.util.FixedBitSet;
import org.elasticsearch.index.seqno.SequenceNumbers;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
Expand All @@ -30,32 +36,44 @@ final class MultiSnapshot implements Translog.Snapshot {

private final TranslogSnapshot[] translogs;
private final int totalOperations;
private int overriddenOperations;
private final Closeable onClose;
private int index;
private final SeqNoSet seenSeqNo;

/**
* Creates a new point in time snapshot of the given snapshots. Those snapshots are always iterated in-order.
*/
MultiSnapshot(TranslogSnapshot[] translogs, Closeable onClose) {
this.translogs = translogs;
totalOperations = Arrays.stream(translogs).mapToInt(TranslogSnapshot::totalOperations).sum();
this.totalOperations = Arrays.stream(translogs).mapToInt(TranslogSnapshot::totalOperations).sum();
this.overriddenOperations = 0;
this.onClose = onClose;
index = 0;
this.seenSeqNo = new SeqNoSet();
this.index = translogs.length - 1;
}


@Override
public int totalOperations() {
return totalOperations;
}

@Override
public int overriddenOperations() {
return overriddenOperations;
}

@Override
public Translog.Operation next() throws IOException {
for (; index < translogs.length; index++) {
for (; index >= 0; index--) {
final TranslogSnapshot current = translogs[index];
Translog.Operation op = current.next();
if (op != null) { // if we are null we move to the next snapshot
return op;
Translog.Operation op;
while ((op = current.next()) != null) {
if (op.seqNo() == SequenceNumbers.UNASSIGNED_SEQ_NO || seenSeqNo.getAndSet(op.seqNo()) == false) {
return op;
} else {
overriddenOperations++;
}
}
}
return null;
Expand All @@ -65,4 +83,76 @@ public Translog.Operation next() throws IOException {
public void close() throws IOException {
onClose.close();
}

/**
* A wrapper of {@link FixedBitSet} but allows to check if all bits are set in O(1).
*/
private static final class CountedBitSet {
private short onBits;
private final FixedBitSet bitset;

CountedBitSet(short numBits) {
assert numBits > 0;
this.onBits = 0;
this.bitset = new FixedBitSet(numBits);
}

boolean getAndSet(int index) {
assert index >= 0;
boolean wasOn = bitset.getAndSet(index);
if (wasOn == false) {
onBits++;
}
return wasOn;
}

boolean hasAllBitsOn() {
return onBits == bitset.length();
}
}

/**
* Sequence numbers from translog are likely to form contiguous ranges,
* thus collapsing a completed bitset into a single entry will reduce memory usage.
*/
static final class SeqNoSet {
static final short BIT_SET_SIZE = 1024;
private final LongSet completedSets = new LongHashSet();
private final LongObjectHashMap<CountedBitSet> ongoingSets = new LongObjectHashMap<>();

/**
* Marks this sequence number and returns <tt>true</tt> if it is seen before.
*/
boolean getAndSet(long value) {
assert value >= 0;
final long key = value / BIT_SET_SIZE;

if (completedSets.contains(key)) {
return true;
}

CountedBitSet bitset = ongoingSets.get(key);
if (bitset == null) {
bitset = new CountedBitSet(BIT_SET_SIZE);
ongoingSets.put(key, bitset);
}

final boolean wasOn = bitset.getAndSet(Math.toIntExact(value % BIT_SET_SIZE));
if (bitset.hasAllBitsOn()) {
ongoingSets.remove(key);
completedSets.add(key);
}
return wasOn;
}

// For testing
long completeSetsSize() {
return completedSets.size();
}

// For testing
long ongoingSetsSize() {
return ongoingSets.size();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -831,10 +831,19 @@ public int hashCode() {
public interface Snapshot extends Closeable {

/**
* The total number of operations in the translog.
* The total estimated number of operations in the snapshot.
*/
int totalOperations();

/**
* The number of operations have been overridden (eg. superseded) in the snapshot so far.
* If two operations have the same sequence number, the operation with a lower term will be overridden by the operation
* with a higher term. Unlike {@link #totalOperations()}, this value is updated each time after {@link #next()}) is called.
*/
default int overriddenOperations() {
return 0;
}

/**
* Returns the next operation in the snapshot or <code>null</code> if we reached the end.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.function.Supplier;
Expand Down Expand Up @@ -567,8 +568,9 @@ protected SendSnapshotResult sendSnapshot(final long startingSeqNo, final Transl
cancellableThreads.executeIO(sendBatch);
}

assert expectedTotalOps == skippedOps + totalSentOps
: "expected total [" + expectedTotalOps + "], skipped [" + skippedOps + "], total sent [" + totalSentOps + "]";
assert expectedTotalOps == snapshot.overriddenOperations() + skippedOps + totalSentOps
: String.format(Locale.ROOT, "expected total [%d], overridden [%d], skipped [%d], total sent [%d]",
expectedTotalOps, snapshot.overriddenOperations(), skippedOps, totalSentOps);

logger.trace("sent final batch of [{}][{}] (total: [{}]) translog operations", ops, new ByteSizeValue(size), expectedTotalOps);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,23 @@
import org.hamcrest.Matcher;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Future;

import static org.elasticsearch.index.translog.SnapshotMatchers.containsOperationsInAnyOrder;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.hamcrest.core.Is.is;

public class IndexLevelReplicationTests extends ESIndexLevelReplicationTestCase {

Expand Down Expand Up @@ -299,6 +306,68 @@ public void testRequestFailureReplication() throws Exception {
}
}

public void testSeqNoCollision() throws Exception {
try (ReplicationGroup shards = createGroup(2)) {
shards.startAll();
int initDocs = shards.indexDocs(randomInt(10));
List<IndexShard> replicas = shards.getReplicas();
IndexShard replica1 = replicas.get(0);
IndexShard replica2 = replicas.get(1);
shards.syncGlobalCheckpoint();

logger.info("--> Isolate replica1");
IndexRequest indexDoc1 = new IndexRequest(index.getName(), "type", "d1").source("{}", XContentType.JSON);
BulkShardRequest replicationRequest = indexOnPrimary(indexDoc1, shards.getPrimary());
indexOnReplica(replicationRequest, replica2);

final Translog.Operation op1;
final List<Translog.Operation> initOperations = new ArrayList<>(initDocs);
try (Translog.Snapshot snapshot = replica2.getTranslog().newSnapshot()) {
assertThat(snapshot.totalOperations(), equalTo(initDocs + 1));
for (int i = 0; i < initDocs; i++) {
Translog.Operation op = snapshot.next();
assertThat(op, is(notNullValue()));
initOperations.add(op);
}
op1 = snapshot.next();
assertThat(op1, notNullValue());
assertThat(snapshot.next(), nullValue());
assertThat(snapshot.overriddenOperations(), equalTo(0));
}
// Make sure that replica2 receives translog ops (eg. op2) from replica1 and overwrites its stale operation (op1).
logger.info("--> Promote replica1 as the primary");
shards.promoteReplicaToPrimary(replica1).get(); // wait until resync completed.
shards.index(new IndexRequest(index.getName(), "type", "d2").source("{}", XContentType.JSON));
final Translog.Operation op2;
try (Translog.Snapshot snapshot = replica2.getTranslog().newSnapshot()) {
assertThat(snapshot.totalOperations(), equalTo(initDocs + 2));
op2 = snapshot.next();
assertThat(op2.seqNo(), equalTo(op1.seqNo()));
assertThat(op2.primaryTerm(), greaterThan(op1.primaryTerm()));
assertThat("Remaining of snapshot should contain init operations", snapshot, containsOperationsInAnyOrder(initOperations));
assertThat(snapshot.overriddenOperations(), equalTo(1));
}

// Make sure that peer-recovery transfers all but non-overridden operations.
IndexShard replica3 = shards.addReplica();
logger.info("--> Promote replica2 as the primary");
shards.promoteReplicaToPrimary(replica2);
logger.info("--> Recover replica3 from replica2");
recoverReplica(replica3, replica2);
try (Translog.Snapshot snapshot = replica3.getTranslog().newSnapshot()) {
assertThat(snapshot.totalOperations(), equalTo(initDocs + 1));
assertThat(snapshot.next(), equalTo(op2));
assertThat("Remaining of snapshot should contain init operations", snapshot, containsOperationsInAnyOrder(initOperations));
assertThat("Peer-recovery should not send overridden operations", snapshot.overriddenOperations(), equalTo(0));
}
// TODO: We should assert the content of shards in the ReplicationGroup.
// Without rollback replicas(current implementation), we don't have the same content across shards:
// - replica1 has {doc1}
// - replica2 has {doc1, doc2}
// - replica3 can have either {doc2} only if operation-based recovery or {doc1, doc2} if file-based recovery
}
}

/** Throws <code>documentFailure</code> on every indexing operation */
static class ThrowingDocumentFailureEngineFactory implements EngineFactory {
final String documentFailureMessage;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.translog;

import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongSet;
import org.elasticsearch.common.Randomness;
import org.elasticsearch.test.ESTestCase;

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.LongStream;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.Matchers.lessThanOrEqualTo;

public class MultiSnapshotTests extends ESTestCase {

public void testTrackSeqNoSimpleRange() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final List<Long> values = LongStream.range(0, 1024).boxed().collect(Collectors.toList());
Randomness.shuffle(values);
for (int i = 0; i < 1023; i++) {
assertThat(bitSet.getAndSet(values.get(i)), equalTo(false));
assertThat(bitSet.ongoingSetsSize(), equalTo(1L));
assertThat(bitSet.completeSetsSize(), equalTo(0L));
}

assertThat(bitSet.getAndSet(values.get(1023)), equalTo(false));
assertThat(bitSet.ongoingSetsSize(), equalTo(0L));
assertThat(bitSet.completeSetsSize(), equalTo(1L));

assertThat(bitSet.getAndSet(between(0, 1023)), equalTo(true));
assertThat(bitSet.getAndSet(between(1024, Integer.MAX_VALUE)), equalTo(false));
}

public void testTrackSeqNoDenseRanges() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final LongSet normalSet = new LongHashSet();
IntStream.range(0, scaledRandomIntBetween(5_000, 10_000)).forEach(i -> {
long seq = between(0, 5000);
boolean existed = normalSet.add(seq) == false;
assertThat("SeqNoSet != Set" + seq, bitSet.getAndSet(seq), equalTo(existed));
assertThat(bitSet.ongoingSetsSize() + bitSet.completeSetsSize(), lessThanOrEqualTo(5L));
});
}

public void testTrackSeqNoSparseRanges() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final LongSet normalSet = new LongHashSet();
IntStream.range(0, scaledRandomIntBetween(5_000, 10_000)).forEach(i -> {
long seq = between(i * 10_000, i * 30_000);
boolean existed = normalSet.add(seq) == false;
assertThat("SeqNoSet != Set", bitSet.getAndSet(seq), equalTo(existed));
});
}

public void testTrackSeqNoMimicTranslogRanges() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final LongSet normalSet = new LongHashSet();
long currentSeq = between(10_000_000, 1_000_000_000);
final int iterations = scaledRandomIntBetween(100, 2000);
assertThat(bitSet.completeSetsSize(), equalTo(0L));
assertThat(bitSet.ongoingSetsSize(), equalTo(0L));
long totalDocs = 0;
for (long i = 0; i < iterations; i++) {
int batchSize = between(1, 1500);
totalDocs += batchSize;
currentSeq -= batchSize;
List<Long> batch = LongStream.range(currentSeq, currentSeq + batchSize)
.boxed()
.collect(Collectors.toList());
Randomness.shuffle(batch);
batch.forEach(seq -> {
boolean existed = normalSet.add(seq) == false;
assertThat("SeqNoSet != Set", bitSet.getAndSet(seq), equalTo(existed));
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(4L));
});
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(2L));
}
assertThat(bitSet.completeSetsSize(), lessThanOrEqualTo(totalDocs / 1024));
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(2L));
}
}
Loading

0 comments on commit 5f138e5

Please sign in to comment.