Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dedup translog operations by reading in reverse #27268

Merged
merged 19 commits into from
Nov 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@

package org.elasticsearch.index.translog;

import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongObjectHashMap;
import com.carrotsearch.hppc.LongSet;
import org.apache.lucene.util.FixedBitSet;
import org.elasticsearch.index.seqno.SequenceNumbers;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
Expand All @@ -30,32 +36,44 @@ final class MultiSnapshot implements Translog.Snapshot {

private final TranslogSnapshot[] translogs;
private final int totalOperations;
private int overriddenOperations;
private final Closeable onClose;
private int index;
private final SeqNoSet seenSeqNo;

/**
* Creates a new point in time snapshot of the given snapshots. Those snapshots are always iterated in-order.
*/
MultiSnapshot(TranslogSnapshot[] translogs, Closeable onClose) {
this.translogs = translogs;
totalOperations = Arrays.stream(translogs).mapToInt(TranslogSnapshot::totalOperations).sum();
this.totalOperations = Arrays.stream(translogs).mapToInt(TranslogSnapshot::totalOperations).sum();
this.overriddenOperations = 0;
this.onClose = onClose;
index = 0;
this.seenSeqNo = new SeqNoSet();
this.index = translogs.length - 1;
}


@Override
public int totalOperations() {
return totalOperations;
}

@Override
public int overriddenOperations() {
return overriddenOperations;
}

@Override
public Translog.Operation next() throws IOException {
for (; index < translogs.length; index++) {
for (; index >= 0; index--) {
final TranslogSnapshot current = translogs[index];
Translog.Operation op = current.next();
if (op != null) { // if we are null we move to the next snapshot
return op;
Translog.Operation op;
while ((op = current.next()) != null) {
if (op.seqNo() == SequenceNumbers.UNASSIGNED_SEQ_NO || seenSeqNo.getAndSet(op.seqNo()) == false) {
return op;
} else {
overriddenOperations++;
}
}
}
return null;
Expand All @@ -65,4 +83,76 @@ public Translog.Operation next() throws IOException {
public void close() throws IOException {
onClose.close();
}

/**
* A wrapper of {@link FixedBitSet} but allows to check if all bits are set in O(1).
*/
private static final class CountedBitSet {
private short onBits;
private final FixedBitSet bitset;

CountedBitSet(short numBits) {
assert numBits > 0;
this.onBits = 0;
this.bitset = new FixedBitSet(numBits);
}

boolean getAndSet(int index) {
assert index >= 0;
boolean wasOn = bitset.getAndSet(index);
if (wasOn == false) {
onBits++;
}
return wasOn;
}

boolean hasAllBitsOn() {
return onBits == bitset.length();
}
}

/**
* Sequence numbers from translog are likely to form contiguous ranges,
* thus collapsing a completed bitset into a single entry will reduce memory usage.
*/
static final class SeqNoSet {
static final short BIT_SET_SIZE = 1024;
private final LongSet completedSets = new LongHashSet();
private final LongObjectHashMap<CountedBitSet> ongoingSets = new LongObjectHashMap<>();

/**
* Marks this sequence number and returns <tt>true</tt> if it is seen before.
*/
boolean getAndSet(long value) {
assert value >= 0;
final long key = value / BIT_SET_SIZE;

if (completedSets.contains(key)) {
return true;
}

CountedBitSet bitset = ongoingSets.get(key);
if (bitset == null) {
bitset = new CountedBitSet(BIT_SET_SIZE);
ongoingSets.put(key, bitset);
}

final boolean wasOn = bitset.getAndSet(Math.toIntExact(value % BIT_SET_SIZE));
if (bitset.hasAllBitsOn()) {
ongoingSets.remove(key);
completedSets.add(key);
}
return wasOn;
}

// For testing
long completeSetsSize() {
return completedSets.size();
}

// For testing
long ongoingSetsSize() {
return ongoingSets.size();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -831,10 +831,19 @@ public int hashCode() {
public interface Snapshot extends Closeable {

/**
* The total number of operations in the translog.
* The total estimated number of operations in the snapshot.
*/
int totalOperations();

/**
* The number of operations have been overridden (eg. superseded) in the snapshot so far.
* If two operations have the same sequence number, the operation with a lower term will be overridden by the operation
* with a higher term. Unlike {@link #totalOperations()}, this value is updated each time after {@link #next()}) is called.
*/
default int overriddenOperations() {
return 0;
}

/**
* Returns the next operation in the snapshot or <code>null</code> if we reached the end.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.function.Supplier;
Expand Down Expand Up @@ -567,8 +568,9 @@ protected SendSnapshotResult sendSnapshot(final long startingSeqNo, final Transl
cancellableThreads.executeIO(sendBatch);
}

assert expectedTotalOps == skippedOps + totalSentOps
: "expected total [" + expectedTotalOps + "], skipped [" + skippedOps + "], total sent [" + totalSentOps + "]";
assert expectedTotalOps == snapshot.overriddenOperations() + skippedOps + totalSentOps
: String.format(Locale.ROOT, "expected total [%d], overridden [%d], skipped [%d], total sent [%d]",
expectedTotalOps, snapshot.overriddenOperations(), skippedOps, totalSentOps);

logger.trace("sent final batch of [{}][{}] (total: [{}]) translog operations", ops, new ByteSizeValue(size), expectedTotalOps);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,23 @@
import org.hamcrest.Matcher;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Future;

import static org.elasticsearch.index.translog.SnapshotMatchers.containsOperationsInAnyOrder;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.hamcrest.core.Is.is;

public class IndexLevelReplicationTests extends ESIndexLevelReplicationTestCase {

Expand Down Expand Up @@ -299,6 +306,68 @@ public void testRequestFailureReplication() throws Exception {
}
}

public void testSeqNoCollision() throws Exception {
try (ReplicationGroup shards = createGroup(2)) {
shards.startAll();
int initDocs = shards.indexDocs(randomInt(10));
List<IndexShard> replicas = shards.getReplicas();
IndexShard replica1 = replicas.get(0);
IndexShard replica2 = replicas.get(1);
shards.syncGlobalCheckpoint();

logger.info("--> Isolate replica1");
IndexRequest indexDoc1 = new IndexRequest(index.getName(), "type", "d1").source("{}", XContentType.JSON);
BulkShardRequest replicationRequest = indexOnPrimary(indexDoc1, shards.getPrimary());
indexOnReplica(replicationRequest, replica2);

final Translog.Operation op1;
final List<Translog.Operation> initOperations = new ArrayList<>(initDocs);
try (Translog.Snapshot snapshot = replica2.getTranslog().newSnapshot()) {
assertThat(snapshot.totalOperations(), equalTo(initDocs + 1));
for (int i = 0; i < initDocs; i++) {
Translog.Operation op = snapshot.next();
assertThat(op, is(notNullValue()));
initOperations.add(op);
}
op1 = snapshot.next();
assertThat(op1, notNullValue());
assertThat(snapshot.next(), nullValue());
assertThat(snapshot.overriddenOperations(), equalTo(0));
}
// Make sure that replica2 receives translog ops (eg. op2) from replica1 and overwrites its stale operation (op1).
logger.info("--> Promote replica1 as the primary");
shards.promoteReplicaToPrimary(replica1).get(); // wait until resync completed.
shards.index(new IndexRequest(index.getName(), "type", "d2").source("{}", XContentType.JSON));
final Translog.Operation op2;
try (Translog.Snapshot snapshot = replica2.getTranslog().newSnapshot()) {
assertThat(snapshot.totalOperations(), equalTo(initDocs + 2));
op2 = snapshot.next();
assertThat(op2.seqNo(), equalTo(op1.seqNo()));
assertThat(op2.primaryTerm(), greaterThan(op1.primaryTerm()));
assertThat("Remaining of snapshot should contain init operations", snapshot, containsOperationsInAnyOrder(initOperations));
assertThat(snapshot.overriddenOperations(), equalTo(1));
}

// Make sure that peer-recovery transfers all but non-overridden operations.
IndexShard replica3 = shards.addReplica();
logger.info("--> Promote replica2 as the primary");
shards.promoteReplicaToPrimary(replica2);
logger.info("--> Recover replica3 from replica2");
recoverReplica(replica3, replica2);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also assert that the contents of the shards are the same - see ReplicationGroup#assertAllEqual

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, we don't rollback replicas; replica2 has doc1 and doc2 while replica1 (eg. the promoted primary) does not have doc1.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, but you can remove replica1 with a comment about it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bleskes, This becomes more interesting to me :).

  • Replica1 has {doc1}
  • Replica2 has {doc1, doc2}
  • Replica3 can have either {doc2} only if operation-based recovery from replica2, or {doc1, doc2} if file-based recovery from replica2.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Funny you say this as I have a similar thought this morning while thinking about another issue. That other issue caused me to add a todo on the #10708 ticket to deal with it (see replica recover with rollback). I'm fine with leaving the test as is for now (with a comment explaining the situation and a todo to fix this when possible)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added a TODO for this.

try (Translog.Snapshot snapshot = replica3.getTranslog().newSnapshot()) {
assertThat(snapshot.totalOperations(), equalTo(initDocs + 1));
assertThat(snapshot.next(), equalTo(op2));
assertThat("Remaining of snapshot should contain init operations", snapshot, containsOperationsInAnyOrder(initOperations));
assertThat("Peer-recovery should not send overridden operations", snapshot.overriddenOperations(), equalTo(0));
}
// TODO: We should assert the content of shards in the ReplicationGroup.
// Without rollback replicas(current implementation), we don't have the same content across shards:
// - replica1 has {doc1}
// - replica2 has {doc1, doc2}
// - replica3 can have either {doc2} only if operation-based recovery or {doc1, doc2} if file-based recovery
}
}

/** Throws <code>documentFailure</code> on every indexing operation */
static class ThrowingDocumentFailureEngineFactory implements EngineFactory {
final String documentFailureMessage;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.translog;

import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongSet;
import org.elasticsearch.common.Randomness;
import org.elasticsearch.test.ESTestCase;

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.LongStream;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.Matchers.lessThanOrEqualTo;

public class MultiSnapshotTests extends ESTestCase {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd still like to see a deterministic (i.e. no randomness) test that exercises the corners of SeqNumSet.

Copy link
Member Author

@dnhatn dnhatn Nov 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added 7d3482b


public void testTrackSeqNoSimpleRange() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final List<Long> values = LongStream.range(0, 1024).boxed().collect(Collectors.toList());
Randomness.shuffle(values);
for (int i = 0; i < 1023; i++) {
assertThat(bitSet.getAndSet(values.get(i)), equalTo(false));
assertThat(bitSet.ongoingSetsSize(), equalTo(1L));
assertThat(bitSet.completeSetsSize(), equalTo(0L));
}

assertThat(bitSet.getAndSet(values.get(1023)), equalTo(false));
assertThat(bitSet.ongoingSetsSize(), equalTo(0L));
assertThat(bitSet.completeSetsSize(), equalTo(1L));

assertThat(bitSet.getAndSet(between(0, 1023)), equalTo(true));
assertThat(bitSet.getAndSet(between(1024, Integer.MAX_VALUE)), equalTo(false));
}

public void testTrackSeqNoDenseRanges() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final LongSet normalSet = new LongHashSet();
IntStream.range(0, scaledRandomIntBetween(5_000, 10_000)).forEach(i -> {
long seq = between(0, 5000);
boolean existed = normalSet.add(seq) == false;
assertThat("SeqNoSet != Set" + seq, bitSet.getAndSet(seq), equalTo(existed));
assertThat(bitSet.ongoingSetsSize() + bitSet.completeSetsSize(), lessThanOrEqualTo(5L));
});
}

public void testTrackSeqNoSparseRanges() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final LongSet normalSet = new LongHashSet();
IntStream.range(0, scaledRandomIntBetween(5_000, 10_000)).forEach(i -> {
long seq = between(i * 10_000, i * 30_000);
boolean existed = normalSet.add(seq) == false;
assertThat("SeqNoSet != Set", bitSet.getAndSet(seq), equalTo(existed));
});
}

public void testTrackSeqNoMimicTranslogRanges() throws Exception {
final MultiSnapshot.SeqNoSet bitSet = new MultiSnapshot.SeqNoSet();
final LongSet normalSet = new LongHashSet();
long currentSeq = between(10_000_000, 1_000_000_000);
final int iterations = scaledRandomIntBetween(100, 2000);
assertThat(bitSet.completeSetsSize(), equalTo(0L));
assertThat(bitSet.ongoingSetsSize(), equalTo(0L));
long totalDocs = 0;
for (long i = 0; i < iterations; i++) {
int batchSize = between(1, 1500);
totalDocs += batchSize;
currentSeq -= batchSize;
List<Long> batch = LongStream.range(currentSeq, currentSeq + batchSize)
.boxed()
.collect(Collectors.toList());
Randomness.shuffle(batch);
batch.forEach(seq -> {
boolean existed = normalSet.add(seq) == false;
assertThat("SeqNoSet != Set", bitSet.getAndSet(seq), equalTo(existed));
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(4L));
});
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(2L));
}
assertThat(bitSet.completeSetsSize(), lessThanOrEqualTo(totalDocs / 1024));
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(2L));
}
}
Loading