-
Notifications
You must be signed in to change notification settings - Fork 24.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Dedup translog operations by reading in reverse #27268
Changes from 10 commits
d6e64ea
d006988
5935009
ea11a2a
ebe92b6
def1d39
5cc33bb
aa21d87
9bdb507
b3e0b6d
3bd5cd5
7d3482b
a3994f8
48b0ff9
97aa4d7
c960f37
a3ff8c8
f79e3ef
29aa2f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,12 @@ | |
|
||
package org.elasticsearch.index.translog; | ||
|
||
import com.carrotsearch.hppc.LongHashSet; | ||
import com.carrotsearch.hppc.LongObjectHashMap; | ||
import com.carrotsearch.hppc.LongSet; | ||
import org.apache.lucene.util.FixedBitSet; | ||
import org.elasticsearch.index.seqno.SequenceNumbers; | ||
|
||
import java.io.Closeable; | ||
import java.io.IOException; | ||
import java.util.Arrays; | ||
|
@@ -30,32 +36,44 @@ final class MultiSnapshot implements Translog.Snapshot { | |
|
||
private final TranslogSnapshot[] translogs; | ||
private final int totalOperations; | ||
private int overriddenOperations; | ||
private final Closeable onClose; | ||
private int index; | ||
private final SeqNumSet seenSeqNo; | ||
|
||
/** | ||
* Creates a new point in time snapshot of the given snapshots. Those snapshots are always iterated in-order. | ||
*/ | ||
MultiSnapshot(TranslogSnapshot[] translogs, Closeable onClose) { | ||
this.translogs = translogs; | ||
totalOperations = Arrays.stream(translogs).mapToInt(TranslogSnapshot::totalOperations).sum(); | ||
this.totalOperations = Arrays.stream(translogs).mapToInt(TranslogSnapshot::totalOperations).sum(); | ||
this.overriddenOperations = 0; | ||
this.onClose = onClose; | ||
index = 0; | ||
this.seenSeqNo = new SeqNumSet(); | ||
this.index = translogs.length - 1; | ||
} | ||
|
||
|
||
@Override | ||
public int totalOperations() { | ||
return totalOperations; | ||
} | ||
|
||
@Override | ||
public int overriddenOperations() { | ||
return overriddenOperations; | ||
} | ||
|
||
@Override | ||
public Translog.Operation next() throws IOException { | ||
for (; index < translogs.length; index++) { | ||
for (; index >= 0; index--) { | ||
final TranslogSnapshot current = translogs[index]; | ||
Translog.Operation op = current.next(); | ||
if (op != null) { // if we are null we move to the next snapshot | ||
return op; | ||
Translog.Operation op; | ||
while ((op = current.next()) != null) { | ||
if (op.seqNo() == SequenceNumbers.UNASSIGNED_SEQ_NO || seenSeqNo.getAndSet(op.seqNo()) == false) { | ||
return op; | ||
} else { | ||
overriddenOperations++; | ||
} | ||
} | ||
} | ||
return null; | ||
|
@@ -65,4 +83,76 @@ public Translog.Operation next() throws IOException { | |
public void close() throws IOException { | ||
onClose.close(); | ||
} | ||
|
||
/** | ||
* A wrapper of {@link FixedBitSet} but allows to check if all bits are set in O(1). | ||
*/ | ||
private static final class CountedBitSet { | ||
private short onBits; | ||
private final FixedBitSet bitset; | ||
|
||
CountedBitSet(short numBits) { | ||
assert numBits > 0; | ||
this.onBits = 0; | ||
this.bitset = new FixedBitSet(numBits); | ||
} | ||
|
||
boolean getAndSet(int index) { | ||
assert index >= 0; | ||
boolean wasOn = bitset.getAndSet(index); | ||
if (wasOn == false) { | ||
onBits++; | ||
} | ||
return wasOn; | ||
} | ||
|
||
boolean hasAllBitsOn() { | ||
return onBits == bitset.length(); | ||
} | ||
} | ||
|
||
/** | ||
* Sequence numbers from translog are likely to form contiguous ranges, | ||
* thus collapsing a completed bitset into a single entry will reduce memory usage. | ||
*/ | ||
static final class SeqNumSet { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we call this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This class is used only here; a concise name in a private context is better. Anyway, I renamed it to |
||
static final short BIT_SET_SIZE = 1024; | ||
private final LongSet completedSets = new LongHashSet(); | ||
private final LongObjectHashMap<CountedBitSet> ongoingSets = new LongObjectHashMap<>(); | ||
|
||
/** | ||
* Marks this sequence number and returns <tt>true</tt> if it is seen before. | ||
*/ | ||
boolean getAndSet(long value) { | ||
assert value >= 0; | ||
final long key = value / BIT_SET_SIZE; | ||
|
||
if (completedSets.contains(key)) { | ||
return true; | ||
} | ||
|
||
CountedBitSet bitset = ongoingSets.get(key); | ||
if (bitset == null) { | ||
bitset = new CountedBitSet(BIT_SET_SIZE); | ||
ongoingSets.put(key, bitset); | ||
} | ||
|
||
final boolean wasOn = bitset.getAndSet(Math.toIntExact(value % BIT_SET_SIZE)); | ||
if (bitset.hasAllBitsOn()) { | ||
ongoingSets.remove(key); | ||
completedSets.add(key); | ||
} | ||
return wasOn; | ||
} | ||
|
||
// For testing | ||
long completeSetsSize() { | ||
return completedSets.size(); | ||
} | ||
|
||
// For testing | ||
long ongoingSetsSize() { | ||
return ongoingSets.size(); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -832,10 +832,19 @@ public int hashCode() { | |
public interface Snapshot extends Closeable { | ||
|
||
/** | ||
* The total number of operations in the translog. | ||
* The total estimated number of operations in the snapshot. | ||
*/ | ||
int totalOperations(); | ||
|
||
/** | ||
* The number of operations has been overridden (eg. superseded) in the snapshot so far. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. number of operations that have been There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
* If two operations have the same sequence number, the operation with a lower term will be overridden by the operation | ||
* with a higher term. Unlike {@link #totalOperations()}, this value is updated each time after {@link #next()}) is called. | ||
*/ | ||
default int overriddenOperations() { | ||
return 0; | ||
} | ||
|
||
/** | ||
* Returns the next operation in the snapshot or <code>null</code> if we reached the end. | ||
*/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,16 +46,23 @@ | |
import org.hamcrest.Matcher; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.concurrent.CountDownLatch; | ||
import java.util.concurrent.Future; | ||
|
||
import static org.elasticsearch.index.translog.SnapshotMatchers.containsOperationsInAnyOrder; | ||
import static org.hamcrest.Matchers.anyOf; | ||
import static org.hamcrest.Matchers.containsString; | ||
import static org.hamcrest.Matchers.equalTo; | ||
import static org.hamcrest.Matchers.greaterThan; | ||
import static org.hamcrest.Matchers.greaterThanOrEqualTo; | ||
import static org.hamcrest.Matchers.instanceOf; | ||
import static org.hamcrest.Matchers.notNullValue; | ||
import static org.hamcrest.Matchers.nullValue; | ||
import static org.hamcrest.core.Is.is; | ||
|
||
public class IndexLevelReplicationTests extends ESIndexLevelReplicationTestCase { | ||
|
||
|
@@ -299,6 +306,65 @@ public void testRequestFailureReplication() throws Exception { | |
} | ||
} | ||
|
||
public void testTranslogDedupOperations() throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we call this testSeqNoCollision? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
try (ReplicationGroup shards = createGroup(2)) { | ||
shards.startAll(); | ||
int initDocs = shards.indexDocs(randomInt(10)); | ||
List<IndexShard> replicas = shards.getReplicas(); | ||
IndexShard replica1 = replicas.get(0); | ||
IndexShard replica2 = replicas.get(1); | ||
|
||
logger.info("--> Isolate replica1"); | ||
IndexRequest indexDoc1 = new IndexRequest(index.getName(), "type", "d1").source("{}", XContentType.JSON); | ||
BulkShardRequest replicationRequest = indexOnPrimary(indexDoc1, shards.getPrimary()); | ||
for (int i = 1; i < replicas.size(); i++) { | ||
indexOnReplica(replicationRequest, replicas.get(i)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you just use replica2? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
} | ||
|
||
final Translog.Operation op1; | ||
final List<Translog.Operation> initOperations = new ArrayList<>(initDocs); | ||
try (Translog.Snapshot snapshot = replica2.getTranslog().newSnapshot()) { | ||
assertThat(snapshot.totalOperations(), equalTo(initDocs + 1)); | ||
for (int i = 0; i < initDocs; i++) { | ||
Translog.Operation op = snapshot.next(); | ||
assertThat(op, is(notNullValue())); | ||
initOperations.add(op); | ||
} | ||
op1 = snapshot.next(); | ||
assertThat(op1, notNullValue()); | ||
assertThat(snapshot.next(), nullValue()); | ||
assertThat(snapshot.overriddenOperations(), equalTo(0)); | ||
} | ||
|
||
// Make sure that replica2 receives translog ops (eg. op2) from replica1 and overwrites its stale operation (op1). | ||
logger.info("--> Promote replica1 as the primary"); | ||
shards.promoteReplicaToPrimary(replica1); | ||
shards.index(new IndexRequest(index.getName(), "type", "d2").source("{}", XContentType.JSON)); | ||
final Translog.Operation op2; | ||
try (Translog.Snapshot snapshot = replica2.getTranslog().newSnapshot()) { | ||
assertThat(snapshot.totalOperations(), greaterThanOrEqualTo(initDocs + 2)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why can't we test for hard equality? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
op2 = snapshot.next(); | ||
assertThat(op2.seqNo(), equalTo(op1.seqNo())); | ||
assertThat(op2.primaryTerm(), greaterThan(op1.primaryTerm())); | ||
assertThat("Remaining of snapshot should contain init operations", snapshot, containsOperationsInAnyOrder(initOperations)); | ||
assertThat(snapshot.overriddenOperations(), greaterThanOrEqualTo(1)); | ||
} | ||
|
||
// Make sure that peer-recovery transfers all but non-overridden operations. | ||
IndexShard replica3 = shards.addReplica(); | ||
logger.info("--> Promote replica2 as the primary"); | ||
shards.promoteReplicaToPrimary(replica2); | ||
logger.info("--> Recover replica3 from replica2"); | ||
recoverReplica(replica3, replica2); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you also assert that the contents of the shards are the same - see ReplicationGroup#assertAllEqual There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently, we don't rollback replicas; replica2 has doc1 and doc2 while replica1 (eg. the promoted primary) does not have doc1. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure, but you can remove replica1 with a comment about it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @bleskes, This becomes more interesting to me :).
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Funny you say this as I have a similar thought this morning while thinking about another issue. That other issue caused me to add a todo on the #10708 ticket to deal with it (see replica recover with rollback). I'm fine with leaving the test as is for now (with a comment explaining the situation and a todo to fix this when possible) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have added a TODO for this. |
||
try (Translog.Snapshot snapshot = replica3.getTranslog().newSnapshot()) { | ||
assertThat(snapshot.totalOperations(), equalTo(initDocs + 1)); | ||
assertThat(snapshot.next(), equalTo(op2)); | ||
assertThat("Remaining of snapshot should contain init operations", snapshot, containsOperationsInAnyOrder(initOperations)); | ||
assertThat("Peer-recovery should not send overridden operations", snapshot.overriddenOperations(), equalTo(0)); | ||
} | ||
} | ||
} | ||
|
||
/** Throws <code>documentFailure</code> on every indexing operation */ | ||
static class ThrowingDocumentFailureEngineFactory implements EngineFactory { | ||
final String documentFailureMessage; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.index.translog; | ||
|
||
import com.carrotsearch.hppc.LongHashSet; | ||
import com.carrotsearch.hppc.LongSet; | ||
import org.elasticsearch.common.Randomness; | ||
import org.elasticsearch.test.ESTestCase; | ||
|
||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
import java.util.stream.LongStream; | ||
|
||
import static org.hamcrest.CoreMatchers.equalTo; | ||
import static org.hamcrest.Matchers.lessThanOrEqualTo; | ||
|
||
public class MultiSnapshotTests extends ESTestCase { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd still like to see a deterministic (i.e. no randomness) test that exercises the corners of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added 7d3482b |
||
public void testTrackSeqNumDenseRanges() throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does this test add compared to |
||
final MultiSnapshot.SeqNumSet bitSet = new MultiSnapshot.SeqNumSet(); | ||
final LongSet normalSet = new LongHashSet(); | ||
IntStream.range(0, scaledRandomIntBetween(5_000, 10_000)).forEach(i -> { | ||
long seq = between(0, 5000); | ||
boolean existed = normalSet.add(seq) == false; | ||
assertThat("SeqNumSet != Set" + seq, bitSet.getAndSet(seq), equalTo(existed)); | ||
assertThat(bitSet.ongoingSetsSize() + bitSet.completeSetsSize(), lessThanOrEqualTo(5L)); | ||
}); | ||
} | ||
|
||
public void testTrackSeqNumSparseRanges() throws Exception { | ||
final MultiSnapshot.SeqNumSet bitSet = new MultiSnapshot.SeqNumSet(); | ||
final LongSet normalSet = new LongHashSet(); | ||
IntStream.range(0, scaledRandomIntBetween(5_000, 10_000)).forEach(i -> { | ||
long seq = between(i * 10_000, i * 30_000); | ||
boolean existed = normalSet.add(seq) == false; | ||
assertThat("SeqNumSet != Set", bitSet.getAndSet(seq), equalTo(existed)); | ||
}); | ||
} | ||
|
||
public void testSequenceNumMimicTranslog() throws Exception { | ||
final MultiSnapshot.SeqNumSet bitSet = new MultiSnapshot.SeqNumSet(); | ||
final LongSet normalSet = new LongHashSet(); | ||
long currentSeq = between(10_000_000, 1_000_000_000); | ||
final int iterations = scaledRandomIntBetween(100, 2000); | ||
assertThat(bitSet.completeSetsSize(), equalTo(0L)); | ||
assertThat(bitSet.ongoingSetsSize(), equalTo(0L)); | ||
long totalDocs = 0; | ||
for (long i = 0; i < iterations; i++) { | ||
int batchSize = between(1, 1500); | ||
totalDocs += batchSize; | ||
currentSeq -= batchSize; | ||
List<Long> batch = LongStream.range(currentSeq, currentSeq + batchSize) | ||
.boxed() | ||
.collect(Collectors.toList()); | ||
Randomness.shuffle(batch); | ||
batch.forEach(seq -> { | ||
boolean existed = normalSet.add(seq) == false; | ||
assertThat("SeqNumSet != Set", bitSet.getAndSet(seq), equalTo(existed)); | ||
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(4L)); | ||
}); | ||
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(2L)); | ||
} | ||
assertThat(bitSet.completeSetsSize(), lessThanOrEqualTo(totalDocs / 1024)); | ||
assertThat(bitSet.ongoingSetsSize(), lessThanOrEqualTo(2L)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we call this
SequenceNumberSet
instead? Also, this idea feels more widely applicable. Can it be used elsewhere?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be used by
MultiSnapshot
only.