[CCR] Read changes from Lucene instead of translog #30120

Merged · May 9, 2018 · 24 commits (view shows changes from 12 commits)
server/src/main/java/org/elasticsearch/index/engine/Engine.java
@@ -58,6 +58,7 @@
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ReleasableLock;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.Mapping;
import org.elasticsearch.index.mapper.ParseContext.Document;
import org.elasticsearch.index.mapper.ParsedDocument;
@@ -609,6 +610,12 @@ public Translog.Location getTranslogLastWriteLocation() {
return getTranslog().getLastWriteLocation();
}

/**
* Creates a new "translog" snapshot from Lucene for reading operations whose seqno is in the requested seqno range.
*/
public abstract Translog.Snapshot newLuceneChangesSnapshot(String source, MapperService mapperService,
long minSeqNo, long maxSeqNo, boolean requiredFullRange) throws IOException;

protected final void ensureOpen(Exception suppressed) {
if (isClosed.get()) {
AlreadyClosedException ace = new AlreadyClosedException(shardId + " engine is closed", failedEngine.get());
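For context on the new API, this is how a caller might consume the returned snapshot: call next() until it returns null, and close the snapshot afterwards. A minimal sketch, not code from this PR - the engine handle, seqno bounds, and the process() handler are assumptions of the example:

// Hedged sketch of consuming a Lucene-backed "translog" snapshot.
// `engine`, `mapperService`, `fromSeqNo`, `toSeqNo`, and `process` are assumed
// to exist in the calling context; they are not defined by this change.
try (Translog.Snapshot snapshot =
         engine.newLuceneChangesSnapshot("ccr", mapperService, fromSeqNo, toSeqNo, true)) {
    Translog.Operation op;
    while ((op = snapshot.next()) != null) {
        process(op); // hypothetical handler for each reconstructed index/delete/no-op
    }
}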
server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java
@@ -68,6 +68,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.mapper.SeqNoFieldMapper;
@@ -148,6 +149,7 @@ public class InternalEngine extends Engine {
private final CounterMetric numDocUpdates = new CounterMetric();
private final NumericDocValuesField softDeleteField = Lucene.newSoftDeleteField();
private final boolean softDeleteEnabled;
private final LastRefreshedCheckpointListener lastRefreshedCheckpointListener;

/**
* How many bytes we are currently moving to disk, via either IndexWriter.flush or refresh. IndexingMemoryController polls this
@@ -224,6 +226,8 @@ public InternalEngine(EngineConfig engineConfig) {
for (ReferenceManager.RefreshListener listener: engineConfig.getInternalRefreshListener()) {
this.internalSearcherManager.addListener(listener);
}
this.lastRefreshedCheckpointListener = new LastRefreshedCheckpointListener(localCheckpointTracker.getCheckpoint());
this.internalSearcherManager.addListener(lastRefreshedCheckpointListener);
success = true;
} finally {
if (success == false) {
@@ -2342,6 +2346,18 @@ long getNumDocUpdates() {
return numDocUpdates.count();
}

public Translog.Snapshot newLuceneChangesSnapshot(String source, MapperService mapperService,
long minSeqNo, long maxSeqNo, boolean requiredFullRange) throws IOException {
// TODO: Should we defer the refresh until we really need it?
ensureOpen();
if (lastRefreshedCheckpoint() < maxSeqNo) {
refresh(source, SearcherScope.INTERNAL);
}
refresh(source, SearcherScope.INTERNAL);
[Contributor] This seems like a mistake?

return new LuceneChangesSnapshot(() -> acquireSearcher(source, SearcherScope.INTERNAL), mapperService,
minSeqNo, maxSeqNo, requiredFullRange);
}

@Override
public boolean isRecovering() {
return pendingTranslogRecovery.get();
Expand Down Expand Up @@ -2388,4 +2404,28 @@ public long softUpdateDocuments(Term term, Iterable<? extends Iterable<? extends
return super.softUpdateDocuments(term, docs, softDeletes);
}
}

/**
 * Returns the last local checkpoint value that has been refreshed internally.
 */

[Contributor] nit: the last local checkpoint
final long lastRefreshedCheckpoint() {
return lastRefreshedCheckpointListener.refreshedCheckpoint.get();
}
private final class LastRefreshedCheckpointListener implements ReferenceManager.RefreshListener {
final AtomicLong refreshedCheckpoint;
private long pendingCheckpoint;
LastRefreshedCheckpointListener(long initialLocalCheckpoint) {
this.refreshedCheckpoint = new AtomicLong(initialLocalCheckpoint);
}
@Override
public void beforeRefresh() {
pendingCheckpoint = localCheckpointTracker.getCheckpoint(); // All changes up to this point should be visible after refresh
}
@Override
public void afterRefresh(boolean didRefresh) {
if (didRefresh) {
refreshedCheckpoint.getAndUpdate(prev -> Math.max(prev, pendingCheckpoint));
[Contributor] I think this is unsafe? You may capture things that didn't make it into the reader.

[Member] We only mark a seq# as completed after adding its op to Lucene, and RefreshListeners are notified serially under lock. I think it's safe, but we need to discuss it to make sure that we won't add something unsafe here.

[Contributor] Are you sure this blocks ongoing indexing? I would definitely double-check with @s1monw that we want to rely on these semantics (if that is the case). IMO we should keep it simple and just pre-capture the local checkpoint.

[Member] @bleskes Yeah, I think I made it too complicated. I replaced getAndUpdate with set.

}
}
}
}
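Following the thread above, a simplified listener that pre-captures the local checkpoint and publishes it with a plain set (instead of getAndUpdate) might look like this - a sketch of the agreed direction, not necessarily the final committed code:

// Sketch: refresh listeners are notified serially under lock, and the checkpoint
// captured in beforeRefresh can only grow between refreshes, so a plain set suffices.
private final class LastRefreshedCheckpointListener implements ReferenceManager.RefreshListener {
    final AtomicLong refreshedCheckpoint;
    private long pendingCheckpoint;

    LastRefreshedCheckpointListener(long initialLocalCheckpoint) {
        this.refreshedCheckpoint = new AtomicLong(initialLocalCheckpoint);
    }

    @Override
    public void beforeRefresh() {
        // All changes up to this checkpoint will be visible in the refreshed reader.
        pendingCheckpoint = localCheckpointTracker.getCheckpoint();
    }

    @Override
    public void afterRefresh(boolean didRefresh) {
        if (didRefresh) {
            refreshedCheckpoint.set(pendingCheckpoint);
        }
    }
}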
server/src/main/java/org/elasticsearch/index/engine/LuceneChangesSnapshot.java (new file)
@@ -0,0 +1,213 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.engine;

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.TopDocs;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.fieldvisitor.FieldsVisitor;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.SeqNoFieldMapper;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.mapper.VersionFieldMapper;
import org.elasticsearch.index.translog.Translog;

import java.io.Closeable;
import java.io.IOException;
import java.util.List;
import java.util.function.Supplier;

/**
* A {@link Translog.Snapshot} from changes in a Lucene index
*/
final class LuceneChangesSnapshot implements Translog.Snapshot {
private final long fromSeqNo, toSeqNo;
private long lastSeenSeqNo;
private int skippedOperations;
private final boolean requiredFullRange;

private final IndexSearcher searcher;
private final MapperService mapperService;
private int docIndex;
private final TopDocs topDocs;

private final Closeable onClose;

/**
* Creates a new "translog" snapshot from Lucene for reading operations whose seq# is in the specified range.
*
* @param searcherFactory the engine searcher factory (prefer the internal searcher)
* @param mapperService the mapper service which will be mainly used to resolve the document's type and uid
* @param fromSeqNo the minimum requested seq# - inclusive
* @param toSeqNo the maximum requested seq# - inclusive
* @param requiredFullRange if true, the snapshot will strictly check for the existence of operations between fromSeqNo and toSeqNo
*/
LuceneChangesSnapshot(Supplier<Engine.Searcher> searcherFactory, MapperService mapperService,
long fromSeqNo, long toSeqNo, boolean requiredFullRange) throws IOException {
if (fromSeqNo < 0 || toSeqNo < 0 || fromSeqNo > toSeqNo) {
throw new IllegalArgumentException("Invalid range; from_seqno [" + fromSeqNo + "], to_seqno [" + toSeqNo + "]");
}
this.mapperService = mapperService;
this.fromSeqNo = fromSeqNo;
this.toSeqNo = toSeqNo;
this.lastSeenSeqNo = fromSeqNo - 1;
this.requiredFullRange = requiredFullRange;
boolean success = false;
final Engine.Searcher engineSearcher = searcherFactory.get();
[Contributor] Why do we need a supplier? Can we just give the searcher as a param?

[Contributor] Oh, I think I see why - it's for closing. I think it's still better to pass in a searcher and close it on exception, as you do now.

[Member] +1. I passed an engine searcher directly.

try {
this.searcher = new IndexSearcher(Lucene.wrapAllDocsLive(engineSearcher.getDirectoryReader()));
this.searcher.setQueryCache(null);
this.topDocs = searchOperations(searcher);
success = true;
this.onClose = engineSearcher;
} finally {
if (success == false) {
[Contributor] I think this should be handled on the caller side? We are not responsible for this reference to the engine searcher unless fully constructed?

[Member] Done

IOUtils.close(engineSearcher);
}
}
}

@Override
public void close() throws IOException {
onClose.close();
}

@Override
public int totalOperations() {
return Math.toIntExact(topDocs.totalHits);
}

@Override
public int overriddenOperations() {
return skippedOperations;
}

@Override
public Translog.Operation next() throws IOException {
final Translog.Operation op = nextOp();
if (requiredFullRange && lastSeenSeqNo < toSeqNo) {
[Contributor] Why do we check here that lastSeenSeqNo < toSeqNo? Shouldn't we stop reading before this happens?

[Contributor] Do we also want to assert that seqNo != lastSeenSeqNo?

[Member] The caller should continue consuming the snapshot until the next method returns null. In the last call, lastSeenSeqNo equals toSeqNo and op is null. This guard is added to avoid checking in that case. I am +1 on the assertion.

[Contributor] I'm confused - you check for op == null later on? Maybe just put the op != null check on this outer if?

final long expectedSeqNo = lastSeenSeqNo + 1;
if (op == null || op.seqNo() != expectedSeqNo) {
throw new IllegalStateException("Not all operations between min_seqno [" + fromSeqNo + "] " +
"and max_seqno [" + toSeqNo + "] found; expected seqno [" + expectedSeqNo + "]; found [" + op + "]");
}
}
if (op != null) {
lastSeenSeqNo = op.seqNo();
}
return op;
}

private Translog.Operation nextOp() throws IOException {
final ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (; docIndex < scoreDocs.length; docIndex++) {
if (scoreDocs[docIndex].doc == DocIdSetIterator.NO_MORE_DOCS) {
return null;
}
final Translog.Operation op = readDocAsOp(scoreDocs[docIndex].doc);
if (op != null) {
return op;
}
}
return null;
}

private TopDocs searchOperations(IndexSearcher searcher) throws IOException {
final Query rangeQuery = LongPoint.newRangeQuery(SeqNoFieldMapper.NAME, fromSeqNo, toSeqNo);
final Sort sortedBySeqNoThenByTerm = new Sort(
new SortedNumericSortField(SeqNoFieldMapper.NAME, SortField.Type.LONG),
new SortedNumericSortField(SeqNoFieldMapper.PRIMARY_TERM_NAME, SortField.Type.LONG, true)
[Contributor] As discussed - this should be needed in the future. Maybe we should remove it and instead assert that we never have duplicate seq#.

[Member] I think I'm missing something here, because I think we need it for now but not in the future, after we have a Lucene rollback. I will reach out to discuss this.

[Contributor] I'm sorry, but I dropped a "not" in my comment: "this should not be needed in the future." It's only relevant in cases where the primary dies while indexing is ongoing and we have more than 1 replica. In these cases this primary sort doesn't help, because you also need some kind of a deduping mechanism to really make it work. Such deduping is fairly easy to implement, but I'm on the fence about whether we should.

[Member] We have dedup in this PR already (lines 161-163). The lastSeenSeqNo is used for dedup and the range check. I am fine with removing the primary sort and the dedup mechanism.

[Contributor] I see, I missed it. I think it's surprising to put it in readDocAsOp and short-circuit. I'd prefer to do it in next, where we do all our state updates, and keep everything together. It's rare anyway and doesn't require optimization IMO. That said, it's all nits. If you prefer it otherwise I'm good. Thanks for clarifying.

[Member] I agree, we should not mutate anything in readDocAsOp. I will update this.

[Member] @bleskes I moved this to next, but we also need to dedup for nested docs, so I moved it back to readDocAsOp. I think we should optimize for nested docs. I am open to suggestions here.

);
return searcher.search(rangeQuery, Integer.MAX_VALUE, sortedBySeqNoThenByTerm);
}

private Translog.Operation readDocAsOp(int docID) throws IOException {
final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
final LeafReaderContext leaf = leaves.get(ReaderUtil.subIndex(docID, leaves));
final int segmentDocID = docID - leaf.docBase;
final long seqNo = readNumericDV(leaf, SeqNoFieldMapper.NAME, segmentDocID);
// This operation has been seen and will be skipped anyway - do not visit other fields.
if (seqNo == lastSeenSeqNo) {
skippedOperations++;
return null;
}

final long primaryTerm = readNumericDV(leaf, SeqNoFieldMapper.PRIMARY_TERM_NAME, segmentDocID);
final FieldsVisitor fields = new FieldsVisitor(true);
searcher.doc(docID, fields);
fields.postProcess(mapperService);

final Translog.Operation op;
final boolean isTombstone = isTombstoneOperation(leaf, segmentDocID);
if (isTombstone && fields.uid() == null) {
op = new Translog.NoOp(seqNo, primaryTerm, ""); // TODO: store reason in ignored fields?
[Contributor] I tend to say yes? It's very rare and it feels like a good debugging tool. I wonder what other people think?

[Contributor] ++

[Member] I will do it in a follow-up.

assert readNumericDV(leaf, Lucene.SOFT_DELETE_FIELD, segmentDocID) == 1
: "Noop operation but soft_deletes field is not set [" + op + "]";
} else {
final String id = fields.uid().id();
final String type = fields.uid().type();
final Term uid = new Term(IdFieldMapper.NAME, Uid.encodeId(id));
final long version = readNumericDV(leaf, VersionFieldMapper.NAME, segmentDocID);
if (isTombstone) {
op = new Translog.Delete(type, id, uid, seqNo, primaryTerm, version, VersionType.INTERNAL);
assert readNumericDV(leaf, Lucene.SOFT_DELETE_FIELD, segmentDocID) == 1
: "Delete operation but soft_deletes field is not set [" + op + "]";
} else {
final BytesReference source = fields.source();
// TODO: pass the latest timestamp from engine.
final long autoGeneratedIdTimestamp = -1;
op = new Translog.Index(type, id, seqNo, primaryTerm, version, VersionType.INTERNAL,
source.toBytesRef().bytes, fields.routing(), autoGeneratedIdTimestamp);
}
}
return op;
}

private boolean isTombstoneOperation(LeafReaderContext leaf, int segmentDocID) throws IOException {
[Contributor] Maybe this should just take a LeafReader?

[Contributor] I also wonder if we want to pull the tombstoneDV into the ctor, next to List<LeafReaderContext> leaves, and a List<NumericDocValues> for seqIds... I think this would be nice and would prevent getting stuff from the reader over and over again.

final NumericDocValues tombstoneDV = leaf.reader().getNumericDocValues(SeqNoFieldMapper.TOMBSTONE_NAME);
[Contributor] I wonder if we can pull all of these, in the constructor, into an array that we can access by the index of the leaf reader. This is how we do things in Lucene for stuff we access frequently.

[Member] @s1monw I tried, but realized that the NumericDocValues#advanceExact method requires increasing docID values, and that's not the case here. Do you have any suggestions?

  /** Advance the iterator to exactly {@code target} and return whether
   *  {@code target} has a value.
   *  {@code target} must be greater than or equal to the current
   *  {@link #docID() doc ID} and must be a valid doc ID, ie. &ge; 0 and
   *  &lt; {@code maxDoc}.
   *  After this method returns, {@link #docID()} returns {@code target}. */
  public abstract boolean advanceExact(int target) throws IOException;

[Member] I think I need to reset the DV :)

if (tombstoneDV != null && tombstoneDV.advanceExact(segmentDocID)) {
return tombstoneDV.longValue() == 1;
}
return false;
}

private long readNumericDV(LeafReaderContext leaf, String field, int segmentDocID) throws IOException {
[Contributor] Maybe this should just take a LeafReader?

final NumericDocValues dv = leaf.reader().getNumericDocValues(field);
if (dv == null || dv.advanceExact(segmentDocID) == false) {
throw new IllegalStateException("DocValues for field [" + field + "] is not found");
}
return dv.longValue();
}
}
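One possible shape for the per-leaf caching discussed above: hold a docvalues iterator per leaf and pull a fresh iterator whenever the target docID moves backwards, since advanceExact only accepts non-decreasing targets. A sketch under those assumptions - the CombinedDocValues name is illustrative and not a class in this PR, and the usual Lucene imports (LeafReader, NumericDocValues) are assumed:

// Sketch: cache a NumericDocValues per leaf and "reset" it (re-pull the iterator)
// when the target docID is behind the iterator's current position.
private static final class CombinedDocValues {
    private final LeafReader reader;
    private NumericDocValues seqNoDV;

    CombinedDocValues(LeafReader reader) throws IOException {
        this.reader = reader;
        this.seqNoDV = reader.getNumericDocValues(SeqNoFieldMapper.NAME);
    }

    long docSeqNo(int segmentDocId) throws IOException {
        if (seqNoDV == null) {
            throw new IllegalStateException("seq_no docvalues are not found");
        }
        if (segmentDocId < seqNoDV.docID()) {
            // advanceExact requires an increasing target; re-pull the iterator to go back.
            seqNoDV = reader.getNumericDocValues(SeqNoFieldMapper.NAME);
        }
        if (seqNoDV.advanceExact(segmentDocId) == false) {
            throw new IllegalStateException("doc [" + segmentDocId + "] has no seq_no value");
        }
        return seqNoDV.longValue();
    }
}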
server/src/main/java/org/elasticsearch/index/shard/IndexShard.java (17 additions, 5 deletions)
@@ -1595,11 +1595,7 @@ public Closeable acquireTranslogRetentionLock() {
* The caller has to close the returned snapshot after finishing the reading.
*/
public Translog.Snapshot newTranslogSnapshotFromMinSeqNo(long minSeqNo) throws IOException {
-        return newTranslogSnapshotBetween(minSeqNo, Long.MAX_VALUE);
-    }
-
-    public Translog.Snapshot newTranslogSnapshotBetween(long minSeqNo, long maxSeqNo) throws IOException {
-        return getEngine().newTranslogSnapshotBetween(minSeqNo, maxSeqNo);
+        return getEngine().newTranslogSnapshotBetween(minSeqNo, Long.MAX_VALUE);
}

/**
@@ -1609,6 +1605,22 @@ public int estimateTranslogOperationsFromMinSeq(long minSeqNo) {
return getEngine().estimateTranslogOperationsFromMinSeq(minSeqNo);
}

/**
* Creates a new "translog" snapshot from Lucene for reading operations whose seqno is between minSeqNo and maxSeqNo.
* The caller has to close the returned snapshot after finishing the reading.
*
* @param source the source of the request
* @param minSeqNo the min_seqno to read - inclusive
* @param maxSeqNo the max_seqno to read - inclusive
* @param requiredFullRange if true then {@link Translog.Snapshot#next()} will throw {@link IllegalStateException}
* if any operation between minSeqNo and maxSeqNo is missing. This parameter should be only
* enabled when the requesting range is below the global checkpoint.
*/
public Translog.Snapshot newLuceneChangesSnapshot(String source, long minSeqNo, long maxSeqNo,
boolean requiredFullRange) throws IOException {
return getEngine().newLuceneChangesSnapshot(source, mapperService, minSeqNo, maxSeqNo, requiredFullRange);
}

public List<Segment> segments(boolean verbose) {
return getEngine().segments(verbose);
}
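To make the requiredFullRange contract above concrete: a range at or below the global checkpoint is expected to be fully present, so any gap surfaces as an IllegalStateException from next(). An illustrative sketch, not code from this PR - the shard handle, the "resync" source string, and the use of getGlobalCheckpoint() are assumptions of the example:

// Sketch: read every operation up to the global checkpoint and require no gaps.
final long globalCheckpoint = indexShard.getGlobalCheckpoint();
try (Translog.Snapshot snapshot =
         indexShard.newLuceneChangesSnapshot("resync", 0, globalCheckpoint, true)) {
    Translog.Operation op;
    while ((op = snapshot.next()) != null) {
        // Operations come back sorted by seqno; with requiredFullRange=true,
        // a missing seqno in [0, globalCheckpoint] makes next() throw IllegalStateException.
    }
}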
server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java
@@ -1545,42 +1545,6 @@ public void testConcurrentOutOfOrderDocsOnReplica() throws IOException, InterruptedException {
assertVisibleCount(engine, totalExpectedOps);
}

-    private void concurrentlyApplyOps(List<Engine.Operation> ops, InternalEngine engine) throws InterruptedException {
-        Thread[] thread = new Thread[randomIntBetween(3, 5)];
-        CountDownLatch startGun = new CountDownLatch(thread.length);
-        AtomicInteger offset = new AtomicInteger(-1);
-        for (int i = 0; i < thread.length; i++) {
-            thread[i] = new Thread(() -> {
-                startGun.countDown();
-                try {
-                    startGun.await();
-                } catch (InterruptedException e) {
-                    throw new AssertionError(e);
-                }
-                int docOffset;
-                while ((docOffset = offset.incrementAndGet()) < ops.size()) {
-                    try {
-                        final Engine.Operation op = ops.get(docOffset);
-                        if (op instanceof Engine.Index) {
-                            engine.index((Engine.Index) op);
-                        } else {
-                            engine.delete((Engine.Delete) op);
-                        }
-                        if ((docOffset + 1) % 4 == 0) {
-                            engine.refresh("test");
-                        }
-                    } catch (IOException e) {
-                        throw new AssertionError(e);
-                    }
-                }
-            });
-            thread[i].start();
-        }
-        for (int i = 0; i < thread.length; i++) {
-            thread[i].join();
-        }
-    }

public void testInternalVersioningOnPrimary() throws IOException {
final List<Engine.Operation> ops = generateSingleDocHistory(false, VersionType.INTERNAL, 2, 2, 20, "1");
assertOpsOnPrimary(ops, Versions.NOT_FOUND, true, engine);