Upgrade to lucene-8.0.0-snapshot-31d7dfe6b1 #35224

Merged: 14 commits, Nov 6, 2018
buildSrc/version.properties: 2 changes (1 addition, 1 deletion)
@@ -1,5 +1,5 @@
elasticsearch = 7.0.0
lucene = 8.0.0-snapshot-7d0a7782fa
lucene = 8.0.0-snapshot-31d7dfe6b1

# optional dependencies
spatial4j = 0.7
@@ -44,7 +44,6 @@
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
@@ -308,7 +307,8 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
tokenizers.put("classic", ClassicTokenizerFactory::new);
tokenizers.put("letter", LetterTokenizerFactory::new);
tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
// TODO deprecate and remove in API
tokenizers.put("lowercase", XLowerCaseTokenizerFactory::new);
tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.put("pattern", PatternTokenizerFactory::new);
@@ -503,7 +503,8 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
() -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
// TODO deprecate and remove in API
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
@Override
public String name() {
return "lowercase";
@@ -0,0 +1,113 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;

import java.io.IOException;

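/**
 * Standalone copy of the lowercase tokenizer behaviour (split on non-letter
 * characters, then lowercase each token), kept because this Lucene snapshot no
 * longer ships LowerCaseTokenizer; registered under the "lowercase" tokenizer
 * name until it can be deprecated and removed in a follow-up.
 */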
@Deprecated
class XLowerCaseTokenizer extends Tokenizer {

private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;

private static final int IO_BUFFER_SIZE = 4096;

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);

@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
int length = 0;
int start = -1; // this variable is always initialized
int end = -1;
char[] buffer = termAtt.buffer();
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
if (ioBuffer.getLength() == 0) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
break;
} else {
finalOffset = correctOffset(offset);
return false;
}
}
dataLen = ioBuffer.getLength();
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int charCount = Character.charCount(c);
bufferIndex += charCount;

if (Character.isLetter(c)) { // if it's a token char
if (length == 0) { // start of token
assert start == -1;
start = offset + bufferIndex - charCount;
end = start;
} else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
}
end += charCount;
length += Character.toChars(Character.toLowerCase(c), buffer, length); // buffer it, normalized
int maxTokenLen = CharTokenizer.DEFAULT_MAX_WORD_LEN;
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
}
} else if (length > 0) { // at non-Letter w/ chars
break; // return 'em
}
}

termAtt.setLength(length);
assert start != -1;
offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
return true;

}

@Override
public final void end() throws IOException {
super.end();
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}

@Override
public void reset() throws IOException {
super.reset();
bufferIndex = 0;
offset = 0;
dataLen = 0;
finalOffset = 0;
ioBuffer.reset(); // make sure to reset the IO buffer!!
}

}
@@ -20,26 +20,20 @@
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;

public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {
@Deprecated
public class XLowerCaseTokenizerFactory extends AbstractTokenizerFactory {
Contributor

Can we keep the old name?


LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
public XLowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, settings);
}

@Override
public Tokenizer create() {
return new LowerCaseTokenizer();
}

@Override
public Object getMultiTermComponent() {
return this;
return new XLowerCaseTokenizer();
Contributor

Can we use a LetterTokenizer followed by a LowerCaseFilter, as explained in the deprecation javadocs?
I don't think we need to keep an XLowerCaseTokenizer.

Contributor

I'd hoped to do that, but unfortunately the contract of create() demands that we return a Tokenizer, and I don't think there's an easy way of wrapping a Tokenizer + TokenFilter combination here.

Contributor

ok, thanks for explaining

Contributor

Can we at least restrict the usage to old indices (created before 7.0) in order to be able to remove it in 8?

Contributor

As discussed with @romseygeek offline, we'll handle the deprecation and removal in a follow-up PR.

}
}
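
To make the trade-off discussed in the thread above concrete, here is a hedged sketch (the class and method names are illustrative; it assumes Lucene 8 still ships LetterTokenizer and LowerCaseFilter): chaining the two reproduces the old lowercase tokenizer's behaviour, but the chained result is a TokenStream, while the factory's create() must return a Tokenizer.

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;

class LetterLowerCaseSketch {

    // The chain suggested in the deprecation javadocs: split on letters, then lowercase.
    static TokenStream letterThenLowerCase() {
        Tokenizer letters = new LetterTokenizer();
        return new LowerCaseFilter(letters); // a TokenStream, not a Tokenizer
    }

    // A tokenizer factory, however, has to hand back a Tokenizer, and the wrapped
    // chain above no longer is one; hence the standalone XLowerCaseTokenizer copy.
    static Tokenizer whatTheFactoryMustReturn() {
        return new XLowerCaseTokenizer();
    }
}
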
@@ -48,7 +48,7 @@ protected Map<String, Class<?>> getTokenizers() {
tokenizers.put("edgengram", EdgeNGramTokenizerFactory.class);
tokenizers.put("classic", ClassicTokenizerFactory.class);
tokenizers.put("letter", LetterTokenizerFactory.class);
tokenizers.put("lowercase", LowerCaseTokenizerFactory.class);
// tokenizers.put("lowercase", XLowerCaseTokenizerFactory.class);
Contributor

Why is it commented out?

Contributor

The tests here explicitly check that we can load Lucene analysis classes. LowerCaseTokenizer isn't there any more, so this entry needs to be removed; the commenting out was just to get the tests to pass.

Contributor

ok, thanks

tokenizers.put("pathhierarchy", PathHierarchyTokenizerFactory.class);
tokenizers.put("pattern", PatternTokenizerFactory.class);
tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
@@ -223,7 +223,7 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
protected Map<String, Class<?>> getPreConfiguredTokenizers() {
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getPreConfiguredTokenizers());
tokenizers.put("keyword", null);
tokenizers.put("lowercase", null);
tokenizers.put("lowercase", Void.class);
tokenizers.put("classic", null);
tokenizers.put("uax_url_email", org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class);
tokenizers.put("path_hierarchy", null);
@@ -0,0 +1 @@
8db13c6e146c851614c9f862f1eac67431f9b509

This file was deleted.

@@ -75,7 +75,7 @@ public void testDefaults() throws Exception {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1, pointField.fieldType().pointDataDimensionCount());
assertFalse(pointField.fieldType().stored());
assertEquals(1230, pointField.numericValue().longValue());
IndexableField dvField = fields[1];
@@ -149,7 +149,7 @@ public void testNoDocValues() throws Exception {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(1, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1, pointField.fieldType().pointDataDimensionCount());
assertEquals(1230, pointField.numericValue().longValue());
}

@@ -173,7 +173,7 @@ public void testStore() throws Exception {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(3, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1, pointField.fieldType().pointDataDimensionCount());
assertEquals(1230, pointField.numericValue().doubleValue(), 0d);
IndexableField dvField = fields[1];
assertEquals(DocValuesType.SORTED_NUMERIC, dvField.fieldType().docValuesType());
@@ -202,7 +202,7 @@ public void testCoerce() throws Exception {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1, pointField.fieldType().pointDataDimensionCount());
assertEquals(1230, pointField.numericValue().longValue());
IndexableField dvField = fields[1];
assertEquals(DocValuesType.SORTED_NUMERIC, dvField.fieldType().docValuesType());
@@ -317,7 +317,7 @@ public void testNullValue() throws IOException {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1, pointField.fieldType().pointDataDimensionCount());
assertFalse(pointField.fieldType().stored());
assertEquals(25, pointField.numericValue().longValue());
IndexableField dvField = fields[1];
@@ -327,7 +327,7 @@ Tuple<List<BytesRef>, Map<String, List<byte[]>>> extractTermsAndRanges(IndexRead
extractedTerms.add(builder.toBytesRef());
}
}
if (info.getPointDimensionCount() == 1) { // not != 0 because range fields are not supported
if (info.getPointIndexDimensionCount() == 1) { // not != 0 because range fields are not supported
PointValues values = reader.getPointValues(info.name);
List<byte[]> encodedPointValues = new ArrayList<>();
encodedPointValues.add(values.getMinPackedValue().clone());
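
For context on the renames above: in this Lucene snapshot a point field reports its data dimensions and its index dimensions separately, so the mapper tests assert on pointDataDimensionCount() while the percolator extraction checks getPointIndexDimensionCount(). A minimal sketch of the split, assuming the Lucene 8 FieldType API (the class name is illustrative):

import org.apache.lucene.document.FieldType;

class PointDimensionSketch {

    static FieldType longPointType() {
        FieldType type = new FieldType();
        // Data and index dimension counts are declared separately in Lucene 8;
        // here both are 1, mirroring a single-valued numeric point field.
        type.setDimensions(1, 1, Long.BYTES);
        type.freeze();
        return type;
    }

    public static void main(String[] args) {
        FieldType type = longPointType();
        System.out.println(type.pointDataDimensionCount());  // 1
        System.out.println(type.pointIndexDimensionCount()); // 1
    }
}
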
@@ -38,7 +38,7 @@
Expand Up @@ -38,7 +38,7 @@
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
@@ -1090,7 +1090,7 @@ private void duelRun(PercolateQuery.QueryStore queryStore, MemoryIndex memoryInd
String queryToString = shardSearcher.doc(controlTopDocs.scoreDocs[i].doc).get("query_to_string");
logger.error("controlTopDocs.scoreDocs[{}].query_to_string={}", i, queryToString);

TermsEnum tenum = MultiFields.getFields(shardSearcher.getIndexReader()).terms(fieldType.queryTermsField.name()).iterator();
TermsEnum tenum = MultiTerms.getTerms(shardSearcher.getIndexReader(), fieldType.queryTermsField.name()).iterator();
StringBuilder builder = new StringBuilder();
for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
PostingsEnum penum = tenum.postings(null);
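
The change above swaps the MultiFields.getFields(reader).terms(field) access pattern for MultiTerms.getTerms(reader, field), which is how Lucene 8 exposes a field's terms merged across all segments. A small self-contained sketch of the new pattern (the helper class and method are hypothetical):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class MultiTermsSketch {

    // Iterate every term of a field across all segments of the reader,
    // the same access the duelRun diagnostics above rely on.
    static long countTerms(IndexReader reader, String field) throws IOException {
        Terms terms = MultiTerms.getTerms(reader, field);
        if (terms == null) {
            return 0; // the field is not present in the index
        }
        long count = 0;
        TermsEnum termsEnum = terms.iterator();
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            count++;
        }
        return count;
    }
}
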
@@ -0,0 +1 @@
b474e1a2d7f0172338a08f159849a6c491781d70

This file was deleted.

@@ -0,0 +1 @@
fc547e69837bcb808f1782bfa35490645bab9cae

This file was deleted.

@@ -0,0 +1 @@
e08961a2ec9414947693659ff79bb7e21a410298

This file was deleted.

@@ -0,0 +1 @@
09280919225656c7ce2a14af29666a02bd86c540

This file was deleted.

@@ -0,0 +1 @@
880f10393cdefff7575fbf5b2ced890666ec81dc

This file was deleted.

@@ -0,0 +1 @@
b41451a9d4e30b8a9a14ccdd7553e5796f77cf44

This file was deleted.

@@ -0,0 +1 @@
145fd2c803d682c2cb2d78e6e350e09a09a09ea0

This file was deleted.
