-
Notifications
You must be signed in to change notification settings - Fork 24.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Upgrade to lucene-8.0.0-snapshot-31d7dfe6b1 #35224
Changes from 9 commits
c861c14
7bd1dc9
c1ca8a8
a3c1282
aa10fa5
0781a49
2182fa1
4b20850
6d5431b
1b358ee
1ca6845
2c9e726
88a0802
35a239f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.CharacterUtils; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.util.CharTokenizer; | ||
|
||
import java.io.IOException; | ||
|
||
@Deprecated | ||
class XLowerCaseTokenizer extends Tokenizer { | ||
|
||
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0; | ||
|
||
private static final int IO_BUFFER_SIZE = 4096; | ||
|
||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); | ||
|
||
private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); | ||
|
||
@Override | ||
public final boolean incrementToken() throws IOException { | ||
clearAttributes(); | ||
int length = 0; | ||
int start = -1; // this variable is always initialized | ||
int end = -1; | ||
char[] buffer = termAtt.buffer(); | ||
while (true) { | ||
if (bufferIndex >= dataLen) { | ||
offset += dataLen; | ||
CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils | ||
if (ioBuffer.getLength() == 0) { | ||
dataLen = 0; // so next offset += dataLen won't decrement offset | ||
if (length > 0) { | ||
break; | ||
} else { | ||
finalOffset = correctOffset(offset); | ||
return false; | ||
} | ||
} | ||
dataLen = ioBuffer.getLength(); | ||
bufferIndex = 0; | ||
} | ||
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone | ||
final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength()); | ||
final int charCount = Character.charCount(c); | ||
bufferIndex += charCount; | ||
|
||
if (Character.isLetter(c)) { // if it's a token char | ||
if (length == 0) { // start of token | ||
assert start == -1; | ||
start = offset + bufferIndex - charCount; | ||
end = start; | ||
} else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds | ||
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer | ||
} | ||
end += charCount; | ||
length += Character.toChars(Character.toLowerCase(c), buffer, length); // buffer it, normalized | ||
int maxTokenLen = CharTokenizer.DEFAULT_MAX_WORD_LEN; | ||
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test | ||
break; | ||
} | ||
} else if (length > 0) { // at non-Letter w/ chars | ||
break; // return 'em | ||
} | ||
} | ||
|
||
termAtt.setLength(length); | ||
assert start != -1; | ||
offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end)); | ||
return true; | ||
|
||
} | ||
|
||
@Override | ||
public final void end() throws IOException { | ||
super.end(); | ||
// set final offset | ||
offsetAtt.setOffset(finalOffset, finalOffset); | ||
} | ||
|
||
@Override | ||
public void reset() throws IOException { | ||
super.reset(); | ||
bufferIndex = 0; | ||
offset = 0; | ||
dataLen = 0; | ||
finalOffset = 0; | ||
ioBuffer.reset(); // make sure to reset the IO buffer!! | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,26 +20,20 @@ | |
package org.elasticsearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.core.LowerCaseTokenizer; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.env.Environment; | ||
import org.elasticsearch.index.IndexSettings; | ||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory; | ||
import org.elasticsearch.index.analysis.MultiTermAwareComponent; | ||
|
||
public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent { | ||
@Deprecated | ||
public class XLowerCaseTokenizerFactory extends AbstractTokenizerFactory { | ||
|
||
LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { | ||
public XLowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { | ||
super(indexSettings, settings); | ||
} | ||
|
||
@Override | ||
public Tokenizer create() { | ||
return new LowerCaseTokenizer(); | ||
} | ||
|
||
@Override | ||
public Object getMultiTermComponent() { | ||
return this; | ||
return new XLowerCaseTokenizer(); | ||
Review comments: (1) Can we use a LetterTokenizer followed by a LowerCaseFilter, as explained in the deprecation javadocs? (2) I'd hoped to do that, but unfortunately the contract of […] (the rest of this reply, which began with an inline code reference, was lost in extraction). (3) OK, thanks for explaining. (4) Can we at least restrict the usage to old indices (created before 7.0) so that we are able to remove it in 8? (5) As discussed with @romseygeek offline, we'll handle the deprecation and removal in a follow-up PR. |
||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,7 +48,7 @@ protected Map<String, Class<?>> getTokenizers() { | |
tokenizers.put("edgengram", EdgeNGramTokenizerFactory.class); | ||
tokenizers.put("classic", ClassicTokenizerFactory.class); | ||
tokenizers.put("letter", LetterTokenizerFactory.class); | ||
tokenizers.put("lowercase", LowerCaseTokenizerFactory.class); | ||
// tokenizers.put("lowercase", XLowerCaseTokenizerFactory.class); | ||
Review comments: (1) Why is it commented out? (2) The tests here are explicitly checking that we can load Lucene analysis classes. (3) OK, thanks. |
||
tokenizers.put("pathhierarchy", PathHierarchyTokenizerFactory.class); | ||
tokenizers.put("pattern", PatternTokenizerFactory.class); | ||
tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class); | ||
|
@@ -223,7 +223,7 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() { | |
protected Map<String, Class<?>> getPreConfiguredTokenizers() { | ||
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getPreConfiguredTokenizers()); | ||
tokenizers.put("keyword", null); | ||
tokenizers.put("lowercase", null); | ||
tokenizers.put("lowercase", Void.class); | ||
tokenizers.put("classic", null); | ||
tokenizers.put("uax_url_email", org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class); | ||
tokenizers.put("path_hierarchy", null); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
8db13c6e146c851614c9f862f1eac67431f9b509 |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
b474e1a2d7f0172338a08f159849a6c491781d70 |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
fc547e69837bcb808f1782bfa35490645bab9cae |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
e08961a2ec9414947693659ff79bb7e21a410298 |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
09280919225656c7ce2a14af29666a02bd86c540 |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
880f10393cdefff7575fbf5b2ced890666ec81dc |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
b41451a9d4e30b8a9a14ccdd7553e5796f77cf44 |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
145fd2c803d682c2cb2d78e6e350e09a09a09ea0 |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we keep the old name?