-
Notifications
You must be signed in to change notification settings - Fork 24.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Upgrade to lucene-8.0.0-snapshot-31d7dfe6b1 (#35224)
- Loading branch information
Showing
81 changed files
with
382 additions
and
322 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
113 changes: 113 additions & 0 deletions
113
.../analysis-common/src/main/java/org/elasticsearch/analysis/common/XLowerCaseTokenizer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.CharacterUtils; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.util.CharTokenizer; | ||
|
||
import java.io.IOException; | ||
|
||
/**
 * Tokenizer that emits maximal runs of letters, lower-cased.
 *
 * A code point belongs to a token when {@link Character#isLetter(int)} accepts it; each
 * accepted code point is passed through {@link Character#toLowerCase(int)} before it is
 * appended to the term. Any non-letter ends the current token. A token is also cut as soon
 * as its length reaches {@code CharTokenizer.DEFAULT_MAX_WORD_LEN}; the remaining letters
 * are then picked up by the following call to {@link #incrementToken()}.
 *
 * The class is marked deprecated; the reason and the intended replacement are not stated
 * in this file.
 */
@Deprecated | ||
class XLowerCaseTokenizer extends Tokenizer { | ||
|
||
// offset:      sum of the lengths of all previously consumed buffer fills
// bufferIndex: read position inside the current ioBuffer contents
// dataLen:     number of chars available in the current ioBuffer contents
// finalOffset: corrected end offset published by end()
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0; | ||
|
||
// capacity requested for the character buffer that input is read into
private static final int IO_BUFFER_SIZE = 4096; | ||
|
||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); | ||
|
||
private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); | ||
|
||
/**
 * Advances to the next token.
 *
 * On success the term attribute holds the lower-cased letters and the offset attribute
 * holds the corrected start/end offsets of the token.
 *
 * @return {@code true} if a token was produced; {@code false} once the input yields no
 *         more characters and no letters are pending
 * @throws IOException declared for reading from the underlying input
 */
@Override | ||
public final boolean incrementToken() throws IOException { | ||
clearAttributes(); | ||
// length counts chars written to the term buffer; start/end are uncorrected offsets
int length = 0; | ||
int start = -1; // this variable is always initialized | ||
int end = -1; | ||
char[] buffer = termAtt.buffer(); | ||
while (true) { | ||
// current buffer contents are used up: account for them and read the next chunk
if (bufferIndex >= dataLen) { | ||
offset += dataLen; | ||
CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils | ||
if (ioBuffer.getLength() == 0) { | ||
dataLen = 0; // so next offset += dataLen won't decrement offset | ||
if (length > 0) { | ||
// input ended in the middle of a token: emit what has been collected
break; | ||
} else { | ||
// nothing pending: remember where the input ended for end()
finalOffset = correctOffset(offset); | ||
return false; | ||
} | ||
} | ||
dataLen = ioBuffer.getLength(); | ||
bufferIndex = 0; | ||
} | ||
// decode the next code point, which may occupy one or two chars. NOTE(review): the earlier note here referred to CharacterUtils, but the call below is java.lang.Character.codePointAt | ||
final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength()); | ||
final int charCount = Character.charCount(c); | ||
bufferIndex += charCount; | ||
|
||
if (Character.isLetter(c)) { // if it's a token char | ||
if (length == 0) { // start of token | ||
assert start == -1; | ||
// bufferIndex was already advanced, so step back by charCount to get the token start
start = offset + bufferIndex - charCount; | ||
end = start; | ||
} else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds | ||
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer | ||
} | ||
end += charCount; | ||
length += Character.toChars(Character.toLowerCase(c), buffer, length); // buffer it, normalized | ||
int maxTokenLen = CharTokenizer.DEFAULT_MAX_WORD_LEN; | ||
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test | ||
break; | ||
} | ||
} else if (length > 0) { // at non-Letter w/ chars | ||
break; // return 'em | ||
} | ||
// a non-letter seen before any letter is simply skipped
} | ||
|
||
termAtt.setLength(length); | ||
assert start != -1; | ||
// publish corrected offsets; finalOffset tracks the end of the last emitted token
offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end)); | ||
return true; | ||
|
||
} | ||
|
||
/**
 * Called after the last token; sets both start and end offset to the recorded
 * {@code finalOffset}.
 */
@Override | ||
public final void end() throws IOException { | ||
super.end(); | ||
// set final offset | ||
offsetAtt.setOffset(finalOffset, finalOffset); | ||
} | ||
|
||
/**
 * Returns the tokenizer to its initial state: all position counters go back to zero and
 * the I/O buffer is reset.
 */
@Override | ||
public void reset() throws IOException { | ||
super.reset(); | ||
bufferIndex = 0; | ||
offset = 0; | ||
dataLen = 0; | ||
finalOffset = 0; | ||
ioBuffer.reset(); // make sure to reset the IO buffer!! | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 change: 1 addition & 0 deletions
1
modules/lang-expression/licenses/lucene-expressions-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
8db13c6e146c851614c9f862f1eac67431f9b509 |
1 change: 0 additions & 1 deletion
1
modules/lang-expression/licenses/lucene-expressions-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 change: 1 addition & 0 deletions
1
plugins/analysis-icu/licenses/lucene-analyzers-icu-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
b474e1a2d7f0172338a08f159849a6c491781d70 |
1 change: 0 additions & 1 deletion
1
plugins/analysis-icu/licenses/lucene-analyzers-icu-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
...s/analysis-kuromoji/licenses/lucene-analyzers-kuromoji-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
fc547e69837bcb808f1782bfa35490645bab9cae |
1 change: 0 additions & 1 deletion
1
...s/analysis-kuromoji/licenses/lucene-analyzers-kuromoji-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
plugins/analysis-nori/licenses/lucene-analyzers-nori-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
e08961a2ec9414947693659ff79bb7e21a410298 |
1 change: 0 additions & 1 deletion
1
plugins/analysis-nori/licenses/lucene-analyzers-nori-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
...s/analysis-phonetic/licenses/lucene-analyzers-phonetic-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
09280919225656c7ce2a14af29666a02bd86c540 |
1 change: 0 additions & 1 deletion
1
...s/analysis-phonetic/licenses/lucene-analyzers-phonetic-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
...ins/analysis-smartcn/licenses/lucene-analyzers-smartcn-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
880f10393cdefff7575fbf5b2ced890666ec81dc |
1 change: 0 additions & 1 deletion
1
...ins/analysis-smartcn/licenses/lucene-analyzers-smartcn-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
...ins/analysis-stempel/licenses/lucene-analyzers-stempel-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
b41451a9d4e30b8a9a14ccdd7553e5796f77cf44 |
1 change: 0 additions & 1 deletion
1
...ins/analysis-stempel/licenses/lucene-analyzers-stempel-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
...nalysis-ukrainian/licenses/lucene-analyzers-morfologik-8.0.0-snapshot-31d7dfe6b1.jar.sha1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
145fd2c803d682c2cb2d78e6e350e09a09a09ea0 |
1 change: 0 additions & 1 deletion
1
...nalysis-ukrainian/licenses/lucene-analyzers-morfologik-8.0.0-snapshot-7d0a7782fa.jar.sha1
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.