Skip to content

Commit

Permalink
Exposed ICU collator options in IcuCollationTokenFilterFactory
Browse files Browse the repository at this point in the history
 Closes #6
  • Loading branch information
martijnvg committed Sep 28, 2012
1 parent e7d045e commit 59d7f5c
Show file tree
Hide file tree
Showing 5 changed files with 416 additions and 8 deletions.
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,31 @@ And here is a sample of custom collation:
}
}

Optional options:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
See the ICU Collator documentation (http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) for a more detailed
explanation of the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
faster and more complete collation behavior. Since a great many of the world's languages do not require text
normalization, most locales set `no` as the default decomposition mode.

Expert options:
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
to be either shifted or non-ignorable. In practice, `shifted` boils down to ignoring punctuation and whitespace.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
combined with strength `primary`, this ignores accent differences while still taking case into account.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Whether to distinguish between
Katakana and Hiragana characters at `quaternary` strength.

ICU Tokenizer
-------------
Expand Down
10 changes: 10 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,16 @@
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
</exclusion>
<exclusion>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@
* <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
* Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
* in the settings or refer to an external location (preferable located under the <tt>config</tt> location, relative to it).
*
*
*/
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {

Expand Down Expand Up @@ -96,6 +94,81 @@ public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings index
collator = Collator.getInstance();
}
}

// set the strength flag, otherwise it will be the default.
String strength = settings.get("strength");
if (strength != null) {
if (strength.equalsIgnoreCase("primary")) {
collator.setStrength(Collator.PRIMARY);
} else if (strength.equalsIgnoreCase("secondary")) {
collator.setStrength(Collator.SECONDARY);
} else if (strength.equalsIgnoreCase("tertiary")) {
collator.setStrength(Collator.TERTIARY);
} else if (strength.equalsIgnoreCase("quaternary")) {
collator.setStrength(Collator.QUATERNARY);
} else if (strength.equalsIgnoreCase("identical")) {
collator.setStrength(Collator.IDENTICAL);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
}
}

// set the decomposition flag, otherwise it will be the default.
String decomposition = settings.get("decomposition");
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no")) {
collator.setDecomposition(Collator.NO_DECOMPOSITION);
} else if (decomposition.equalsIgnoreCase("canonical")) {
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
}
}

// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
String alternate = settings.get("alternate");
if (alternate != null) {
if (alternate.equalsIgnoreCase("shifted")) {
rbc.setAlternateHandlingShifted(true);
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
rbc.setAlternateHandlingShifted(false);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
}
}

Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
if (caseLevel != null) {
rbc.setCaseLevel(caseLevel);
}

String caseFirst = settings.get("caseFirst");
if (caseFirst != null) {
if (caseFirst.equalsIgnoreCase("lower")) {
rbc.setLowerCaseFirst(true);
} else if (caseFirst.equalsIgnoreCase("upper")) {
rbc.setUpperCaseFirst(true);
} else {
throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
}
}

Boolean numeric = settings.getAsBoolean("numeric", null);
if (numeric != null) {
rbc.setNumericCollation(numeric);
}

String variableTop = settings.get("variableTop");
if (variableTop != null) {
rbc.setVariableTop(variableTop);
}

Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
if (hiraganaQuaternaryMode != null) {
rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
}

this.collator = collator;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test;

import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf;

/**
Expand All @@ -53,18 +53,18 @@ public void testDefaultsIcuAnalysis() {
AnalysisService analysisService = injector.getInstance(AnalysisService.class);

TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));

TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));

filterFactory = analysisService.tokenFilter("icu_folding");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));

filterFactory = analysisService.tokenFilter("icu_collation");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));

filterFactory = analysisService.tokenFilter("icu_transform");
MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
}
}
Loading

0 comments on commit 59d7f5c

Please sign in to comment.