Exposed ICU collator options in IcuCollationTokenFilterFactory

Closes #6
elastic · Sep 28, 2012 · 59d7f5c · 59d7f5c
1 parent e7d045e
commit 59d7f5c
Show file tree

Hide file tree

Showing 5 changed files with 416 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -103,6 +103,31 @@ And here is a sample of custom collation:
         }
     }
 
+Optional options:
+* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
+ The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
+ Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
+ See the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
+ explanation of the specific values.
+* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
+`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
+normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
+before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
+faster and more complete collation behavior. Since a great many of the world's languages do not require text
+normalization, most locales set `no` as the default decomposition mode.
+
+Expert options:
+* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
+ to be either shifted or non-ignorable. This boils down to whether punctuation and whitespace are ignored.
+* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
+ strength is set to `primary` this will ignore accent differences.
+* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
+ for strength `tertiary`.
+* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
+ example the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
+* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
+* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Whether to distinguish between
+ Katakana and Hiragana characters in `quaternary` strength.
 
 ICU Tokenizer
 -------------

diff --git a/pom.xml b/pom.xml
@@ -68,6 +68,16 @@
             <artifactId>testng</artifactId>
             <version>6.8</version>
             <scope>test</scope>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.hamcrest</groupId>
+                    <artifactId>hamcrest-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>junit</groupId>
+                    <artifactId>junit</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
 
         <dependency>

diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java
@@ -45,8 +45,6 @@
  * <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
  * Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
  * in the settings or refer to an external location (preferable located under the <tt>config</tt> location, relative to it).
- *
- *
  */
 public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
 
@@ -96,6 +94,81 @@ public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings index
                 collator = Collator.getInstance();
             }
         }
+
+        // set the strength flag, otherwise it will be the default.
+        String strength = settings.get("strength");
+        if (strength != null) {
+            if (strength.equalsIgnoreCase("primary")) {
+                collator.setStrength(Collator.PRIMARY);
+            } else if (strength.equalsIgnoreCase("secondary")) {
+                collator.setStrength(Collator.SECONDARY);
+            } else if (strength.equalsIgnoreCase("tertiary")) {
+                collator.setStrength(Collator.TERTIARY);
+            } else if (strength.equalsIgnoreCase("quaternary")) {
+                collator.setStrength(Collator.QUATERNARY);
+            } else if (strength.equalsIgnoreCase("identical")) {
+                collator.setStrength(Collator.IDENTICAL);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
+            }
+        }
+
+        // set the decomposition flag, otherwise it will be the default.
+        String decomposition = settings.get("decomposition");
+        if (decomposition != null) {
+            if (decomposition.equalsIgnoreCase("no")) {
+                collator.setDecomposition(Collator.NO_DECOMPOSITION);
+            } else if (decomposition.equalsIgnoreCase("canonical")) {
+                collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
+            }
+        }
+
+        // expert options: concrete subclasses are always a RuleBasedCollator
+        RuleBasedCollator rbc = (RuleBasedCollator) collator;
+        String alternate = settings.get("alternate");
+        if (alternate != null) {
+            if (alternate.equalsIgnoreCase("shifted")) {
+                rbc.setAlternateHandlingShifted(true);
+            } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+                rbc.setAlternateHandlingShifted(false);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
+            }
+        }
+
+        Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
+        if (caseLevel != null) {
+            rbc.setCaseLevel(caseLevel);
+        }
+
+        String caseFirst = settings.get("caseFirst");
+        if (caseFirst != null) {
+            if (caseFirst.equalsIgnoreCase("lower")) {
+                rbc.setLowerCaseFirst(true);
+            } else if (caseFirst.equalsIgnoreCase("upper")) {
+                rbc.setUpperCaseFirst(true);
+            } else {
+                throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
+            }
+        }
+
+        Boolean numeric = settings.getAsBoolean("numeric", null);
+        if (numeric != null) {
+            rbc.setNumericCollation(numeric);
+        }
+
+        String variableTop = settings.get("variableTop");
+        if (variableTop != null) {
+            rbc.setVariableTop(variableTop);
+        }
+
+        Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
+        if (hiraganaQuaternaryMode != null) {
+            rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
+        }
+
         this.collator = collator;
     }
 

diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
@@ -29,10 +29,10 @@
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
-import org.hamcrest.MatcherAssert;
 import org.testng.annotations.Test;
 
 import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
+import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.instanceOf;
 
 /**
@@ -53,18 +53,18 @@ public void testDefaultsIcuAnalysis() {
         AnalysisService analysisService = injector.getInstance(AnalysisService.class);
 
         TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
-        MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
+        assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
 
         TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
 
         filterFactory = analysisService.tokenFilter("icu_folding");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
 
         filterFactory = analysisService.tokenFilter("icu_collation");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
 
         filterFactory = analysisService.tokenFilter("icu_transform");
-        MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
+        assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
     }
 }