diff --git a/src/Examine.Lucene/Analyzers/CultureInvariantStandardAnalyzer.cs b/src/Examine.Lucene/Analyzers/CultureInvariantStandardAnalyzer.cs
index dee785db8..a5535b9b9 100644
--- a/src/Examine.Lucene/Analyzers/CultureInvariantStandardAnalyzer.cs
+++ b/src/Examine.Lucene/Analyzers/CultureInvariantStandardAnalyzer.cs
@@ -15,10 +15,13 @@ namespace Examine.Lucene.Analyzers
     public sealed class CultureInvariantStandardAnalyzer : Analyzer
     {
         private readonly CharArraySet _stopWordsSet;
+        private readonly bool _caseInsensitive;
+        private readonly bool _ignoreLanguageAccents;
 
         public CultureInvariantStandardAnalyzer(CharArraySet stopWords)
+            : this(stopWords, true, true)
         {
-            _stopWordsSet = stopWords;
+
         }
 
         public CultureInvariantStandardAnalyzer()
@@ -26,6 +29,19 @@ public CultureInvariantStandardAnalyzer()
         {
         }
 
+        /// <summary>
+        /// Creates an analyzer whose lower-casing and ASCII-folding steps can be switched on or off individually.
+        /// </summary>
+        /// <param name="stopWords">Stop words handed to the <see cref="StopFilter"/>.</param>
+        /// <param name="caseInsensitive">When <c>true</c>, tokens pass through a <see cref="LowerCaseFilter"/>.</param>
+        /// <param name="ignoreLanguageAccents">When <c>true</c>, tokens pass through an <see cref="ASCIIFoldingFilter"/>.</param>
+        public CultureInvariantStandardAnalyzer(CharArraySet stopWords, bool caseInsensitive, bool ignoreLanguageAccents)
+        {
+            _stopWordsSet = stopWords;
+            _caseInsensitive = caseInsensitive;
+            _ignoreLanguageAccents = ignoreLanguageAccents;
+        }
+
         protected override TokenStreamComponents CreateComponents(
             string fieldName,
             TextReader reader)
@@ -37,7 +53,15 @@ protected override TokenStreamComponents CreateComponents(
 
             TokenStream result = new StandardFilter(LuceneInfo.CurrentVersion, tokenizer);
 
-            result = new LowerCaseFilter(LuceneInfo.CurrentVersion, result);
+            if (_caseInsensitive)
+            {
+                result = new LowerCaseFilter(LuceneInfo.CurrentVersion, result);
+            }
+
+            if (_ignoreLanguageAccents)
+            {
+                result = new ASCIIFoldingFilter(result);
+            }
 
             result = new StopFilter(LuceneInfo.CurrentVersion, result, _stopWordsSet);
 
diff --git a/src/Examine.Test/Examine.Lucene/Search/AnalyzerTests.cs b/src/Examine.Test/Examine.Lucene/Search/AnalyzerTests.cs
new file mode 100644
index 000000000..3d8665d17
--- /dev/null
+++ b/src/Examine.Test/Examine.Lucene/Search/AnalyzerTests.cs
@@ -0,0 +1,74 @@
+using Examine.Lucene.Analyzers;
+using Examine.Lucene.Providers;
+using NUnit.Framework;
+
+namespace Examine.Test.Examine.Lucene.Search
+{
+    [TestFixture]
+    public class AnalyzerTests : ExamineBaseTest
+    {
+        [Test]
+        public void Given_CultureInvariantWhitespaceAnalyzer_When_SearchingBothCharVariants_Then_BothAreFound()
+        {
+            var analyzer = new CultureInvariantWhitespaceAnalyzer();
+            using (var luceneDir = new RandomIdRAMDirectory())
+            using (var indexer = GetTestIndex(luceneDir, analyzer))
+            {
+                indexer.IndexItems(new[] {
+                    ValueSet.FromObject(1.ToString(), "content",
+                        new { bodyText = "Something rød something"}),
+                    ValueSet.FromObject(2.ToString(), "content",
+                        new { nodeName = "Something rod something"})
+                    });
+
+                var searcher = (BaseLuceneSearcher)indexer.Searcher;
+
+                var query1 = searcher
+                    .CreateQuery("content")
+                    .Field("bodyText", "rod");
+                var results1 = query1.Execute();
+
+                var query2 = searcher
+                    .CreateQuery("content")
+                    .Field("bodyText", "rød");
+                var results2 = query2.Execute();
+
+                // Only item 1 has a bodyText field, so each spelling must hit exactly once.
+                Assert.AreEqual(1, results1.TotalItemCount);
+                Assert.AreEqual(1, results2.TotalItemCount);
+            }
+        }
+
+        [Test]
+        public void Given_CultureInvariantStandardAnalyzer_When_SearchingBothCharVariants_Then_BothAreFound()
+        {
+            var analyzer = new CultureInvariantStandardAnalyzer();
+            using (var luceneDir = new RandomIdRAMDirectory())
+            using (var indexer = GetTestIndex(luceneDir, analyzer))
+            {
+                indexer.IndexItems(new[] {
+                    ValueSet.FromObject(1.ToString(), "content",
+                        new { bodyText = "Something rød something"}),
+                    ValueSet.FromObject(2.ToString(), "content",
+                        new { nodeName = "Something rod something"})
+                    });
+
+                var searcher = (BaseLuceneSearcher)indexer.Searcher;
+
+                var query1 = searcher
+                    .CreateQuery("content")
+                    .Field("bodyText", "rod");
+                var results1 = query1.Execute();
+
+                var query2 = searcher
+                    .CreateQuery("content")
+                    .Field("bodyText", "rød");
+                var results2 = query2.Execute();
+
+                // Only item 1 has a bodyText field, so each spelling must hit exactly once.
+                Assert.AreEqual(1, results1.TotalItemCount);
+                Assert.AreEqual(1, results2.TotalItemCount);
+            }
+        }
+    }
+}