BUG: Fixed surrogate pair and culture-sensitivity issues with many analyzers. (see #296)
NightOwl888 committed Aug 2, 2020
1 parent 9aea45c commit 3c4cfa4
Showing 15 changed files with 43 additions and 58 deletions.
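Most of the hunks below follow the same pattern: char-based BCL calls such as char.IsLetter((char)c) or char.ToLower((char)c) are replaced with J2N's code-point-aware Character methods, and case conversion gets an explicit CultureInfo instead of relying on the current thread culture. Casting an int code point to char keeps only the low 16 bits, so anything outside the Basic Multilingual Plane is misclassified. A minimal, self-contained sketch of that truncation (the code point is just an illustrative example, not one taken from the commit):

    using System;

    int cp = 0x1D400; // U+1D400 MATHEMATICAL BOLD CAPITAL A, outside the BMP

    // Casting the code point to char keeps only the low 16 bits (a lone surrogate),
    // so the char-based predicate is asked about a different character entirely.
    bool viaCharCast = char.IsLetter((char)cp);                      // false

    // Querying the full code point through its UTF-16 string form classifies
    // the supplementary character correctly.
    bool viaCodePoint = char.IsLetter(char.ConvertFromUtf32(cp), 0); // true

    Console.WriteLine($"char cast: {viaCharCast}, code point: {viaCodePoint}");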
@@ -25,7 +25,7 @@ namespace Lucene.Net.Analysis.Core
/// <summary>
/// A <see cref="LetterTokenizer"/> is a tokenizer that divides text at non-letters. That's to
/// say, it defines tokens as maximal strings of adjacent letters, as defined by
/// <see cref="char.IsLetter(char)"/> predicate.
/// <see cref="Character.IsLetter(int)"/> predicate.
/// <para>
/// Note: this does a decent job for most European languages, but does a terrible
/// job for some Asian languages, where words are not separated by spaces.
@@ -1,4 +1,5 @@
using Lucene.Net.Analysis.Util;
using J2N;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System.IO;

@@ -65,7 +66,7 @@ public WhitespaceTokenizer(LuceneVersion matchVersion, AttributeFactory factory,
/// </summary>
protected override bool IsTokenChar(int c)
{
return !char.IsWhiteSpace((char)c);
return !Character.IsWhiteSpace(c);
}
}
}
@@ -2,6 +2,7 @@
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System.Globalization;

namespace Lucene.Net.Analysis.El
{
@@ -37,6 +38,8 @@ public sealed class GreekLowerCaseFilter : TokenFilter
private readonly ICharTermAttribute termAtt;
private readonly CharacterUtils charUtils;

private static readonly CultureInfo culture = new CultureInfo("el"); // LUCENENET specific - use Greek culture when lowercasing.

/// <summary>
/// Create a <see cref="GreekLowerCaseFilter"/> that normalizes Greek token text.
/// </summary>
@@ -127,7 +130,7 @@ private int LowerCase(int codepoint)
return '\u03C2'; // small final sigma

default:
return Character.ToLower(codepoint);
return Character.ToLower(codepoint, culture); // LUCENENET specific - need to use specific culture to override current thread
}
}
}
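The explicit '\u03C2' branch stays because a plain per-character lowercase always produces the non-final sigma, regardless of culture; only the default branch needs the culture-aware call. A rough BCL-only illustration of that distinction (assuming the "el" culture is available on the machine; this snippet is not code from the commit):

    using System;
    using System.Globalization;

    var el = new CultureInfo("el"); // the Greek culture, mirroring the filter's new static field

    // A blanket lowercase maps capital sigma to the non-final form in every position...
    Console.WriteLine(char.ToLower('Σ', el));  // σ (U+03C3)

    // ...so a sigma that ends a word has to be emitted explicitly as the final form,
    // which is what the switch above does before falling back to Character.ToLower.
    Console.WriteLine('\u03C2');               // ς GREEK SMALL LETTER FINAL SIGMA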
4 changes: 2 additions & 2 deletions src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs
@@ -841,9 +841,9 @@ public virtual bool Stem(int i0)
// ch = buffer[offset++];
// }

// if (char.IsLetter((char)ch))
// if (Character.IsLetter(ch))
// {
// s.Add(char.ToLowerInvariant((char)ch));
// s.Add(Character.ToLower(ch, CultureInfo.InvariantCulture));
// }
// else
// {
@@ -1,5 +1,6 @@
using J2N;
using Lucene.Net.Analysis.TokenAttributes;
using System.Globalization;

namespace Lucene.Net.Analysis.Ga
{
@@ -28,6 +29,8 @@ public sealed class IrishLowerCaseFilter : TokenFilter
{
private readonly ICharTermAttribute termAtt;

private static readonly CultureInfo culture = new CultureInfo("ga"); // LUCENENET specific - use Irish culture when lowercasing.

/// <summary>
/// Create an <see cref="IrishLowerCaseFilter"/> that normalises Irish token text.
/// </summary>
@@ -60,7 +63,7 @@ public override bool IncrementToken()

for (int i = idx; i < chLen;)
{
i += Character.ToChars(Character.ToLower(chArray[i]), chArray, i);
i += Character.ToChars(Character.ToLower(chArray[i], culture), chArray, i); // LUCENENET specific - use Irish culture when lowercasing
}
return true;
}
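Several of these filters (Irish here, Turkish and the Morfologik filter further down) now share the same loop shape: read a whole code point, lowercase it with an explicit culture, and write the resulting UTF-16 units back into the buffer. A sketch of that pattern using the same J2N Character calls the diff uses; the helper name and sample input are made up for illustration:

    using J2N;
    using System.Globalization;

    char[] buf = "SAMPLE".ToCharArray();
    LowerCaseInPlace(buf, buf.Length, new CultureInfo("ga")); // e.g. the Irish culture

    static void LowerCaseInPlace(char[] buffer, int length, CultureInfo culture)
    {
        for (int i = 0; i < length;)
        {
            // Read the full code point (handles surrogate pairs), lowercase it in the
            // requested culture, then advance by however many chars were written back.
            int cp = Character.CodePointAt(buffer, i, length);
            i += Character.ToChars(Character.ToLower(cp, culture), buffer, i);
        }
    }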
@@ -1,4 +1,5 @@
using System.Globalization;
using J2N;
using System.Globalization;

namespace Lucene.Net.Analysis.Miscellaneous
{
@@ -84,17 +85,17 @@ private static byte[] LoadDefaultWordDelimTable() // LUCENENET: Avoid static con
for (int i = 0; i < 256; i++)
{
byte code = 0;
if (char.IsLower((char)i))
if (Character.IsLower(i))
{
code |= (byte)WordDelimiterFilter.LOWER;
code |= WordDelimiterFilter.LOWER;
}
else if (char.IsUpper((char)i))
else if (Character.IsUpper(i))
{
code |= (byte)WordDelimiterFilter.UPPER;
code |= WordDelimiterFilter.UPPER;
}
else if (char.IsDigit((char)i))
else if (Character.IsDigit(i))
{
code |= (byte)WordDelimiterFilter.DIGIT;
code |= WordDelimiterFilter.DIGIT;
}
if (code == 0)
{
@@ -318,7 +319,7 @@ private int CharType(int ch)
/// <returns> Type of the character </returns>
public static byte GetType(int ch)
{
switch (CharUnicodeInfo.GetUnicodeCategory((char)ch))
switch (Character.GetType(ch))
{
case UnicodeCategory.UppercaseLetter:
return WordDelimiterFilter.UPPER;
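GetType previously classified the truncated (char) value, so any code point above U+FFFF was typed by the category of an unrelated BMP character. The replacement, Character.GetType(int), looks up the category of the full code point. The same difference can be seen with the BCL alone (the code point here is only an example, not one from the commit):

    using System;
    using System.Globalization;

    int cp = 0x1D7D8; // U+1D7D8 MATHEMATICAL DOUBLE-STRUCK DIGIT ZERO, outside the BMP

    // The cast keeps only the low 16 bits, so this reports the category of U+D7D8,
    // not of the supplementary digit.
    UnicodeCategory truncated = CharUnicodeInfo.GetUnicodeCategory((char)cp);

    // Looking the code point up through its UTF-16 string form reports the category
    // of the real character: DecimalDigitNumber.
    UnicodeCategory full = CharUnicodeInfo.GetUnicodeCategory(char.ConvertFromUtf32(cp), 0);

    Console.WriteLine($"{truncated} vs {full}");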
@@ -95,7 +95,7 @@ protected override bool IncrementWord()

// find the next set of boundaries, skipping over non-tokens
int end = wordBreaker.Next();
while (end != BreakIterator.Done && !char.IsLetterOrDigit((char)Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
{
start = end;
end = wordBreaker.Next();
@@ -1,5 +1,4 @@
using J2N;
using J2N.Globalization;
using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.Globalization;
@@ -35,12 +34,13 @@ namespace Lucene.Net.Analysis.Tr
public sealed class TurkishLowerCaseFilter : TokenFilter
{
private const int LATIN_CAPITAL_LETTER_I = '\u0049';
private const int LATIN_CAPITAL_LETTER_DOTTED_I = '\u0130';
private const int LATIN_SMALL_LETTER_I = '\u0069';
private const int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
private const int COMBINING_DOT_ABOVE = '\u0307';
private readonly ICharTermAttribute termAtt;

private static readonly CultureInfo culture = new CultureInfo("tr"); // LUCENENET specific - we need to do a culture-sensitive lowercase operation in Turkish

/// <summary>
/// Create a new <see cref="TurkishLowerCaseFilter"/>, that normalizes Turkish token text
/// to lower case.
@@ -64,7 +64,7 @@ public override sealed bool IncrementToken()
{
int ch = Character.CodePointAt(buffer, i, length);

iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && CharUnicodeInfo.GetUnicodeCategory((char)ch) == UnicodeCategory.NonSpacingMark));
iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && Character.GetType(ch) == UnicodeCategory.NonSpacingMark));

if (iOrAfter) // all the special I turkish handling happens here.
{
@@ -93,32 +93,8 @@ public override sealed bool IncrementToken()
}
}

using (var culture = new CultureContext("tr"))
{
switch (ch)
{
// LUCENENET: The .NET char.ToLower() function works correctly in
// Turkish as long as the current thread is set to tr-TR (well, technically the
// culture change is only required for the LATIN_CAPITAL_LETTER_I case). .NET does
// not split these characters into separate letter/non-spacing mark characters,
// but the user might still input them that way so we still need the above
// block to handle that case.
//
// LUCENENET TODO: Oddly, the Character.ToLowerCase() function below does not work right
// for Turkish. Which begs the question, should this special case be there so Turkish works
// everywhere? Or should we leave it a special case here because that is the way it works in Java?
//
// References:
// http://haacked.com/archive/2012/07/05/turkish-i-problem-and-why-you-should-care.aspx/
// http://www.i18nguy.com/unicode/turkish-i18n.html
case LATIN_CAPITAL_LETTER_I:
case LATIN_CAPITAL_LETTER_DOTTED_I:
i += Character.ToChars(char.ToLower((char)ch), buffer, i);
continue;
}
}

i += Character.ToChars(Character.ToLower(ch), buffer, i);
// LUCENENET specific - need to pass Turkish culture to get the correct lowercase results
i += Character.ToChars(Character.ToLower(ch, culture), buffer, i);
}

termAtt.Length = length;
@@ -139,8 +115,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
for (int i = pos; i < len;)
{
int ch = Character.CodePointAt(s, i, len);
//if (char.getType(ch) != char.NON_SPACING_MARK)
if (CharUnicodeInfo.GetUnicodeCategory((char)ch) != UnicodeCategory.NonSpacingMark)
if (Character.GetType(ch) != UnicodeCategory.NonSpacingMark)
{
return false;
}
@@ -161,9 +136,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
private int Delete(char[] s, int pos, int len)
{
if (pos < len)
{
Array.Copy(s, pos + 1, s, pos, len - pos - 1);
}

return len - 1;
}
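The removed block switched the whole thread to the "tr" culture around a char.ToLower call; the replacement simply passes the Turkish culture into Character.ToLower for every code point. The behaviour both versions are after is the Turkish-I rule, which the plain BCL also shows (assuming the "tr" culture is available; this snippet is illustrative, not from the commit):

    using System;
    using System.Globalization;

    var tr = new CultureInfo("tr");

    // Under Turkish casing rules, capital 'I' lowercases to dotless 'ı' and
    // dotted capital 'İ' lowercases to an ordinary 'i'.
    Console.WriteLine("I".ToLower(tr));        // ı (U+0131)
    Console.WriteLine("İ".ToLower(tr));        // i (U+0069)

    // The invariant culture maps 'I' to a plain 'i' instead.
    Console.WriteLine("I".ToLowerInvariant()); // i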
@@ -168,7 +168,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)


/// <summary>
/// Converts each unicode codepoint to lowerCase via <see cref="Character.ToLower(int)"/> starting
/// Converts each unicode codepoint to lowerCase via <see cref="TextInfo.ToLower(string)"/> in the invariant culture starting
/// at the given offset. </summary>
/// <param name="buffer"> the char buffer to lowercase </param>
/// <param name="offset"> the offset to start at </param>
@@ -199,7 +199,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)
}

/// <summary>
/// Converts each unicode codepoint to UpperCase via <see cref="Character.ToUpper(int)"/> starting
/// Converts each unicode codepoint to UpperCase via <see cref="TextInfo.ToUpper(string)"/> in the invariant culture starting
/// at the given offset. </summary>
/// <param name="buffer"> the char buffer to UPPERCASE </param>
/// <param name="offset"> the offset to start at </param>
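The corrected doc comments point at TextInfo.ToLower/ToUpper on the invariant culture, i.e. casing that gives the same answer no matter what culture the indexing thread runs under. A short sketch of that API (not taken from the commit):

    using System;
    using System.Globalization;

    TextInfo invariant = CultureInfo.InvariantCulture.TextInfo;

    // Invariant-culture casing is stable across machines and thread cultures,
    // which is what index-time normalization needs.
    Console.WriteLine(invariant.ToLower("LUCENE.NET")); // lucene.net
    Console.WriteLine(invariant.ToUpper("query"));      // QUERY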
@@ -8,6 +8,7 @@
using Morfologik.Stemming.Polish;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

@@ -55,6 +56,8 @@ public class MorfologikFilter : TokenFilter

private int lemmaListIndex;

private static readonly CultureInfo culture = new CultureInfo("pl"); // LUCENENET specific - do lowercasing in Polish culture

/// <summary>
/// Creates a filter with the default (Polish) dictionary.
/// </summary>
@@ -166,7 +169,7 @@ private string ToLowercase(string chs)
for (int i = 0; i < length;)
{
i += Character.ToChars(
Character.ToLower(Character.CodePointAt(chs, i)), buffer, i);
Character.ToLower(Character.CodePointAt(chs, i), culture), buffer, i); // LUCENENET specific - need to use explicit culture to override current thread
}

return scratch.ToString();
@@ -79,7 +79,7 @@ public virtual void TestRandomRealisticWhiteSpace()
for (int j = 0; j < charArray.Length;)
{
int cp = Character.CodePointAt(charArray, j, charArray.Length);
if (!char.IsWhiteSpace((char)cp))
if (!Character.IsWhiteSpace(cp))
{
sb.AppendCodePoint(cp);
}
@@ -1,6 +1,7 @@
using Lucene.Net.Analysis.TokenAttributes;
using NUnit.Framework;
using System;
using System.Globalization;
using System.IO;

namespace Lucene.Net.Analysis.Payloads
@@ -39,7 +40,7 @@ public virtual void Test()
nptf.Reset();
while (nptf.IncrementToken())
{
assertTrue(typeAtt.Type + " is not null and it should be", typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), StringComparison.Ordinal));
assertTrue(typeAtt.Type + " is not null and it should be", typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), StringComparison.Ordinal)); // LUCENENET specific - intentionally using current culture
assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.Payload != null);
string type = payloadAtt.Payload.Utf8ToString();
assertTrue(type + " is not equal to " + typeAtt.Type, type.Equals(typeAtt.Type, StringComparison.Ordinal));
@@ -67,7 +68,7 @@ public override bool IncrementToken()
{
if (m_input.IncrementToken())
{
typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString();
typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString(); // LUCENENET specific - intentionally using current culture
return true;
}
else
@@ -213,7 +213,7 @@ public override object Format(Passage[] passages, String content)
assertEquals(matchStart + 1, matchEnd);
// and the offsets must be correct...
assertEquals(1, term.Length);
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart]));
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart], CultureInfo.InvariantCulture));
}
// record just the start/end offset for simplicity
seen.Add(new Pair(p.StartOffset, p.EndOffset));
2 changes: 1 addition & 1 deletion src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs
@@ -110,7 +110,7 @@ public void Test_Dictionary_Set_Null()
public void Test_Dictionary_AddReplace()
{
string key = "A";
string key2 = "a".ToUpper();
string key2 = "a".ToUpperInvariant();

dictionary.Add(key, "value");
dictionary[key2] = "value2";
@@ -211,7 +211,7 @@ public override object Format(Passage[] passages, String content)
assertEquals(matchStart + 1, matchEnd);
// and the offsets must be correct...
assertEquals(1, term.Length);
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart]));
assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart], CultureInfo.InvariantCulture)); // LUCENENET specific - need to use invariant culture to match Java
}
// record just the start/end offset for simplicity
seen.Add(new Pair(p.StartOffset, p.EndOffset));