Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add numerical ordering option for string comparison operations #109861

Merged
merged 9 commits into from
Nov 25, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,9 @@ public static string GetDistroVersionString()
public static bool IsNotHybridGlobalization => !IsHybridGlobalization;
public static bool IsNotHybridGlobalizationOnApplePlatform => !IsHybridGlobalizationOnApplePlatform;

// Numeric comparisons are reported as supported everywhere except when hybrid globalization
// is active on an Apple platform (this is simply the negation of IsHybridGlobalizationOnApplePlatform).
// This can be removed once numeric comparisons are supported on Apple platforms
public static bool IsNumericComparisonSupported => !IsHybridGlobalizationOnApplePlatform;

// HG on apple platforms implies ICU
public static bool IsIcuGlobalization => !IsInvariantGlobalization && (IsHybridGlobalizationOnApplePlatform || ICUVersion > new Version(0, 0, 0, 0));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,11 @@ private unsafe SortKey IcuCreateSortKey(string source, CompareOptions options)
throw new PlatformNotSupportedException(GetPNSEWithReason("CreateSortKey", "non-invariant culture"));
return InvariantCreateSortKey(source, options);
}
#elif TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
PranavSenthilnathan marked this conversation as resolved.
Show resolved Hide resolved
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

if ((options & ValidCompareMaskOffFlags) != 0)
Expand Down Expand Up @@ -776,6 +781,11 @@ private unsafe int IcuGetSortKey(ReadOnlySpan<char> source, Span<byte> destinati
throw new PlatformNotSupportedException(GetPNSEWithReason("GetSortKey", "non-invariant culture"));
return InvariantGetSortKey(source, destination, options);
}
#elif TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

// It's ok to pass nullptr (for empty buffers) to ICU's sort key routines.
Expand Down Expand Up @@ -827,6 +837,11 @@ private unsafe int IcuGetSortKeyLength(ReadOnlySpan<char> source, CompareOptions
throw new PlatformNotSupportedException(GetPNSEWithReason("GetSortKeyLength", "non-invariant culture"));
return InvariantGetSortKeyLength(source, options);
}
#elif TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

// It's ok to pass nullptr (for empty buffers) to ICU's sort key routines.
Expand Down Expand Up @@ -889,6 +904,11 @@ private unsafe int IcuGetHashCodeOfString(ReadOnlySpan<char> source, CompareOpti
ReadOnlySpan<char> sanitizedSource = SanitizeForInvariantHash(source, options);
return InvariantGetHashCode(sanitizedSource, options);
}
#elif TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

// according to ICU User Guide the performance of ucol_getSortKey is worse when it is called with null output buffer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ private static unsafe bool NlsIsSortable(ReadOnlySpan<char> text)
private const int NORM_IGNOREWIDTH = 0x00020000; // Does not differentiate between a single-byte character and the same character as a double-byte character.
private const int NORM_LINGUISTIC_CASING = 0x08000000; // use linguistic rules for casing
private const int SORT_STRINGSORT = 0x00001000; // Treats punctuation the same as symbols.
private const int SORT_DIGITSASNUMBERS = 0x00000008; // Treat digits as numbers during sorting, for example, sort "2" before "10".

private static int GetNativeCompareFlags(CompareOptions options)
{
Expand All @@ -595,6 +596,7 @@ private static int GetNativeCompareFlags(CompareOptions options)
if ((options & CompareOptions.IgnoreSymbols) != 0) { nativeCompareFlags |= NORM_IGNORESYMBOLS; }
if ((options & CompareOptions.IgnoreWidth) != 0) { nativeCompareFlags |= NORM_IGNOREWIDTH; }
if ((options & CompareOptions.StringSort) != 0) { nativeCompareFlags |= SORT_STRINGSORT; }
if ((options & CompareOptions.NumericOrdering) != 0) { nativeCompareFlags |= SORT_DIGITSASNUMBERS; }

// TODO: Can we try for GetNativeCompareFlags to never
// take Ordinal or OrdinalIgnoreCase. This value is not part of Win32, we just handle it special
Expand All @@ -607,6 +609,7 @@ private static int GetNativeCompareFlags(CompareOptions options)
CompareOptions.IgnoreNonSpace |
CompareOptions.IgnoreSymbols |
CompareOptions.IgnoreWidth |
CompareOptions.NumericOrdering |
CompareOptions.StringSort)) == 0) ||
(options == CompareOptions.Ordinal), "[CompareInfo.GetNativeCompareFlags]Expected all flags to be handled");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,19 +175,19 @@ private ReadOnlySpan<char> SanitizeForInvariantHash(ReadOnlySpan<char> source, C
}

private static bool IndexingOptionsNotSupported(CompareOptions options) =>
(options & CompareOptions.IgnoreSymbols) == CompareOptions.IgnoreSymbols;
(options & (CompareOptions.IgnoreSymbols | CompareOptions.NumericOrdering)) != 0;

private static bool CompareOptionsNotSupported(CompareOptions options) =>
(options & CompareOptions.IgnoreWidth) == CompareOptions.IgnoreWidth ||
((options & CompareOptions.IgnoreNonSpace) == CompareOptions.IgnoreNonSpace && (options & CompareOptions.IgnoreKanaType) != CompareOptions.IgnoreKanaType);
((options & CompareOptions.IgnoreNonSpace) == CompareOptions.IgnoreNonSpace && (options & CompareOptions.IgnoreKanaType) == 0);

private static string GetPNSE(CompareOptions options) =>
SR.Format(SR.PlatformNotSupported_HybridGlobalizationWithCompareOptions, options);

private static bool CompareOptionsNotSupportedForCulture(CompareOptions options, string cultureName) =>
(options == CompareOptions.IgnoreKanaType &&
((options & ~CompareOptions.NumericOrdering) == CompareOptions.IgnoreKanaType &&
(string.IsNullOrEmpty(cultureName) || cultureName.Split('-')[0] != "ja")) ||
(options == CompareOptions.None &&
((options & ~CompareOptions.NumericOrdering) == CompareOptions.None &&
(cultureName.Split('-')[0] == "ja"));

private static string GetPNSEForCulture(CompareOptions options, string cultureName) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ public sealed partial class CompareInfo : IDeserializationCallback
// Mask used to check if Compare() / GetHashCode(string) / GetSortKey has the right flags.
private const CompareOptions ValidCompareMaskOffFlags =
~(CompareOptions.IgnoreCase | CompareOptions.IgnoreSymbols | CompareOptions.IgnoreNonSpace |
CompareOptions.IgnoreWidth | CompareOptions.IgnoreKanaType | CompareOptions.StringSort);
CompareOptions.IgnoreWidth | CompareOptions.IgnoreKanaType | CompareOptions.StringSort |
CompareOptions.NumericOrdering);

// Cache the invariant CompareInfo
internal static readonly CompareInfo Invariant = CultureInfo.InvariantCulture.CompareInfo;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,75 @@

namespace System.Globalization
{
/// <summary>
/// Defines the string comparison options to use with <see cref="CompareInfo"/>.
/// </summary>
[Flags]
public enum CompareOptions
{
/// <summary>
/// Indicates the default option settings for string comparisons
/// </summary>
None = 0x00000000,

/// <summary>
/// Indicates that the string comparison must ignore case.
/// </summary>
IgnoreCase = 0x00000001,

/// <summary>
/// Indicates that the string comparison must ignore nonspacing combining characters, such as diacritics.
/// The <see href="https://go.microsoft.com/fwlink/?linkid=37123">Unicode Standard</see> defines combining characters as
/// characters that are combined with base characters to produce a new character. Nonspacing combining characters do not
/// occupy a spacing position by themselves when rendered.
/// </summary>
IgnoreNonSpace = 0x00000002,

/// <summary>
/// Indicates that the string comparison must ignore symbols, such as white-space characters, punctuation, currency symbols,
/// the percent sign, mathematical symbols, the ampersand, and so on.
/// </summary>
IgnoreSymbols = 0x00000004,

/// <summary>
/// Indicates that the string comparison must ignore the Kana type. Kana type refers to Japanese hiragana and katakana characters, which represent phonetic sounds in the Japanese language.
/// Hiragana is used for native Japanese expressions and words, while katakana is used for words borrowed from other languages, such as "computer" or "Internet".
/// A phonetic sound can be expressed in both hiragana and katakana. If this value is selected, the hiragana character for one sound is considered equal to the katakana character for the same sound.
/// </summary>
IgnoreKanaType = 0x00000008,

/// <summary>
/// Indicates that the string comparison must ignore the character width. For example, Japanese katakana characters can be written as full-width or half-width.
/// If this value is selected, the katakana characters written as full-width are considered equal to the same characters written as half-width.
/// </summary>
IgnoreWidth = 0x00000010,

/// <summary>
/// Indicates that the string comparison must sort sequences of digits (Unicode general category "Nd") based on their numeric value.
/// For example, "2" comes before "10". Non-digit characters such as decimal points, minus or plus signs, etc.
/// are not considered as part of the sequence and will terminate it. This flag is not valid for indexing
/// (such as <see cref="CompareInfo.IndexOf(string, string, CompareOptions)"/>, <see cref="CompareInfo.IsPrefix(string, string, CompareOptions)"/>, etc.).
/// </summary>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

///

It would be good to add a remarks section to this one giving more information, e.g. that this option can be used in comparisons but not for search (IndexOf/StartsWith/EndsWith). It would also be good to hint at the behavior difference between ICU and NLS. And lastly, mention that this option cannot be combined with the ordinal operations.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment was getting long, so I included just the part about indexing. Mentioning search might be confusing because people might consider IndexOf (which isn't supported) as search and GetHashCode (which is supported) as not. I think it's easier to just say that NumericOrdering works in all cases except for indexing.

I prefer to keep the combination behavior with Ordinal and OrdinalIgnoreCase on those members since this is really a property of theirs instead of numeric ordering. This is also consistent with the other options as well which don't mention Ordinal even though they can't combine with them either.

I think the ICU and NLS differences should probably go in docs rather than in doc comments since NLS usage is not going to be high. There are already docs about NLS/ICU differences that we can append to (https://learn.microsoft.com/en-us/dotnet/core/extensions/globalization-icu#behavioral-differences).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, if you prefer that, it will be better to edit the docs of the indexing APIs and add the remark there. I am trying to make it easy for the API users to understand when this new enum value is not allowed. I guess users can be puzzled if they get exceptions and do not understand what is wrong.

You don't have to block the PR on that, but it will be good to open a doc issue/PR to add the info as needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that idea. I just checked and we already list the valid options so I will update those docs once this goes in:
Compare
IndexOf

NumericOrdering = 0x00000020,

/// <summary>
/// String comparison must ignore case, then perform an ordinal comparison. This technique is equivalent to
/// converting the string to uppercase using the invariant culture and then performing an ordinal comparison on the result.
/// This value cannot be combined with other <see cref="CompareOptions" /> values and must be used alone.
/// </summary>
OrdinalIgnoreCase = 0x10000000, // This flag can not be used with other flags.

/// <summary>
/// Indicates that the string comparison must use the string sort algorithm. In a string sort, the hyphen and the apostrophe,
/// as well as other nonalphanumeric symbols, come before alphanumeric characters.
/// </summary>
StringSort = 0x20000000,

/// <summary>
/// Indicates that the string comparison must use successive Unicode UTF-16 encoded values of the string (code unit by code unit comparison),
/// leading to a fast comparison but one that is culture-insensitive. A string starting with a code unit XXXX16 comes before a string starting with YYYY16,
/// if XXXX16 is less than YYYY16. This value cannot be combined with other <see cref="CompareOptions" /> values and must be used alone.
/// </summary>
Ordinal = 0x40000000, // This flag can not be used with other flags.
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,9 @@ public sealed class CultureAwareComparer : StringComparer, IAlternateEqualityCom
internal static readonly CultureAwareComparer InvariantCaseSensitiveInstance = new CultureAwareComparer(CompareInfo.Invariant, CompareOptions.None);
internal static readonly CultureAwareComparer InvariantIgnoreCaseInstance = new CultureAwareComparer(CompareInfo.Invariant, CompareOptions.IgnoreCase);

private const CompareOptions ValidCompareMaskOffFlags = ~(CompareOptions.IgnoreCase | CompareOptions.IgnoreSymbols | CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreWidth | CompareOptions.IgnoreKanaType | CompareOptions.StringSort);
private const CompareOptions ValidCompareMaskOffFlags =
~(CompareOptions.IgnoreCase | CompareOptions.IgnoreSymbols | CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreKanaType |
CompareOptions.IgnoreWidth | CompareOptions.NumericOrdering | CompareOptions.StringSort);

private readonly CompareInfo _compareInfo; // Do not rename (binary serialization)
private readonly CompareOptions _options;
Expand Down
1 change: 1 addition & 0 deletions src/libraries/System.Runtime/ref/System.Runtime.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9186,6 +9186,7 @@ public enum CompareOptions
IgnoreSymbols = 4,
IgnoreKanaType = 8,
IgnoreWidth = 16,
NumericOrdering = 32,
OrdinalIgnoreCase = 268435456,
StringSort = 536870912,
Ordinal = 1073741824,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: should we change this to use hex numbers for the sake of the readability?

Copy link
Member Author

@PranavSenthilnathan PranavSenthilnathan Nov 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how the GenAPI tool generates it and the guidance I've heard is to make as few diffs from that tool as possible.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but still can make diffs :-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ref assembly isn’t really designed for readability, it’s designed to be correct and automatically generated

Deviations and manual diffs just cause later downstream pain and hinder the ability to rerun the tool, as people have to fight against it

The better option would be to submit a bug or, better yet, a patch so that the output produced by the tool for flags-enabled enums is “better”, such as using hex or logical shifts to represent the bits instead (1 << 0, 1 << 1, 1 << 2, etc.).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've created an issue: dotnet/sdk#44999. GenAPI seems to be in a kind of limbo state where there is a new Roslyn based version we want to switch to (tracked by dotnet/sdk#31088) and we don't want to maintain the current CCI-based one (https://github.com/dotnet/arcade/blob/main/src/Microsoft.DotNet.GenAPI/README.md).

Expand Down
Loading
Loading