Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

React to CheckForOverflowUnderflow in regex source generator #78228

Merged
merged 2 commits into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,27 @@ static void AppendHashtableContents(IndentedTextWriter writer, IEnumerable<Dicti
}

/// <summary>Emits the code for the RunnerFactory. This is the actual logic for the regular expression.</summary>
private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers)
private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
void EnterCheckOverflow()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: no need to reset CI for this, but I think it is a bit more readable in general when we have local funcs at the end of the method (even better if they are after a return statement)

{
if (checkOverflow)
{
writer.WriteLine($"unchecked");
writer.WriteLine($"{{");
writer.Indent++;
}
}

void ExitCheckOverflow()
{
if (checkOverflow)
{
writer.Indent--;
writer.WriteLine($"}}");
}
}

writer.WriteLine($"/// <summary>Provides a factory for creating <see cref=\"RegexRunner\"/> instances to be used by methods on <see cref=\"Regex\"/>.</summary>");
writer.WriteLine($"private sealed class RunnerFactory : RegexRunnerFactory");
writer.WriteLine($"{{");
Expand Down Expand Up @@ -208,7 +227,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer,
writer.WriteLine($" protected override void Scan(ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 3;
EnterCheckOverflow();
(bool needsTryFind, bool needsTryMatch) = EmitScan(writer, rm);
ExitCheckOverflow();
writer.Indent -= 3;
writer.WriteLine($" }}");
if (needsTryFind)
Expand All @@ -220,7 +241,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer,
writer.WriteLine($" private bool TryFindNextPossibleStartingPosition(ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 3;
EnterCheckOverflow();
EmitTryFindNextPossibleStartingPosition(writer, rm, requiredHelpers);
ExitCheckOverflow();
writer.Indent -= 3;
writer.WriteLine($" }}");
}
Expand All @@ -233,7 +256,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer,
writer.WriteLine($" private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 3;
EmitTryMatchAtCurrentPosition(writer, rm, requiredHelpers);
EnterCheckOverflow();
EmitTryMatchAtCurrentPosition(writer, rm, requiredHelpers, checkOverflow);
ExitCheckOverflow();
writer.Indent -= 3;
writer.WriteLine($" }}");
}
Expand Down Expand Up @@ -288,51 +313,52 @@ private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHel
}

/// <summary>Adds the IsBoundary helper to the required helpers collection.</summary>
private static void AddIsBoundaryHelper(Dictionary<string, string[]> requiredHelpers)
private static void AddIsBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
const string IsBoundary = nameof(IsBoundary);
if (!requiredHelpers.ContainsKey(IsBoundary))
{
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
requiredHelpers.Add(IsBoundary, new string[]
{
"/// <summary>Determines whether the specified index is a boundary.</summary>",
"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
"internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)",
"{",
" int indexMinus1 = index - 1;",
" return ((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=",
" ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));",
"",
" static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');",
"}",
$"/// <summary>Determines whether the specified index is a boundary.</summary>",
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
$"internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)",
$"{{",
$" int indexMinus1 = index - 1;",
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=",
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));",
$"",
$" static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');",
$"}}",
});

AddIsWordCharHelper(requiredHelpers);
}
}

/// <summary>Adds the IsECMABoundary helper to the required helpers collection.</summary>
private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> requiredHelpers)
private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
const string IsECMABoundary = nameof(IsECMABoundary);
if (!requiredHelpers.ContainsKey(IsECMABoundary))
{
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
requiredHelpers.Add(IsECMABoundary, new string[]
{
"/// <summary>Determines whether the specified index is a boundary (ECMAScript).</summary>",
"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
"internal static bool IsECMABoundary(ReadOnlySpan<char> inputSpan, int index)",
"{",
" int indexMinus1 = index - 1;",
" return ((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=",
" ((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));",
"",
" static bool IsECMAWordChar(char ch) =>",
" ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter",
" (((uint)ch - '0') < 10) || // digit",
" ch == '_' || // underscore",
" ch == '\\u0130'; // latin capital letter I with dot above",
"}",
$"/// <summary>Determines whether the specified index is a boundary (ECMAScript).</summary>",
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
$"internal static bool IsECMABoundary(ReadOnlySpan<char> inputSpan, int index)",
$"{{",
$" int indexMinus1 = index - 1;",
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=",
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));",
$"",
$" static bool IsECMAWordChar(char ch) =>",
$" char.IsAsciiLetterOrDigit(ch) ||",
$" ch == '_' ||",
$" ch == '\\u0130'; // latin capital letter I with dot above",
$"}}",
});
}
}
Expand Down Expand Up @@ -426,7 +452,7 @@ FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start or
/// <summary>Emits the body of the TryFindNextPossibleStartingPosition.</summary>
private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers)
{
RegexOptions options = (RegexOptions)rm.Options;
RegexOptions options = rm.Options;
RegexTree regexTree = rm.Tree;
bool rtl = (options & RegexOptions.RightToLeft) != 0;

Expand Down Expand Up @@ -1034,7 +1060,7 @@ void EmitLiteralAfterAtomicLoop()
}

/// <summary>Emits the body of the TryMatchAtCurrentPosition.</summary>
private static void EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers)
private static void EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
// In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled
// version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via
Expand Down Expand Up @@ -2677,14 +2703,14 @@ void EmitBoundary(RegexNode node)
call = node.Kind is RegexNodeKind.Boundary ?
$"!{HelpersTypeName}.IsBoundary" :
$"{HelpersTypeName}.IsBoundary";
AddIsBoundaryHelper(requiredHelpers);
AddIsBoundaryHelper(requiredHelpers, checkOverflow);
}
else
{
call = node.Kind is RegexNodeKind.ECMABoundary ?
$"!{HelpersTypeName}.IsECMABoundary" :
$"{HelpersTypeName}.IsECMABoundary";
AddIsECMABoundaryHelper(requiredHelpers);
AddIsECMABoundaryHelper(requiredHelpers, checkOverflow);
}

using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))"))
Expand Down
41 changes: 23 additions & 18 deletions src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,24 @@ public partial class RegexGenerator : IIncrementalGenerator
"#pragma warning disable CS0219 // Variable assigned but never used",
};

private record struct CompilationData(bool AllowUnsafe, bool CheckOverflow);

public void Initialize(IncrementalGeneratorInitializationContext context)
{
// To avoid invalidating every regex's output when anything from the compilation changes,
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
// we extract from it the only things we care about.
IncrementalValueProvider<CompilationData> compilationDataProvider =
context.CompilationProvider
.Select((x, _) =>
x.Options is CSharpCompilationOptions options ?
new CompilationData(options.AllowUnsafe, options.CheckOverflow) :
default);

// Produces one entry per generated regex. This may be:
// - Diagnostic in the case of a failure that should end the compilation
// - (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers) in the case of valid regex
// - (RegexMethod regexMethod, string reason, Diagnostic diagnostic) in the case of a limited-support regex
IncrementalValueProvider<ImmutableArray<object>> codeOrDiagnostics =
IncrementalValueProvider<ImmutableArray<object>> results =
context.SyntaxProvider

// Find all MethodDeclarationSyntax nodes attributed with GeneratedRegex and gather the required information.
Expand All @@ -52,9 +63,13 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
GetSemanticTargetForGeneration)
.Where(static m => m is not null)

// Incorporate the compilation data, as it impacts code generation.
.Combine(compilationDataProvider)

// Generate the RunnerFactory for each regex, if possible. This is where the bulk of the implementation occurs.
.Select((state, _) =>
.Select((methodOrDiagnosticAndCompilationData, _) =>
{
object? state = methodOrDiagnosticAndCompilationData.Left;
if (state is not RegexMethod regexMethod)
{
Debug.Assert(state is Diagnostic);
Expand All @@ -74,26 +89,16 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
var writer = new IndentedTextWriter(sw);
writer.Indent += 2;
writer.WriteLine();
EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers);
EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers, methodOrDiagnosticAndCompilationData.Right.CheckOverflow);
writer.Indent -= 2;
return (regexMethod, sw.ToString(), requiredHelpers);
return (regexMethod, sw.ToString(), requiredHelpers, methodOrDiagnosticAndCompilationData.Right);
})
.Collect();

// To avoid invalidating every regex's output when anything from the compilation changes,
// we extract from it the only things we care about: whether unsafe code is allowed,
// and a name based on the assembly's name, and only that information is then fed into
// RegisterSourceOutput along with all of the cached generated data from each regex.
IncrementalValueProvider<(bool AllowUnsafe, string? AssemblyName)> compilationDataProvider =
context.CompilationProvider
.Select((x, _) => (x.Options is CSharpCompilationOptions { AllowUnsafe: true }, x.AssemblyName));

// When there something to output, take all the generated strings and concatenate them to output,
// and raise all of the created diagnostics.
context.RegisterSourceOutput(codeOrDiagnostics.Combine(compilationDataProvider), static (context, compilationDataAndResults) =>
context.RegisterSourceOutput(results, static (context, results) =>
{
ImmutableArray<object> results = compilationDataAndResults.Left;

// Report any top-level diagnostics.
bool allFailures = true;
foreach (object result in results)
Expand Down Expand Up @@ -150,7 +155,7 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
context.ReportDiagnostic(limitedSupportResult.Item3);
regexMethod = limitedSupportResult.Item1;
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
{
foreach (KeyValuePair<string, string[]> helper in regexImpl.Item3)
{
Expand Down Expand Up @@ -214,11 +219,11 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
writer.WriteLine();
}
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
{
if (!regexImpl.Item1.IsDuplicate)
{
EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, compilationDataAndResults.Right.AllowUnsafe);
EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, regexImpl.Item4.AllowUnsafe);
writer.WriteLine();
}
}
Expand Down
Loading