Skip to content

Commit

Permalink
React to CheckForOverflowUnderflow in regex source generator (#78228)
Browse files Browse the repository at this point in the history
* React to CheckForOverflowUnderflow in regex source generator

The regex source generator uses code patterns that might have arithmetic overflows, e.g. a bounds check with `(uint)index < span.Length`.  These are intentional, and they're benign... unless the project/compilation has opted-in to overflow/underflow checking (CheckForOverflowUnderflow).  In that case, the code for many patterns can start throwing false positive overflow exceptions, making the source generator unusable.

This commit causes the generator to look at the CheckOverflow setting in the compilation options, and if it's set, to emit `unchecked { ... }` around all the relevant code.

* Address PR feedback
  • Loading branch information
stephentoub authored Nov 11, 2022
1 parent 484938a commit 3e94fdc
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,27 @@ static void AppendHashtableContents(IndentedTextWriter writer, IEnumerable<Dicti
}

/// <summary>Emits the code for the RunnerFactory. This is the actual logic for the regular expression.</summary>
private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers)
private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
void EnterCheckOverflow()
{
if (checkOverflow)
{
writer.WriteLine($"unchecked");
writer.WriteLine($"{{");
writer.Indent++;
}
}

void ExitCheckOverflow()
{
if (checkOverflow)
{
writer.Indent--;
writer.WriteLine($"}}");
}
}

writer.WriteLine($"/// <summary>Provides a factory for creating <see cref=\"RegexRunner\"/> instances to be used by methods on <see cref=\"Regex\"/>.</summary>");
writer.WriteLine($"private sealed class RunnerFactory : RegexRunnerFactory");
writer.WriteLine($"{{");
Expand Down Expand Up @@ -208,7 +227,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer,
writer.WriteLine($" protected override void Scan(ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 3;
EnterCheckOverflow();
(bool needsTryFind, bool needsTryMatch) = EmitScan(writer, rm);
ExitCheckOverflow();
writer.Indent -= 3;
writer.WriteLine($" }}");
if (needsTryFind)
Expand All @@ -220,7 +241,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer,
writer.WriteLine($" private bool TryFindNextPossibleStartingPosition(ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 3;
EnterCheckOverflow();
EmitTryFindNextPossibleStartingPosition(writer, rm, requiredHelpers);
ExitCheckOverflow();
writer.Indent -= 3;
writer.WriteLine($" }}");
}
Expand All @@ -233,7 +256,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer,
writer.WriteLine($" private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 3;
EmitTryMatchAtCurrentPosition(writer, rm, requiredHelpers);
EnterCheckOverflow();
EmitTryMatchAtCurrentPosition(writer, rm, requiredHelpers, checkOverflow);
ExitCheckOverflow();
writer.Indent -= 3;
writer.WriteLine($" }}");
}
Expand Down Expand Up @@ -288,51 +313,52 @@ private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHel
}

/// <summary>Adds the IsBoundary helper to the required helpers collection.</summary>
private static void AddIsBoundaryHelper(Dictionary<string, string[]> requiredHelpers)
private static void AddIsBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
const string IsBoundary = nameof(IsBoundary);
if (!requiredHelpers.ContainsKey(IsBoundary))
{
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
requiredHelpers.Add(IsBoundary, new string[]
{
"/// <summary>Determines whether the specified index is a boundary.</summary>",
"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
"internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)",
"{",
" int indexMinus1 = index - 1;",
" return ((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=",
" ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));",
"",
" static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');",
"}",
$"/// <summary>Determines whether the specified index is a boundary.</summary>",
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
$"internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)",
$"{{",
$" int indexMinus1 = index - 1;",
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=",
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));",
$"",
$" static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');",
$"}}",
});

AddIsWordCharHelper(requiredHelpers);
}
}

/// <summary>Adds the IsECMABoundary helper to the required helpers collection.</summary>
private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> requiredHelpers)
private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
const string IsECMABoundary = nameof(IsECMABoundary);
if (!requiredHelpers.ContainsKey(IsECMABoundary))
{
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
requiredHelpers.Add(IsECMABoundary, new string[]
{
"/// <summary>Determines whether the specified index is a boundary (ECMAScript).</summary>",
"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
"internal static bool IsECMABoundary(ReadOnlySpan<char> inputSpan, int index)",
"{",
" int indexMinus1 = index - 1;",
" return ((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=",
" ((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));",
"",
" static bool IsECMAWordChar(char ch) =>",
" ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter",
" (((uint)ch - '0') < 10) || // digit",
" ch == '_' || // underscore",
" ch == '\\u0130'; // latin capital letter I with dot above",
"}",
$"/// <summary>Determines whether the specified index is a boundary (ECMAScript).</summary>",
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
$"internal static bool IsECMABoundary(ReadOnlySpan<char> inputSpan, int index)",
$"{{",
$" int indexMinus1 = index - 1;",
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=",
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));",
$"",
$" static bool IsECMAWordChar(char ch) =>",
$" char.IsAsciiLetterOrDigit(ch) ||",
$" ch == '_' ||",
$" ch == '\\u0130'; // latin capital letter I with dot above",
$"}}",
});
}
}
Expand Down Expand Up @@ -426,7 +452,7 @@ FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start or
/// <summary>Emits the body of the TryFindNextPossibleStartingPosition.</summary>
private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers)
{
RegexOptions options = (RegexOptions)rm.Options;
RegexOptions options = rm.Options;
RegexTree regexTree = rm.Tree;
bool rtl = (options & RegexOptions.RightToLeft) != 0;

Expand Down Expand Up @@ -1034,7 +1060,7 @@ void EmitLiteralAfterAtomicLoop()
}

/// <summary>Emits the body of the TryMatchAtCurrentPosition.</summary>
private static void EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers)
private static void EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
{
// In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled
// version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via
Expand Down Expand Up @@ -2677,14 +2703,14 @@ void EmitBoundary(RegexNode node)
call = node.Kind is RegexNodeKind.Boundary ?
$"!{HelpersTypeName}.IsBoundary" :
$"{HelpersTypeName}.IsBoundary";
AddIsBoundaryHelper(requiredHelpers);
AddIsBoundaryHelper(requiredHelpers, checkOverflow);
}
else
{
call = node.Kind is RegexNodeKind.ECMABoundary ?
$"!{HelpersTypeName}.IsECMABoundary" :
$"{HelpersTypeName}.IsECMABoundary";
AddIsECMABoundaryHelper(requiredHelpers);
AddIsECMABoundaryHelper(requiredHelpers, checkOverflow);
}

using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))"))
Expand Down
41 changes: 23 additions & 18 deletions src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,24 @@ public partial class RegexGenerator : IIncrementalGenerator
"#pragma warning disable CS0219 // Variable assigned but never used",
};

private record struct CompilationData(bool AllowUnsafe, bool CheckOverflow);

public void Initialize(IncrementalGeneratorInitializationContext context)
{
// To avoid invalidating every regex's output when anything from the compilation changes,
// we extract from it the only things we care about.
IncrementalValueProvider<CompilationData> compilationDataProvider =
context.CompilationProvider
.Select((x, _) =>
x.Options is CSharpCompilationOptions options ?
new CompilationData(options.AllowUnsafe, options.CheckOverflow) :
default);

// Produces one entry per generated regex. This may be:
// - Diagnostic in the case of a failure that should end the compilation
// - (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers) in the case of valid regex
// - (RegexMethod regexMethod, string reason, Diagnostic diagnostic) in the case of a limited-support regex
IncrementalValueProvider<ImmutableArray<object>> codeOrDiagnostics =
IncrementalValueProvider<ImmutableArray<object>> results =
context.SyntaxProvider

// Find all MethodDeclarationSyntax nodes attributed with GeneratedRegex and gather the required information.
Expand All @@ -52,9 +63,13 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
GetSemanticTargetForGeneration)
.Where(static m => m is not null)

// Incorporate the compilation data, as it impacts code generation.
.Combine(compilationDataProvider)

// Generate the RunnerFactory for each regex, if possible. This is where the bulk of the implementation occurs.
.Select((state, _) =>
.Select((methodOrDiagnosticAndCompilationData, _) =>
{
object? state = methodOrDiagnosticAndCompilationData.Left;
if (state is not RegexMethod regexMethod)
{
Debug.Assert(state is Diagnostic);
Expand All @@ -74,26 +89,16 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
var writer = new IndentedTextWriter(sw);
writer.Indent += 2;
writer.WriteLine();
EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers);
EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers, methodOrDiagnosticAndCompilationData.Right.CheckOverflow);
writer.Indent -= 2;
return (regexMethod, sw.ToString(), requiredHelpers);
return (regexMethod, sw.ToString(), requiredHelpers, methodOrDiagnosticAndCompilationData.Right);
})
.Collect();

// To avoid invalidating every regex's output when anything from the compilation changes,
// we extract from it the only things we care about: whether unsafe code is allowed,
// and a name based on the assembly's name, and only that information is then fed into
// RegisterSourceOutput along with all of the cached generated data from each regex.
IncrementalValueProvider<(bool AllowUnsafe, string? AssemblyName)> compilationDataProvider =
context.CompilationProvider
.Select((x, _) => (x.Options is CSharpCompilationOptions { AllowUnsafe: true }, x.AssemblyName));

// When there something to output, take all the generated strings and concatenate them to output,
// and raise all of the created diagnostics.
context.RegisterSourceOutput(codeOrDiagnostics.Combine(compilationDataProvider), static (context, compilationDataAndResults) =>
context.RegisterSourceOutput(results, static (context, results) =>
{
ImmutableArray<object> results = compilationDataAndResults.Left;

// Report any top-level diagnostics.
bool allFailures = true;
foreach (object result in results)
Expand Down Expand Up @@ -150,7 +155,7 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
context.ReportDiagnostic(limitedSupportResult.Item3);
regexMethod = limitedSupportResult.Item1;
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
{
foreach (KeyValuePair<string, string[]> helper in regexImpl.Item3)
{
Expand Down Expand Up @@ -214,11 +219,11 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
writer.WriteLine();
}
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
{
if (!regexImpl.Item1.IsDuplicate)
{
EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, compilationDataAndResults.Right.AllowUnsafe);
EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, regexImpl.Item4.AllowUnsafe);
writer.WriteLine();
}
}
Expand Down
Loading

0 comments on commit 3e94fdc

Please sign in to comment.