From 727c872f0c7ecbb7ed0bd66363fc01d1b8e921ce Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 11 Nov 2022 11:45:16 -0500 Subject: [PATCH 1/2] React to CheckForOverflowUnderflow in regex source generator The regex source generator uses code patterns that might have arithmetic overflows, e.g. a bounds check with `(uint)index < span.Length`. These are intentional, and they're benign... unless the project/compilation has opted-in to overflow/underflow checking (CheckForOverflowUnderflow). In that case, the code for many patterns can start throwing false positive overflow exceptions, making the source generator unusable. This commit causes the generator to look at the CheckOverflow setting in the compilation options, and if it's set, to emit `unchecked { ... }` around all the relevant code. --- .../gen/RegexGenerator.Emitter.cs | 90 ++++++++++++------- .../gen/RegexGenerator.cs | 41 +++++---- .../RegexGeneratorHelper.netcoreapp.cs | 13 +-- .../RegexGeneratorParserTests.cs | 27 +++--- 4 files changed, 105 insertions(+), 66 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 9e08b3ee1d28a..c54d27966b4ae 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -172,8 +172,27 @@ static void AppendHashtableContents(IndentedTextWriter writer, IEnumerableEmits the code for the RunnerFactory. This is the actual logic for the regular expression. - private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, RegexMethod rm, Dictionary requiredHelpers) + private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, RegexMethod rm, Dictionary requiredHelpers, bool checkOverflow) { + void EnterCheckOverflow() + { + if (checkOverflow) + { + writer.WriteLine($"unchecked"); + writer.WriteLine($"{{"); + writer.Indent++; + } + } + + void ExitCheckOverflow() + { + if (checkOverflow) + { + writer.Indent--; + writer.WriteLine($"}}"); + } + } + writer.WriteLine($"/// Provides a factory for creating instances to be used by methods on ."); writer.WriteLine($"private sealed class RunnerFactory : RegexRunnerFactory"); writer.WriteLine($"{{"); @@ -208,7 +227,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, writer.WriteLine($" protected override void Scan(ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 3; + EnterCheckOverflow(); (bool needsTryFind, bool needsTryMatch) = EmitScan(writer, rm); + ExitCheckOverflow(); writer.Indent -= 3; writer.WriteLine($" }}"); if (needsTryFind) @@ -220,7 +241,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, writer.WriteLine($" private bool TryFindNextPossibleStartingPosition(ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 3; + EnterCheckOverflow(); EmitTryFindNextPossibleStartingPosition(writer, rm, requiredHelpers); + ExitCheckOverflow(); writer.Indent -= 3; writer.WriteLine($" }}"); } @@ -233,7 +256,9 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer, writer.WriteLine($" private bool TryMatchAtCurrentPosition(ReadOnlySpan inputSpan)"); writer.WriteLine($" {{"); writer.Indent += 3; - EmitTryMatchAtCurrentPosition(writer, rm, requiredHelpers); + EnterCheckOverflow(); + EmitTryMatchAtCurrentPosition(writer, rm, requiredHelpers, checkOverflow); + ExitCheckOverflow(); writer.Indent -= 3; writer.WriteLine($" }}"); } @@ -288,23 +313,24 @@ private static void AddIsWordCharHelper(Dictionary requiredHel } /// Adds the IsBoundary helper to the required helpers collection. - private static void AddIsBoundaryHelper(Dictionary requiredHelpers) + private static void AddIsBoundaryHelper(Dictionary requiredHelpers, bool checkOverflow) { const string IsBoundary = nameof(IsBoundary); if (!requiredHelpers.ContainsKey(IsBoundary)) { + string uncheckedKeyword = checkOverflow ? "unchecked" : ""; requiredHelpers.Add(IsBoundary, new string[] { - "/// Determines whether the specified index is a boundary.", - "[MethodImpl(MethodImplOptions.AggressiveInlining)]", - "internal static bool IsBoundary(ReadOnlySpan inputSpan, int index)", - "{", - " int indexMinus1 = index - 1;", - " return ((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=", - " ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));", - "", - " static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');", - "}", + $"/// Determines whether the specified index is a boundary.", + $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", + $"internal static bool IsBoundary(ReadOnlySpan inputSpan, int index)", + $"{{", + $" int indexMinus1 = index - 1;", + $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=", + $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));", + $"", + $" static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');", + $"}}", }); AddIsWordCharHelper(requiredHelpers); @@ -312,27 +338,27 @@ private static void AddIsBoundaryHelper(Dictionary requiredHel } /// Adds the IsECMABoundary helper to the required helpers collection. - private static void AddIsECMABoundaryHelper(Dictionary requiredHelpers) + private static void AddIsECMABoundaryHelper(Dictionary requiredHelpers, bool checkOverflow) { const string IsECMABoundary = nameof(IsECMABoundary); if (!requiredHelpers.ContainsKey(IsECMABoundary)) { + string uncheckedKeyword = checkOverflow ? "unchecked" : ""; requiredHelpers.Add(IsECMABoundary, new string[] { - "/// Determines whether the specified index is a boundary (ECMAScript).", - "[MethodImpl(MethodImplOptions.AggressiveInlining)]", - "internal static bool IsECMABoundary(ReadOnlySpan inputSpan, int index)", - "{", - " int indexMinus1 = index - 1;", - " return ((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=", - " ((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));", - "", - " static bool IsECMAWordChar(char ch) =>", - " ((((uint)ch - 'A') & ~0x20) < 26) || // ASCII letter", - " (((uint)ch - '0') < 10) || // digit", - " ch == '_' || // underscore", - " ch == '\\u0130'; // latin capital letter I with dot above", - "}", + $"/// Determines whether the specified index is a boundary (ECMAScript).", + $"[MethodImpl(MethodImplOptions.AggressiveInlining)]", + $"internal static bool IsECMABoundary(ReadOnlySpan inputSpan, int index)", + $"{{", + $" int indexMinus1 = index - 1;", + $" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=", + $" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));", + $"", + $" static bool IsECMAWordChar(char ch) =>", + $" char.IsAsciiLetterOrDigit(ch) ||", + $" ch == '_' ||", + $" ch == '\\u0130'; // latin capital letter I with dot above", + $"}}", }); } } @@ -426,7 +452,7 @@ FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start or /// Emits the body of the TryFindNextPossibleStartingPosition. private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary requiredHelpers) { - RegexOptions options = (RegexOptions)rm.Options; + RegexOptions options = rm.Options; RegexTree regexTree = rm.Tree; bool rtl = (options & RegexOptions.RightToLeft) != 0; @@ -1034,7 +1060,7 @@ void EmitLiteralAfterAtomicLoop() } /// Emits the body of the TryMatchAtCurrentPosition. - private static void EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary requiredHelpers) + private static void EmitTryMatchAtCurrentPosition(IndentedTextWriter writer, RegexMethod rm, Dictionary requiredHelpers, bool checkOverflow) { // In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled // version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via @@ -2677,14 +2703,14 @@ void EmitBoundary(RegexNode node) call = node.Kind is RegexNodeKind.Boundary ? $"!{HelpersTypeName}.IsBoundary" : $"{HelpersTypeName}.IsBoundary"; - AddIsBoundaryHelper(requiredHelpers); + AddIsBoundaryHelper(requiredHelpers, checkOverflow); } else { call = node.Kind is RegexNodeKind.ECMABoundary ? $"!{HelpersTypeName}.IsECMABoundary" : $"{HelpersTypeName}.IsECMABoundary"; - AddIsECMABoundaryHelper(requiredHelpers); + AddIsECMABoundaryHelper(requiredHelpers, checkOverflow); } using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))")) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs index 15645d72fd7a3..5ea1e95748c7f 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs @@ -36,13 +36,24 @@ public partial class RegexGenerator : IIncrementalGenerator "#pragma warning disable CS0219 // Variable assigned but never used", }; + private sealed record CompilationData(bool AllowUnsafe = false, bool CheckOverflow = false); + public void Initialize(IncrementalGeneratorInitializationContext context) { + // To avoid invalidating every regex's output when anything from the compilation changes, + // we extract from it the only things we care about. + IncrementalValueProvider compilationDataProvider = + context.CompilationProvider + .Select((x, _) => + x.Options is CSharpCompilationOptions options ? + new CompilationData(options.AllowUnsafe, options.CheckOverflow) : + new CompilationData()); + // Produces one entry per generated regex. This may be: // - Diagnostic in the case of a failure that should end the compilation // - (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary requiredHelpers) in the case of valid regex // - (RegexMethod regexMethod, string reason, Diagnostic diagnostic) in the case of a limited-support regex - IncrementalValueProvider> codeOrDiagnostics = + IncrementalValueProvider> results = context.SyntaxProvider // Find all MethodDeclarationSyntax nodes attributed with GeneratedRegex and gather the required information. @@ -52,9 +63,13 @@ public void Initialize(IncrementalGeneratorInitializationContext context) GetSemanticTargetForGeneration) .Where(static m => m is not null) + // Incorporate the compilation data, as it impacts code generation. + .Combine(compilationDataProvider) + // Generate the RunnerFactory for each regex, if possible. This is where the bulk of the implementation occurs. - .Select((state, _) => + .Select((methodOrDiagnosticAndCompilationData, _) => { + object? state = methodOrDiagnosticAndCompilationData.Left; if (state is not RegexMethod regexMethod) { Debug.Assert(state is Diagnostic); @@ -74,26 +89,16 @@ public void Initialize(IncrementalGeneratorInitializationContext context) var writer = new IndentedTextWriter(sw); writer.Indent += 2; writer.WriteLine(); - EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers); + EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers, methodOrDiagnosticAndCompilationData.Right.CheckOverflow); writer.Indent -= 2; - return (regexMethod, sw.ToString(), requiredHelpers); + return (regexMethod, sw.ToString(), requiredHelpers, methodOrDiagnosticAndCompilationData.Right); }) .Collect(); - // To avoid invalidating every regex's output when anything from the compilation changes, - // we extract from it the only things we care about: whether unsafe code is allowed, - // and a name based on the assembly's name, and only that information is then fed into - // RegisterSourceOutput along with all of the cached generated data from each regex. - IncrementalValueProvider<(bool AllowUnsafe, string? AssemblyName)> compilationDataProvider = - context.CompilationProvider - .Select((x, _) => (x.Options is CSharpCompilationOptions { AllowUnsafe: true }, x.AssemblyName)); - // When there something to output, take all the generated strings and concatenate them to output, // and raise all of the created diagnostics. - context.RegisterSourceOutput(codeOrDiagnostics.Combine(compilationDataProvider), static (context, compilationDataAndResults) => + context.RegisterSourceOutput(results, static (context, results) => { - ImmutableArray results = compilationDataAndResults.Left; - // Report any top-level diagnostics. bool allFailures = true; foreach (object result in results) @@ -150,7 +155,7 @@ public void Initialize(IncrementalGeneratorInitializationContext context) context.ReportDiagnostic(limitedSupportResult.Item3); regexMethod = limitedSupportResult.Item1; } - else if (result is ValueTuple> regexImpl) + else if (result is ValueTuple, CompilationData> regexImpl) { foreach (KeyValuePair helper in regexImpl.Item3) { @@ -214,11 +219,11 @@ public void Initialize(IncrementalGeneratorInitializationContext context) writer.WriteLine(); } } - else if (result is ValueTuple> regexImpl) + else if (result is ValueTuple, CompilationData> regexImpl) { if (!regexImpl.Item1.IsDuplicate) { - EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, compilationDataAndResults.Right.AllowUnsafe); + EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, regexImpl.Item4.AllowUnsafe); writer.WriteLine(); } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorHelper.netcoreapp.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorHelper.netcoreapp.cs index 04e9683bdd2cc..340ed7fbea5df 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorHelper.netcoreapp.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorHelper.netcoreapp.cs @@ -68,13 +68,13 @@ internal static byte[] CreateAssemblyImage(string source, string assemblyName) } private static async Task<(Compilation, GeneratorDriverRunResult)> RunGeneratorCore( - string code, LanguageVersion langVersion = LanguageVersion.Preview, MetadataReference[]? additionalRefs = null, bool allowUnsafe = false, CancellationToken cancellationToken = default) + string code, LanguageVersion langVersion = LanguageVersion.Preview, MetadataReference[]? additionalRefs = null, bool allowUnsafe = false, bool checkOverflow = true, CancellationToken cancellationToken = default) { var proj = new AdhocWorkspace() .AddSolution(SolutionInfo.Create(SolutionId.CreateNewId(), VersionStamp.Create())) .AddProject("RegexGeneratorTest", "RegexGeneratorTest.dll", "C#") .WithMetadataReferences(additionalRefs is not null ? References.Concat(additionalRefs) : References) - .WithCompilationOptions(new CSharpCompilationOptions(OutputKind.DynamicallyLinkedLibrary, allowUnsafe: allowUnsafe) + .WithCompilationOptions(new CSharpCompilationOptions(OutputKind.DynamicallyLinkedLibrary, allowUnsafe: allowUnsafe, checkOverflow: checkOverflow) .WithNullableContextOptions(NullableContextOptions.Enable)) .WithParseOptions(new CSharpParseOptions(langVersion)) .AddDocument("RegexGenerator.g.cs", SourceText.From(code, Encoding.UTF8)).Project; @@ -91,9 +91,9 @@ internal static byte[] CreateAssemblyImage(string source, string assemblyName) } internal static async Task> RunGenerator( - string code, bool compile = false, LanguageVersion langVersion = LanguageVersion.Preview, MetadataReference[]? additionalRefs = null, bool allowUnsafe = false, CancellationToken cancellationToken = default) + string code, bool compile = false, LanguageVersion langVersion = LanguageVersion.Preview, MetadataReference[]? additionalRefs = null, bool allowUnsafe = false, bool checkOverflow = true, CancellationToken cancellationToken = default) { - (Compilation comp, GeneratorDriverRunResult generatorResults) = await RunGeneratorCore(code, langVersion, additionalRefs, allowUnsafe, cancellationToken); + (Compilation comp, GeneratorDriverRunResult generatorResults) = await RunGeneratorCore(code, langVersion, additionalRefs, allowUnsafe, checkOverflow, cancellationToken); if (!compile) { return generatorResults.Diagnostics; @@ -114,9 +114,9 @@ internal static async Task> RunGenerator( } internal static async Task GenerateSourceText( - string code, LanguageVersion langVersion = LanguageVersion.Preview, MetadataReference[]? additionalRefs = null, bool allowUnsafe = false, CancellationToken cancellationToken = default) + string code, LanguageVersion langVersion = LanguageVersion.Preview, MetadataReference[]? additionalRefs = null, bool allowUnsafe = false, bool checkOverflow = true, CancellationToken cancellationToken = default) { - (Compilation comp, GeneratorDriverRunResult generatorResults) = await RunGeneratorCore(code, langVersion, additionalRefs, allowUnsafe, cancellationToken); + (Compilation comp, GeneratorDriverRunResult generatorResults) = await RunGeneratorCore(code, langVersion, additionalRefs, allowUnsafe, checkOverflow, cancellationToken); string generatedSource = string.Concat(generatorResults.GeneratedTrees.Select(t => t.ToString())); if (generatorResults.Diagnostics.Length != 0) @@ -197,6 +197,7 @@ internal static async Task SourceGenRegexAsync( .WithCompilationOptions( new CSharpCompilationOptions( OutputKind.DynamicallyLinkedLibrary, + checkOverflow: true, warningLevel: 9999, // docs recommend using "9999" to catch all warnings now and in the future specificDiagnosticOptions: ImmutableDictionary.Empty.Add("SYSLIB1044", ReportDiagnostic.Hidden)) // regex with limited support .WithNullableContextOptions(NullableContextOptions.Enable)) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorParserTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorParserTests.cs index 642bba87ad050..fa86ca59e6099 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorParserTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorParserTests.cs @@ -6,6 +6,7 @@ using Microsoft.DotNet.RemoteExecutor; using System.Collections.Generic; using System.Diagnostics; +using System.Linq; using System.Globalization; using System.Threading.Tasks; using Xunit; @@ -438,22 +439,28 @@ partial class C ", compile: true)); } + public static IEnumerable Valid_ClassWithNamespace_ConfigurationOptions_MemberData() => + from pattern in new[] { "", "ab", "ab*c|de*?f|(ghi){1,3}", "\\\\w\\\\W\\\\b\\\\B\\\\d\\\\D\\\\s\\\\S" } + from options in new[] { RegexOptions.None, RegexOptions.IgnoreCase, RegexOptions.ECMAScript, RegexOptions.RightToLeft } + from allowUnsafe in new[] { false, true } + from checkOverflow in new[] { false, true } + select new object[] { pattern, options, allowUnsafe, checkOverflow }; + [Theory] - [InlineData(false)] - [InlineData(true)] - public async Task Valid_ClassWithNamespace(bool allowUnsafe) + [MemberData(nameof(Valid_ClassWithNamespace_ConfigurationOptions_MemberData))] + public async Task Valid_ClassWithNamespace_ConfigurationOptions(string pattern, RegexOptions options, bool allowUnsafe, bool checkOverflow) { - Assert.Empty(await RegexGeneratorHelper.RunGenerator(@" + Assert.Empty(await RegexGeneratorHelper.RunGenerator($@" using System.Text.RegularExpressions; namespace A - { + {{ partial class C - { - [GeneratedRegex(""ab"")] + {{ + [GeneratedRegex(""{pattern}"", RegexOptions.{options})] private static partial Regex Valid(); - } - } - ", compile: true, allowUnsafe: allowUnsafe)); + }} + }} + ", compile: true, allowUnsafe: allowUnsafe, checkOverflow: checkOverflow)); } [Fact] From 04a2add2a8e47b90c00945943ac06d7f4282894f Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 11 Nov 2022 14:46:11 -0500 Subject: [PATCH 2/2] Address PR feedback --- .../System.Text.RegularExpressions/gen/RegexGenerator.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs index 5ea1e95748c7f..ed506320a1a8f 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs @@ -36,7 +36,7 @@ public partial class RegexGenerator : IIncrementalGenerator "#pragma warning disable CS0219 // Variable assigned but never used", }; - private sealed record CompilationData(bool AllowUnsafe = false, bool CheckOverflow = false); + private record struct CompilationData(bool AllowUnsafe, bool CheckOverflow); public void Initialize(IncrementalGeneratorInitializationContext context) { @@ -47,7 +47,7 @@ public void Initialize(IncrementalGeneratorInitializationContext context) .Select((x, _) => x.Options is CSharpCompilationOptions options ? new CompilationData(options.AllowUnsafe, options.CheckOverflow) : - new CompilationData()); + default); // Produces one entry per generated regex. This may be: // - Diagnostic in the case of a failure that should end the compilation