diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 4606e127d4456..4fc98880dc259 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -63,6 +63,7 @@ + @@ -82,10 +83,6 @@ - - - - diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs index ee124fdf205c1..572a47b4d4ed1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs @@ -26,23 +26,21 @@ internal static bool EnableDebugTracing } /// Unwind the regex and save the resulting state graph in DGML - /// roughly the maximum number of states, 0 means no bound - /// if true then hide state info - /// if true then pretend that there is a .* at the beginning - /// if true then unwind the regex backwards (addDotStar is then ignored) - /// if true then compute and save only general DFA info - /// dgml output is written here + /// Writer to which the DGML is written. + /// True to create an NFA instead of a DFA. + /// True to prepend .*? onto the pattern (outside of the implicit root capture). + /// If true, then unwind the regex backwards (and addDotStar is ignored). + /// The approximate maximum number of states to include; less than or equal to 0 for no maximum. /// maximum length of labels in nodes anything over that length is indicated with .. 
- /// if true creates NFA instead of DFA [ExcludeFromCodeCoverage(Justification = "Debug only")] - internal void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA) + internal void SaveDGML(TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength) { if (factory is not SymbolicRegexRunnerFactory srmFactory) { throw new NotSupportedException(); } - srmFactory._matcher.SaveDGML(writer, bound, hideStateInfo, addDotStar, inReverse, onlyDFAinfo, maxLabelLength, asNFA); + srmFactory._matcher.SaveDGML(writer, nfa, addDotStar, reverse, maxStates, maxLabelLength); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/DgmlWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/DgmlWriter.cs deleted file mode 100644 index cbd134be45363..0000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/DgmlWriter.cs +++ /dev/null @@ -1,241 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#if DEBUG -using System.Collections.Generic; -using System.Diagnostics; -using System.IO; - -namespace System.Text.RegularExpressions.Symbolic.DGML -{ - internal sealed class DgmlWriter - { - private readonly int _maxDgmlTransitionLabelLength; - private readonly TextWriter _tw; - private readonly bool _hideStateInfo; - private readonly bool _onlyDFAinfo; - - internal DgmlWriter(TextWriter tw, bool hideStateInfo, int maxDgmlTransitionLabelLength = -1, bool onlyDFAinfo = false) - { - _maxDgmlTransitionLabelLength = maxDgmlTransitionLabelLength; - _tw = tw; - _hideStateInfo = hideStateInfo; - _onlyDFAinfo = onlyDFAinfo; - } - - /// - /// Write the automaton in dgml format into the textwriter. 
- /// - public void Write(IAutomaton fa) - { - var nonEpsilonMoves = new Dictionary<(int, int), List>(); - var epsilonmoves = new List>(); - - var nonEpsilonStates = new HashSet(); - - foreach (Move move in fa.GetMoves()) - { - if (move.IsEpsilon) - { - epsilonmoves.Add(move); - } - else - { - nonEpsilonStates.Add(move.SourceState); - var p = (move.SourceState, move.TargetState); - if (!nonEpsilonMoves.TryGetValue(p, out List? rules)) - { - rules = new List(); - nonEpsilonMoves[p] = rules; - } - - Debug.Assert(move.Label is not null); - rules.Add(move.Label); - } - } - - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine("", GetDFAInfo(fa)); - _tw.WriteLine("", GetDFAInfo(fa)); - if (_onlyDFAinfo) - { - _tw.WriteLine(""); - } - else - { - foreach (int state in fa.GetStates()) - { - _tw.WriteLine("", state, _hideStateInfo ? "Collapsed" : "Expanded", GetStateInfo(fa, state)); - if (state == fa.InitialState) - { - _tw.WriteLine(""); - } - if (fa.IsFinalState(state)) - { - _tw.WriteLine(""); - } - _tw.WriteLine(""); - _tw.WriteLine("", state, GetStateInfo(fa, state)); - } - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine("", fa.InitialState, fa.DescribeStartLabel()); - _tw.WriteLine(""); - - foreach (Move move in epsilonmoves) - { - _tw.WriteLine("", move.SourceState, move.TargetState); - } - - foreach (KeyValuePair<(int, int), List> move in nonEpsilonMoves) - { - _tw.WriteLine(GetNonFinalRuleInfo(fa, move.Key.Item1, move.Key.Item2, move.Value)); - } - - foreach (int state in fa.GetStates()) - { - _tw.WriteLine("", state); - } - - _tw.WriteLine(""); - WriteCategoriesAndStyles(); - } - _tw.WriteLine(""); - } - - private string GetDFAInfo(IAutomaton fa) - { - StringBuilder sb = new(); - sb.Append("|Q|="); - sb.Append(fa.StateCount); - sb.Append(" "); - sb.Append('|'); - sb.Append(DeltaCapital); - sb.Append("|="); - sb.Append(fa.TransitionCount); - sb.Append(" "); - sb.Append('|'); - sb.Append(SigmalCapital); - sb.Append("|="); - 
sb.Append(fa.Alphabet.Length); - sb.Append(" "); - sb.Append(SigmalCapital); - sb.Append('='); - for (int i = 0; i < fa.Alphabet.Length; i++) - { - if (i > 0) - sb.Append(','); - sb.Append(fa.DescribeLabel(fa.Alphabet[i])); - } - return sb.ToString(); - } - - private const string DeltaCapital = "Δ"; - private const string SigmalCapital = "Σ"; - - private static string GetStateInfo(IAutomaton fa, int state) - { - StringBuilder sb = new(); - sb.Append(fa.DescribeState(state)); - return sb.ToString(); - } - - private string GetNonFinalRuleInfo(IAutomaton aut, int source, int target, List rules) - { - string lab = ""; - string info = ""; - for (int i = 0; i < rules.Count; i++) - { - lab += (lab == "" ? "" : ",\n ") + aut.DescribeLabel(rules[i]); - } - - int lab_length = lab.Length; - if (_maxDgmlTransitionLabelLength >= 0 && lab_length > _maxDgmlTransitionLabelLength) - { - info += $" FullLabel = \"{lab}\""; - lab = string.Concat(lab.AsSpan(0, _maxDgmlTransitionLabelLength), ".."); - } - - return $""; - } - - private void WriteCategoriesAndStyles() - { - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - //_tw.WriteLine(""); - //_tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - _tw.WriteLine(""); - } - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/IAutomaton.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/IAutomaton.cs deleted file mode 100644 index f6237967905f4..0000000000000 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/IAutomaton.cs +++ /dev/null @@ -1,66 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#if DEBUG -using System.Collections.Generic; - -namespace System.Text.RegularExpressions.Symbolic.DGML -{ - /// - /// For accessing the key components of an automaton. - /// - /// type of labels in moves - internal interface IAutomaton - { - /// - /// Enumerates all moves of the automaton. - /// - IEnumerable> GetMoves(); - - /// - /// Enumerates all states of the automaton. - /// - IEnumerable GetStates(); - - /// - /// Returns the minterm partition of the alphabet. - /// - TLabel[] Alphabet { get; } - - /// - /// Provides a description of the state for visualization purposes. - /// - string DescribeState(int state); - - /// - /// Provides a description of the label for visualization purposes. - /// - string DescribeLabel(TLabel lab); - - /// - /// Provides a description of the start label for visualization purposes. - /// - string DescribeStartLabel(); - - /// - /// The initial state of the automaton. - /// - int InitialState { get; } - - /// - /// The number of states of the automaton. - /// - int StateCount { get; } - - /// - /// The number of transitions of the automaton. - /// - int TransitionCount { get; } - - /// - /// Returns true iff the state is a final state. 
- /// - bool IsFinalState(int state); - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/Move.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/Move.cs deleted file mode 100644 index e25e3b720a901..0000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/Move.cs +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#if DEBUG -using System.Diagnostics.CodeAnalysis; - -namespace System.Text.RegularExpressions.Symbolic.DGML -{ - /// - /// Represents a move of a symbolic finite automaton. - /// The value default(L) is reserved to represent the label of an epsilon move. - /// Thus if S is a reference type the label of an epsilon move is null. - /// - /// the type of the labels on moves - internal sealed class Move - { - /// - /// Source state of the move - /// - public readonly int SourceState; - /// - /// Target state of the move - /// - public readonly int TargetState; - /// - /// Label of the move - /// - public readonly TLabel? Label; - - /// - /// Transition of an automaton. - /// - /// source state of the transition - /// target state of the transition - /// label of the transition - public Move(int sourceState, int targetState, TLabel? lab) - { - SourceState = sourceState; - TargetState = targetState; - Label = lab; - } - - /// - /// Creates a move. Creates an epsilon move if label is default(L). - /// - public static Move Create(int sourceState, int targetState, TLabel condition) => new Move(sourceState, targetState, condition); - - /// - /// Creates an epsilon move. Same as Create(sourceState, targetState, default(L)). 
- /// - public static Move Epsilon(int sourceState, int targetState) => new Move(sourceState, targetState, default); - - /// - /// Returns true if label equals default(S). - /// - public bool IsEpsilon => Equals(Label, default(TLabel)); - - /// - /// Returns true if the source state and the target state are identical - /// - public bool IsSelfLoop => SourceState == TargetState; - - /// - /// Returns true if obj is a move with the same source state, target state, and label. - /// - public override bool Equals([NotNullWhen(false)] object? obj) => - obj is Move t && - t.SourceState == SourceState && - t.TargetState == TargetState && - (t.Label is null ? Label is null : t.Label.Equals(Label)); - - public override int GetHashCode() => (SourceState, Label, TargetState).GetHashCode(); - - public override string ToString() => $"({SourceState},{(Equals(Label, default(TLabel)) ? "" : Label + ",")}{TargetState})"; - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/RegexAutomaton.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/RegexAutomaton.cs deleted file mode 100644 index ddcd9f8bf5bcb..0000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Dgml/RegexAutomaton.cs +++ /dev/null @@ -1,140 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -#if DEBUG -using System.Collections.Generic; -using System.Diagnostics; - -namespace System.Text.RegularExpressions.Symbolic.DGML -{ - /// - /// Used by DgmlWriter to unwind a regex into a DFA up to a bound that limits the number of states - /// - internal sealed class RegexAutomaton : IAutomaton<(SymbolicRegexNode?, T)> where T : notnull - { - private readonly DfaMatchingState _q0; - private readonly List _states = new(); - private readonly HashSet _stateSet = new(); - private readonly List?, T)>> _moves = new(); - private readonly SymbolicRegexBuilder _builder; - private readonly SymbolicNFA? _nfa; - - internal RegexAutomaton(SymbolicRegexMatcher srm, int bound, bool addDotStar, bool inReverse, bool asNFA) - { - _builder = srm._builder; - uint startId = inReverse ? - (srm._reversePattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0) : - (srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0); - - //inReverse only matters if Ar contains some line anchor - _q0 = _builder.CreateState(inReverse ? srm._reversePattern : (addDotStar ? srm._dotStarredPattern : srm._pattern), startId); - - if (asNFA) - { - _nfa = _q0.Node.Explore(bound); - for (int q = 0; q < _nfa.StateCount; q++) - { - _states.Add(q); - foreach ((T, SymbolicRegexNode?, int) branch in _nfa.EnumeratePaths(q)) - _moves.Add(Move<(SymbolicRegexNode?, T)>.Create(q, branch.Item3, (branch.Item2, branch.Item1))); - } - } - else - { - Dictionary<(int, int), T> normalizedmoves = new(); - Stack> stack = new(); - stack.Push(_q0); - _states.Add(_q0.Id); - _stateSet.Add(_q0.Id); - - T[]? 
partition = _builder._solver.GetMinterms(); - Debug.Assert(partition is not null); - //unwind until the stack is empty or the bound has been reached - while (stack.Count > 0 && (bound <= 0 || _states.Count < bound)) - { - DfaMatchingState q = stack.Pop(); - foreach (T c in partition) - { - DfaMatchingState p = q.Next(c); - - // check that p is not a dead-end - if (!p.IsNothing) - { - if (_stateSet.Add(p.Id)) - { - stack.Push(p); - _states.Add(p.Id); - } - - (int, int) qp = (q.Id, p.Id); - normalizedmoves[qp] = normalizedmoves.ContainsKey(qp) ? - _builder._solver.Or(normalizedmoves[qp], c) : - c; - } - } - } - - foreach (KeyValuePair<(int, int), T> entry in normalizedmoves) - _moves.Add(Move<(SymbolicRegexNode?, T)>.Create(entry.Key.Item1, entry.Key.Item2, (null, entry.Value))); - } - } - - public (SymbolicRegexNode?, T)[] Alphabet - { - get - { - T[]? alphabet = _builder._solver.GetMinterms(); - Debug.Assert(alphabet is not null); - var results = new (SymbolicRegexNode?, T)[alphabet.Length]; - for (int i = 0; i < alphabet.Length; i++) - { - results[i] = (null, alphabet[i]); - } - return results; - } - } - - public int InitialState => _nfa is not null ? 0 : _q0.Id; - - public int StateCount => _states.Count; - - public int TransitionCount => _moves.Count; - - public string DescribeLabel((SymbolicRegexNode?, T) lab) => - lab.Item1 is null ? Net.WebUtility.HtmlEncode(_builder._solver.PrettyPrint(lab.Item2)) : - // Conditional nullability based on anchors - Net.WebUtility.HtmlEncode($"{lab.Item1}/{_builder._solver.PrettyPrint(lab.Item2)}"); - - public string DescribeStartLabel() => ""; - - public string DescribeState(int state) - { - if (_nfa is not null) - { - Debug.Assert(state < _nfa.StateCount); - string? str = Net.WebUtility.HtmlEncode(_nfa.GetNode(state).ToString()); - return _nfa.IsUnexplored(state) ? 
$"Unexplored:{str}" : str; - } - - Debug.Assert(_builder._stateArray is not null); - return _builder._stateArray[state].DgmlView; - } - - public IEnumerable GetStates() => _states; - - public bool IsFinalState(int state) - { - if (_nfa is not null) - { - Debug.Assert(state < _nfa.StateCount); - return _nfa.CanBeNullable(state); - } - - Debug.Assert(_builder._stateArray is not null && state < _builder._stateArray.Length); - return _builder._stateArray[state].Node.CanBeNullable; - } - - public IEnumerable?, T)>> GetMoves() => _moves; - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs new file mode 100644 index 0000000000000..540a834b60a1f --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs @@ -0,0 +1,321 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if DEBUG +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Net; + +namespace System.Text.RegularExpressions.Symbolic +{ + [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] + internal static class DgmlWriter where T : notnull + { + /// Write the DFA or NFA in DGML format into the TextWriter. + /// The SymbolicRegexMatcher for the regular expression. + /// Writer to which the DGML is written. + /// True to create an NFA instead of a DFA. + /// True to prepend .*? onto the pattern (outside of the implicit root capture). + /// If true, then unwind the regex backwards (and addDotStar is ignored). + /// The approximate maximum number of states to include; less than or equal to 0 for no maximum. 
+ /// maximum length of labels in nodes anything over that length is indicated with .. 
+ public static void Write( + TextWriter writer, SymbolicRegexMatcher matcher, + bool nfa = false, bool addDotStar = true, bool reverse = false, int maxStates = -1, int maxLabelLength = -1) + { + var explorer = new DfaExplorer(matcher, nfa, addDotStar, reverse, maxStates); + var nonEpsilonTransitions = new Dictionary<(int SourceState, int TargetState), List<(SymbolicRegexNode?, T)>>(); + var epsilonTransitions = new List(); + + foreach (Transition transition in explorer.GetTransitions()) + { + if (transition.IsEpsilon) + { + epsilonTransitions.Add(transition); + } + else + { + (int SourceState, int TargetState) p = (transition.SourceState, transition.TargetState); + if (!nonEpsilonTransitions.TryGetValue(p, out List<(SymbolicRegexNode?, T)>? rules)) + { + nonEpsilonTransitions[p] = rules = new List<(SymbolicRegexNode?, T)>(); + } + + rules.Add(transition.Label); + } + } + + writer.WriteLine(""); + writer.WriteLine(""); + writer.WriteLine(" "); + writer.WriteLine(" ", GetDFAInfo(explorer)); + writer.WriteLine(" ", GetDFAInfo(explorer)); + foreach (int state in explorer.GetStates()) + { + writer.WriteLine(" ", state, explorer.DescribeState(state)); + if (state == explorer.InitialState) + { + writer.WriteLine(" "); + } + if (explorer.IsFinalState(state)) + { + writer.WriteLine(" "); + } + writer.WriteLine(" "); + writer.WriteLine(" ", state, explorer.DescribeState(state)); + } + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" ", explorer.InitialState); + writer.WriteLine(" "); + + foreach (Transition transition in epsilonTransitions) + { + writer.WriteLine(" ", transition.SourceState, transition.TargetState); + } + + foreach (KeyValuePair<(int, int), List<(SymbolicRegexNode?, T)>> transition in nonEpsilonTransitions) + { + string label = string.Join($",{Environment.NewLine} ", DescribeLabels(explorer, transition.Value)); + string info = ""; + if (label.Length > (uint)maxLabelLength) + { + info = $"FullLabel = \"{label}\" "; + label = 
string.Concat(label.AsSpan(0, maxLabelLength), ".."); + } + + writer.WriteLine($" "); + } + + foreach (int state in explorer.GetStates()) + { + writer.WriteLine(" ", state); + } + + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(""); + } + + private static string GetDFAInfo(DfaExplorer explorer) + { + StringBuilder sb = new(); + sb.Append($"States = {explorer.StateCount} "); + sb.Append($"Transitions = {explorer.TransitionCount} "); + sb.Append($"Min Terms ({explorer._builder._solver.GetMinterms()!.Length}) = ").AppendJoin(',', DescribeLabels(explorer, explorer.Alphabet)); + return sb.ToString(); + } + + private static IEnumerable DescribeLabels(DfaExplorer explorer, IList<(SymbolicRegexNode?, T)> items) + { + for (int i = 0; i < items.Count; i++) + { + yield return explorer.DescribeLabel(items[i]); + } + } + + /// Used to unwind a regex into a DFA up to a bound that limits the number of states + private sealed class DfaExplorer + { + private readonly DfaMatchingState _initialState; + private readonly List _states = new(); + private readonly List _transitions = new(); + private readonly SymbolicNFA? _nfa; + internal readonly SymbolicRegexBuilder _builder; + + internal DfaExplorer(SymbolicRegexMatcher srm, bool nfa, bool addDotStar, bool reverse, int maxStates) + { + _builder = srm._builder; + uint startId = reverse ? + (srm._reversePattern._info.StartsWithLineAnchor ? 
CharKind.BeginningEnd : 0) : + (srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0); + + // Create the initial state + _initialState = _builder.CreateState( + reverse ? srm._reversePattern : + addDotStar ? srm._dotStarredPattern : + srm._pattern, startId); + + if (nfa) + { + _nfa = _initialState.Node.Explore(maxStates); + for (int q = 0; q < _nfa.StateCount; q++) + { + _states.Add(q); + foreach ((T, SymbolicRegexNode?, int) branch in _nfa.EnumeratePaths(q)) + { + _transitions.Add(new Transition(q, branch.Item3, (branch.Item2, branch.Item1))); + } + } + } + else + { + Dictionary<(int, int), T> normalizedMoves = new(); + Stack> stack = new(); + stack.Push(_initialState); + _states.Add(_initialState.Id); + + HashSet stateSet = new(); + stateSet.Add(_initialState.Id); + + T[]? minterms = _builder._solver.GetMinterms(); + Debug.Assert(minterms is not null); + + // Unwind until the stack is empty or the bound has been reached + while (stack.Count > 0 && (maxStates <= 0 || _states.Count < maxStates)) + { + DfaMatchingState q = stack.Pop(); + foreach (T c in minterms) + { + DfaMatchingState p = q.Next(c); + + // check that p is not a dead-end + if (!p.IsNothing) + { + if (stateSet.Add(p.Id)) + { + stack.Push(p); + _states.Add(p.Id); + } + + (int, int) qp = (q.Id, p.Id); + normalizedMoves[qp] = normalizedMoves.ContainsKey(qp) ? + _builder._solver.Or(normalizedMoves[qp], c) : + c; + } + } + } + + foreach (KeyValuePair<(int, int), T> entry in normalizedMoves) + { + _transitions.Add(new Transition(entry.Key.Item1, entry.Key.Item2, (null, entry.Value))); + } + } + } + + public (SymbolicRegexNode?, T)[] Alphabet + { + get + { + T[]? alphabet = _builder._solver.GetMinterms(); + Debug.Assert(alphabet is not null); + var results = new (SymbolicRegexNode?, T)[alphabet.Length]; + for (int i = 0; i < alphabet.Length; i++) + { + results[i] = (null, alphabet[i]); + } + return results; + } + } + + public int InitialState => _nfa is not null ? 
0 : _initialState.Id; + + public int StateCount => _states.Count; + + public int TransitionCount => _transitions.Count; + + public string DescribeLabel((SymbolicRegexNode?, T) lab) => + WebUtility.HtmlEncode(lab.Item1 is null ? // Conditional nullability based on anchors + _builder._solver.PrettyPrint(lab.Item2) : + $"{lab.Item1}/{_builder._solver.PrettyPrint(lab.Item2)}"); + + public string DescribeState(int state) + { + if (_nfa is not null) + { + Debug.Assert(state < _nfa.StateCount); + string? str = WebUtility.HtmlEncode(_nfa.GetNode(state).ToString()); + return _nfa.IsUnexplored(state) ? $"Unexplored:{str}" : str; + } + + Debug.Assert(_builder._stateArray is not null); + return _builder._stateArray[state].DgmlView; + } + + public IEnumerable GetStates() => _states; + + public bool IsFinalState(int state) + { + if (_nfa is not null) + { + Debug.Assert(state < _nfa.StateCount); + return _nfa.CanBeNullable(state); + } + + Debug.Assert(_builder._stateArray is not null && state < _builder._stateArray.Length); + return _builder._stateArray[state].Node.CanBeNullable; + } + + public List GetTransitions() => _transitions; + } + + private record Transition(int SourceState, int TargetState, (SymbolicRegexNode?, T) Label) + { + public bool IsEpsilon => Label.Equals(default); + } + } +} +#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index e96525cf07a51..37010427734ac 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -16,15 +16,13 @@ internal abstract class SymbolicRegexMatcher { #if DEBUG /// Unwind the regex of the matcher and save the resulting state graph in DGML - /// roughly 
the maximum number of states, 0 means no bound - /// if true then hide state info - /// if true then pretend that there is a .* at the beginning - /// if true then unwind the regex backwards (addDotStar is then ignored) - /// if true then compute and save only genral DFA info - /// dgml output is written here + /// Writer to which the DGML is written. + /// True to create an NFA instead of a DFA. + /// True to prepend .*? onto the pattern (outside of the implicit root capture). + /// If true, then unwind the regex backwards. + /// The approximate maximum number of states to include; less than or equal to 0 for no maximum. /// maximum length of labels in nodes anything over that length is indicated with .. - /// if true creates NFA instead of DFA - public abstract void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA); + public abstract void SaveDGML(TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength); /// /// Generates up to k random strings matched by the regex @@ -1276,12 +1274,8 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder< } #if DEBUG - public override void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA) - { - var graph = new DGML.RegexAutomaton(this, bound, addDotStar, inReverse, asNFA); - var dgml = new DGML.DgmlWriter(writer, hideStateInfo, maxLabelLength, onlyDFAinfo); - dgml.Write(graph); - } + public override void SaveDGML(TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength) => + DgmlWriter.Write(writer, this, nfa, addDotStar, reverse, maxStates, maxLabelLength); public override IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative) => new SymbolicRegexSampler(_pattern, randomseed, negative).GenerateRandomMembers(k); diff --git 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index bf4c9eb20671c..59522b71116bd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -1582,9 +1582,11 @@ internal void ToString(StringBuilder sb) case SymbolicRegexNodeKind.OrderedOr: Debug.Assert(_left is not null && _right is not null); + sb.Append('('); _left.ToString(sb); sb.Append('|'); _right.ToString(sb); + sb.Append(')'); return; case SymbolicRegexNodeKind.Concat: diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/CustomDerivedRegexScenarioTest.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/CustomDerivedRegexScenarioTest.cs index 5f40a3c2e56fc..cf691b3014b87 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/CustomDerivedRegexScenarioTest.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/CustomDerivedRegexScenarioTest.cs @@ -35,7 +35,7 @@ internal class CustomDerivedRegex : Regex public CustomDerivedRegex() { - pattern = /*lang=regex*/@"\G(\d{1,3})(?=(?:\d{3})+\b)"; + pattern = @"\G(\d{1,3})(?=(?:\d{3})+\b)"; roptions = RegexOptions.Compiled; internalMatchTimeout = Timeout.InfiniteTimeSpan; factory = new CustomRegexRunnerFactory(); diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs index e1792c623be21..e94e98140734a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs +++ 
b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Tests.Common.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; using System.Linq; using System.Threading.Tasks; using Xunit; @@ -48,7 +49,7 @@ public static bool IsDefaultStart(string input, RegexOptions options, int start) return start == 0; } - public static async Task GetRegexAsync(RegexEngine engine, string pattern, RegexOptions options, Globalization.CultureInfo culture) + public static async Task GetRegexAsync(RegexEngine engine, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern, RegexOptions options, Globalization.CultureInfo culture) { using (new System.Tests.ThreadCultureChange(culture)) { @@ -100,7 +101,7 @@ public static IEnumerable AvailableEngines public static bool IsNonBacktracking(RegexEngine engine) => engine is RegexEngine.NonBacktracking or RegexEngine.NonBacktrackingSourceGenerated; - public static async Task GetRegexAsync(RegexEngine engine, string pattern, RegexOptions? options = null, TimeSpan? matchTimeout = null) + public static async Task GetRegexAsync(RegexEngine engine, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern, RegexOptions? options = null, TimeSpan? 
matchTimeout = null) { if (options is null) { diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs index dd7b552e49a90..aa5f1932118b8 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs @@ -8,6 +8,7 @@ using Xunit; using Xunit.Abstractions; using System.Threading.Tasks; +using System.Diagnostics.CodeAnalysis; namespace System.Text.RegularExpressions.Tests { @@ -49,42 +50,6 @@ public void RegenerateUnicodeTables() } } - /// Save the regex as a DFA in DGML format in the textwriter. - private static bool TrySaveDGML(Regex regex, TextWriter writer, int bound = -1, bool hideStateInfo = false, bool addDotStar = false, bool inReverse = false, bool onlyDFAinfo = false, int maxLabelLength = -1, bool asNFA = false) - { - MethodInfo? saveDgml = regex.GetType().GetMethod("SaveDGML", BindingFlags.NonPublic | BindingFlags.Instance); - if (saveDgml is null) - { - return false; - } - else - { - saveDgml.Invoke(regex, new object[] { writer, bound, hideStateInfo, addDotStar, inReverse, onlyDFAinfo, maxLabelLength, asNFA }); - return true; - } - } - - /// View the regex as a DFA in DGML format in VS. 
- internal static void ViewDGML(Regex regex, int bound = -1, bool hideStateInfo = true, bool addDotStar = false, bool inReverse = false, bool onlyDFAinfo = false, string name = "DFA", int maxLabelLength = 20, bool asNFA = false) - { - if (!Directory.Exists(DgmlOutputDirectoryPath)) - { - Directory.CreateDirectory(DgmlOutputDirectoryPath); - } - - var sw = new StringWriter(); - // If TrySaveDGML returns false then Regex.SaveDGML is not supported (in Release build) - if (TrySaveDGML(regex, sw, bound, hideStateInfo, addDotStar, inReverse, onlyDFAinfo, maxLabelLength, asNFA)) - { - if (asNFA) - { - name = "NFA"; - } - - File.WriteAllText(Path.Combine(DgmlOutputDirectoryPath, $"{(inReverse ? name + "r" : (addDotStar ? name + "1" : name))}.dgml"), sw.ToString()); - } - } - private static long MeasureMatchTime(Regex re, string input, out Match match) { try @@ -124,6 +89,10 @@ private static string And(params string[] regexes) /// private static string Not(string regex) => $"(?({regex})[0-[0]]|.*)"; + /// + /// When is set to return true, outputs DGML diagrams for the specified pattern. + /// This is useful for understanding what graphs the NonBacktracking engine creates for the specified pattern. 
+ /// [Fact] public void ViewSampleRegexInDGML() { @@ -132,22 +101,37 @@ public void ViewSampleRegexInDGML() return; } + if (!Directory.Exists(DgmlOutputDirectoryPath)) + { + Directory.CreateDirectory(DgmlOutputDirectoryPath); + } + try { - //string rawregex = @"\bis\w*\b"; - string rawregex = And(".*[0-9].*[0-9].*", ".*[A-Z].*[A-Z].*", Not(".*(01|12).*")); - //string rawregex = "a.{4}$"; - Regex re = new Regex($@"{rawregex}", RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); - ViewDGML(re); - ViewDGML(re, inReverse: true); - ViewDGML(re, addDotStar: true); - ViewDGML(re, asNFA: true, bound: 12); - ViewDGML(re, inReverse: true, asNFA: true, bound: 12); - ViewDGML(re, addDotStar: true, asNFA: true, bound: 12); + /*lang=regex*/ + string pattern = @"abc|cd"; + + ViewDGML(pattern, "DFA"); + ViewDGML(pattern, "DFA_DotStar", addDotStar: true); + + ViewDGML(pattern, "NFA", nfa: true, maxStates: 12); + ViewDGML(pattern, "NFA_DotStar", nfa: true, addDotStar: true, maxStates: 12); + + static void ViewDGML(string pattern, string name, bool nfa = false, bool addDotStar = false, bool reverse = false, int maxStates = -1, int maxLabelLength = 20) + { + var regex = new Regex(pattern, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); + if (regex.GetType().GetMethod("SaveDGML", BindingFlags.NonPublic | BindingFlags.Instance) is MethodInfo saveDgml) + { + var sw = new StringWriter(); + saveDgml.Invoke(regex, new object[] { sw, nfa, addDotStar, reverse, maxStates, maxLabelLength }); + string path = Path.Combine(DgmlOutputDirectoryPath, $"{name}.dgml"); + File.WriteAllText(path, sw.ToString()); + Console.WriteLine(path); + } + } } - catch (NotSupportedException e) + catch (NotSupportedException e) when (e.Message.Contains("conditional")) { - Assert.Contains("conditional", e.Message); } } @@ -160,7 +144,7 @@ public void TestDGMLGeneration(string pattern, int explorationbound, string[] ex { StringWriter sw = new StringWriter(); var re = new 
Regex(pattern, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); - if (TrySaveDGML(re, writer: sw, bound: explorationbound, inReverse: exploreInReverse, asNFA: exploreAsNFA)) + if (TrySaveDGML(re, sw, exploreAsNFA, addDotStar: false, exploreInReverse, explorationbound, maxLabelLength: -1)) { string str = sw.ToString(); Assert.StartsWith("", str); @@ -171,12 +155,12 @@ public void TestDGMLGeneration(string pattern, int explorationbound, string[] ex } } - static bool TrySaveDGML(Regex regex, TextWriter writer, int bound = -1, bool hideStateInfo = false, bool addDotStar = false, bool inReverse = false, bool onlyDFAinfo = false, int maxLabelLength = -1, bool asNFA = false) + static bool TrySaveDGML(Regex regex, TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength) { MethodInfo saveDgml = regex.GetType().GetMethod("SaveDGML", BindingFlags.NonPublic | BindingFlags.Instance); if (saveDgml is not null) { - saveDgml.Invoke(regex, new object[] { writer, bound, hideStateInfo, addDotStar, inReverse, onlyDFAinfo, maxLabelLength, asNFA }); + saveDgml.Invoke(regex, new object[] { writer, nfa, addDotStar, reverse, maxStates, maxLabelLength }); return true; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexRunnerTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexRunnerTests.cs index 82f2ae5a0336b..b2cd20814c0fa 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexRunnerTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexRunnerTests.cs @@ -13,7 +13,7 @@ public class RegexRunnerTests [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] public async Task EnginesThrowNotImplementedForGoAndFFC(RegexEngine engine) { - Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc"); + Regex re = await RegexHelpers.GetRegexAsync(engine, 
@"abc"); // Use reflection to ensure the runner is created so it can be fetched. MethodInfo createRunnerMethod = typeof(Regex).GetMethod("CreateRunner", BindingFlags.Instance | BindingFlags.NonPublic); @@ -34,7 +34,7 @@ public async Task EnginesThrowNotImplementedForGoAndFFC(RegexEngine engine) [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] public async Task EnsureRunmatchValueIsNulledAfterIsMatch(RegexEngine engine) { - Regex re = await RegexHelpers.GetRegexAsync(engine, /*lang=regex*/@"abc"); + Regex re = await RegexHelpers.GetRegexAsync(engine, @"abc"); // First call IsMatch which should initialize runmatch on the runner. Assert.True(re.IsMatch("abcabcabc")); diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj index 35707651e3b74..4162d7b75fd26 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj @@ -38,6 +38,7 @@ +