Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Two-phase matching algorithm for NonBacktracking #68199

Merged
merged 4 commits into from
Apr 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ internal int FixedLength
/// <summary>If true then the state is a dead-end, rejects all inputs.</summary>
internal bool IsNothing => Node.IsNothing;

/// <summary>If true then state starts with a ^ or $ or \A or \z or \Z</summary>
/// <summary>If true then state starts with a ^ or $ or \Z</summary>
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;

/// <summary>
Expand Down Expand Up @@ -134,7 +134,9 @@ internal DfaMatchingState<TSet> Next(TSet minterm)
// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new id for it
list.Add((Node._builder.CreateState(node, nextCharKind, capturing: true), effects));
DfaMatchingState<TSet> state = Node._builder.CreateState(node, nextCharKind, capturing: true);
if (!state.IsDeadend)
list.Add((state, effects));
}
return list;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ internal DfaExplorer(SymbolicRegexMatcher<TSet> srm, bool nfa, bool addDotStar,
{
_builder = srm._builder;
uint startId = reverse ?
(srm._reversePattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0) :
(srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0);
(srm._reversePattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0) :
(srm._pattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0);

// Create the initial state
_initialState = _builder.CreateState(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>
internal readonly SymbolicRegexNode<TSet> _nothing;
internal readonly SymbolicRegexNode<TSet> _anyChar;
internal readonly SymbolicRegexNode<TSet> _anyStar;
internal readonly SymbolicRegexNode<TSet> _anyStarLazy;

private SymbolicRegexNode<TSet>? _epsilon;
internal SymbolicRegexNode<TSet> Epsilon => _epsilon ??= SymbolicRegexNode<TSet>.CreateEpsilon(this);
Expand Down Expand Up @@ -173,6 +174,7 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
_nothing = SymbolicRegexNode<TSet>.CreateFalse(this);
_anyChar = SymbolicRegexNode<TSet>.CreateTrue(this);
_anyStar = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: false);
_anyStarLazy = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: true);

// --- initialize singletonCache ---
_singletonCache[_solver.Empty] = _nothing;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,14 @@ namespace System.Text.RegularExpressions.Symbolic
private const uint IsLazyMask = 4;
private const uint CanBeNullableMask = 8;
private const uint ContainsSomeAnchorMask = 16;
private const uint ContainsLineAnchorMask = 32;
private const uint ContainsSomeCharacterMask = 64;
private const uint StartsWithBoundaryAnchorMask = 128;
private const uint StartsWithSomeAnchorMask = 32;

private readonly uint _info;

private SymbolicRegexInfo(uint i) => _info = i;

internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false,
bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false,
bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true)
internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false,
bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isLazy = true)
{
uint i = 0;

Expand All @@ -35,31 +32,21 @@ internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool can
}
}

if (startsWithLineAnchor || containsLineAnchor || startsWithBoundaryAnchor || containsSomeAnchor)
if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor)
{
i |= ContainsSomeAnchorMask;

if (startsWithLineAnchor || containsLineAnchor)
if (startsWithLineAnchor)
{
i |= ContainsLineAnchorMask;

if (startsWithLineAnchor)
{
i |= StartsWithLineAnchorMask;
}
i |= StartsWithLineAnchorMask;
}

if (startsWithBoundaryAnchor)
if (startsWithLineAnchor || startsWithSomeAnchor)
{
i |= StartsWithBoundaryAnchorMask;
i |= StartsWithSomeAnchorMask;
}
Comment on lines +39 to 47
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit:
To retain the style of the rest of the checks, this could be:

if (startsWithLineAnchor || startsWithSomeAnchor)
{
    i |= StartsWithSomeAnchorMask;

    if (startsWithLineAnchor)
    {
        i |= StartsWithLineAnchorMask;
    }
}

}

if (containsSomeCharacter)
{
i |= ContainsSomeCharacterMask;
}

if (isLazy)
{
i |= IsLazyMask;
Expand All @@ -72,18 +59,12 @@ internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool can

public bool CanBeNullable => (_info & CanBeNullableMask) != 0;

public bool StartsWithSomeAnchor => (_info & (StartsWithLineAnchorMask | StartsWithBoundaryAnchorMask)) != 0;

public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;

public bool StartsWithBoundaryAnchor => (_info & StartsWithBoundaryAnchorMask) != 0;
public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;

public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;

public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;

public bool ContainsSomeCharacter => (_info & ContainsSomeCharacterMask) != 0;

public bool IsLazy => (_info & IsLazyMask) != 0;

public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos)
Expand Down Expand Up @@ -121,20 +102,14 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos)
return new SymbolicRegexInfo(i);
}

public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info)
{
bool isNullable = left_info.IsNullable && right_info.IsNullable;
bool canBeNullable = left_info.CanBeNullable && right_info.CanBeNullable;
bool isLazy = left_info.IsLazy && right_info.IsLazy;

bool startsWithLineAnchor = left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor);
bool startsWithBoundaryAnchor = left_info.StartsWithBoundaryAnchor || (left_info.CanBeNullable && right_info.StartsWithBoundaryAnchor);
bool containsSomeAnchor = left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor;
bool containsLineAnchor = left_info.ContainsLineAnchor || right_info.ContainsLineAnchor;
bool containsSomeCharacter = left_info.ContainsSomeCharacter || right_info.ContainsSomeCharacter;

return Create(isNullable, canBeNullable, startsWithLineAnchor, startsWithBoundaryAnchor, containsSomeAnchor, containsLineAnchor, containsSomeCharacter, isLazy);
}
public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) =>
Create(
isAlwaysNullable: left_info.IsNullable && right_info.IsNullable,
canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable,
startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor),
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isLazy: left_info.IsLazy && right_info.IsLazy);

public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound, bool isLazy)
{
Expand Down Expand Up @@ -171,10 +146,7 @@ public static SymbolicRegexInfo Not(SymbolicRegexInfo info) =>
Create(isAlwaysNullable: !info.CanBeNullable,
canBeNullable: !info.IsNullable,
startsWithLineAnchor: info.StartsWithLineAnchor,
startsWithBoundaryAnchor: info.StartsWithBoundaryAnchor,
containsSomeAnchor: info.ContainsSomeAnchor,
containsLineAnchor: info.ContainsLineAnchor,
containsSomeCharacter: info.ContainsSomeCharacter,
isLazy: info.IsLazy);

public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i);
Expand Down
Loading