Skip to content

Commit

Permalink
improved BDD Unicode table representation in NonBacktracking engine (#…
Browse files Browse the repository at this point in the history
…61142)

* improved BDD Unicode table representation in NonBacktracking engine

* remove line

Co-authored-by: Dan Moseley <danmose@microsoft.com>

* improved bounds-check elimination

Co-authored-by: Stephen Toub <stoub@microsoft.com>

* clearer notation of numbers

Co-authored-by: Dan Moseley <danmose@microsoft.com>

* fixed typo

Co-authored-by: Dan Moseley <danmose@microsoft.com>
Co-authored-by: Stephen Toub <stoub@microsoft.com>
  • Loading branch information
3 people authored Nov 4, 2021
1 parent c02a37c commit b67f978
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 107 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ internal sealed class BDD : IComparable
/// </summary>
private static readonly long[] s_trueRepresentation = new long[] { 1 };

/// <summary>
/// Representation of False for compact serialization of BDDs.
/// </summary>
private static readonly byte[] s_falseRepresentationCompact = new byte[] { 0 };

/// <summary>
/// Representation of True for compact serialization of BDDs.
/// </summary>
private static readonly byte[] s_trueRepresentationCompact = new byte[] { 1 };

internal BDD(int ordinal, BDD? one, BDD? zero)
{
One = one;
Expand Down Expand Up @@ -300,6 +310,49 @@ public long[] Serialize()
return res;
}

/// <summary>
/// Serialize this BDD into a byte array.
/// This method is not valid for MTBDDs where some elements may be negative.
/// </summary>
public byte[] SerializeToBytes()
{
if (IsEmpty)
return s_falseRepresentationCompact;

if (IsFull)
return s_trueRepresentationCompact;

// in other cases make use of the general serializer to long[]
long[] serialized = Serialize();

// get the maximal element from the array
long m = 0;
for (int i = 0; i < serialized.Length; i++)
{
// make sure this serialization is not applied to MTBDDs
Debug.Assert(serialized[i] > 0);
m = Math.Max(serialized[i], m);
}

// k is the number of bytes needed to represent the maximal element
int k = m <= 0xFFFF ? 2 : (m <= 0xFF_FFFF ? 3 : (m <= 0xFFFF_FFFF ? 4 : (m <= 0xFF_FFFF_FFFF ? 5 : (m <= 0xFFFF_FFFF_FFFF ? 6 : (m <= 0xFF_FFFF_FFFF_FFFF ? 7 : 8)))));

// the result will contain k as the first element and the number of serialized elements times k
byte[] result = new byte[(k * serialized.Length) + 1];
result[0] = (byte)k;
for (int i=0; i < serialized.Length; i += 1)
{
long serialized_i = serialized[i];
// add the serialized longs as k-byte subsequences
for (int j = 1; j <= k; j++)
{
result[(i * k) + j] = (byte)serialized_i;
serialized_i = serialized_i >> 8;
}
}
return result;
}

/// <summary>
/// Recreates a BDD from a ulong array that has been created using Serialize.
/// Is executed using a lock on algebra (if algebra != null) in a single thread mode.
Expand Down Expand Up @@ -353,6 +406,70 @@ public static BDD Deserialize(long[] arcs, BDDAlgebra algebra)
return nodes[k - 1];
}

/// <summary>
/// Recreates a BDD from a byte array that has been created using SerializeToBytes.
/// </summary>
public static BDD Deserialize(byte[] bytes, BDDAlgebra algebra)
{
if (bytes.Length == 1)
{
return bytes[0] == 0 ? False : True;
}

// here bytes represents an array of longs with k = the number of bytes used per long
int k = (int)bytes[0];

// gets the i'th element from the underlying array of longs represented by bytes
long Get(int i)
{
long l = 0;
for (int j = k; j > 0; j--)
{
l = (l << 8) | bytes[(k * i) + j];
}
return l;
}

// n is the total nr of longs that corresponds also to the total number of BDD nodes needed
int n = (bytes.Length - 1) / k;

// make sure the represented nr of longs divides precisely without remainder
Debug.Assert((bytes.Length - 1) % k == 0);

// the number of bits used for ordinals and node identifiers are stored in the first two longs
int ordinal_bits = (int)Get(0);
int node_bits = (int)Get(1);

// create bit masks for the sizes of ordinals and node identifiers
long ordinal_mask = (1 << ordinal_bits) - 1;
long node_mask = (1 << node_bits) - 1;
BitLayout(ordinal_bits, node_bits, out int zero_node_shift, out int one_node_shift, out int ordinal_shift);

// store BDD nodes by their id when they are created
BDD[] nodes = new BDD[n];
nodes[0] = False;
nodes[1] = True;

for (int i = 2; i < n; i++)
{
// represents the triple (ordinal, one, zero)
long arc = Get(i);

// reconstruct the ordinal and child identifiers for a non-terminal
int ord = (int)((arc >> ordinal_shift) & ordinal_mask);
int oneId = (int)((arc >> one_node_shift) & node_mask);
int zeroId = (int)((arc >> zero_node_shift) & node_mask);

// the BDD nodes for the children are guaranteed to exist already
// because of the topological order they were serialized by
Debug.Assert(oneId < i && zeroId < i);
nodes[i] = algebra.GetOrCreateBDD(ord, nodes[oneId], nodes[zeroId]);
}

//the result is the last BDD in the nodes array
return nodes[n - 1];
}

/// <summary>
/// Use this bit layout in the serialization
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ namespace System.Text.RegularExpressions.Symbolic
/// </summary>
internal sealed class CharSetSolver : BDDAlgebra, ICharAlgebra<BDD>
{
/// <summary>BDDs for all characters for fast lookup.</summary>
private readonly BDD[] _charPredTable = new BDD[char.MaxValue + 1];
/// <summary>BDDs for all ASCII characters for fast lookup.</summary>
private readonly BDD[] _charPredTable = new BDD[128];
private readonly Unicode.IgnoreCaseTransformer _ignoreCase;
internal readonly BDD _nonAscii;

Expand All @@ -40,10 +40,23 @@ public BDD CharConstraint(char c, bool ignoreCase = false, string? culture = nul
else
{
//individual character BDDs are always fixed
return _charPredTable[c] ??= CreateSetFrom(c, 15);
BDD[] charPredTable = _charPredTable;
return c < charPredTable.Length ?
charPredTable[c] ??= CreateBDDFromChar(c) :
CreateBDDFromChar(c);
}
}

private BDD CreateBDDFromChar(ushort c)
{
BDD bdd = BDD.True;
for (int k = 0; k < 16; k++)
{
bdd = (c & (1 << k)) == 0 ? GetOrCreateBDD(k, BDD.False, bdd) : GetOrCreateBDD(k, bdd, BDD.False);
}
return bdd;
}

/// <summary>
/// Make a CharSet from all the characters in the range from m to n.
/// Returns the empty set if n is less than m
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ public static void WriteInt64ArrayInitSyntax(StreamWriter sw, long[] values)
}
sw.Write("}");
}

public static void WriteByteArrayInitSyntax(StreamWriter sw, byte[] values)
{
sw.Write("new byte[] {");
for (int i = 0; i < values.Length; i++)
{
sw.Write($" 0x{values[i]:X}, ");
}
sw.Write("}");
}
}
#endif
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -41,70 +41,54 @@ private static void WriteIgnoreCaseBDD(StreamWriter sw)
sw.WriteLine(" /// <summary>Serialized BDD for mapping characters to their case-ignoring equivalence classes in the default (en-US) culture.</summary>");

var solver = new CharSetSolver();
Dictionary<char, BDD> ignoreCase = ComputeIgnoreCaseDictionary(solver, new CultureInfo(DefaultCultureName));
List<EquivalenceClass> ignoreCaseEquivalenceClasses = ComputeIgnoreCaseEquivalenceClasses(solver, new CultureInfo(DefaultCultureName));
BDD ignorecase = solver.False;
foreach (KeyValuePair<char, BDD> kv in ignoreCase)
foreach (EquivalenceClass ec in ignoreCaseEquivalenceClasses)
{
BDD a = solver.CreateCharSetFromRange(kv.Key, kv.Key);
BDD b = kv.Value;
ignorecase = solver.Or(ignorecase, solver.And(solver.ShiftLeft(a, 16), b));
// Create the Cartesian product of ec._set with itself
BDD crossproduct = solver.And(solver.ShiftLeft(ec._set, 16), ec._set);
// Add the product into the overall lookup table
ignorecase = solver.Or(ignorecase, crossproduct);
}

sw.Write(" public static readonly long[] IgnoreCaseEnUsSerializedBDD = ");
GeneratorHelper.WriteInt64ArrayInitSyntax(sw, ignorecase.Serialize());
sw.Write(" public static readonly byte[] IgnoreCaseEnUsSerializedBDD = ");
GeneratorHelper.WriteByteArrayInitSyntax(sw, ignorecase.SerializeToBytes());
sw.WriteLine(";");
}

private static Dictionary<char, BDD> ComputeIgnoreCaseDictionary(CharSetSolver solver, CultureInfo culture)
private static List<EquivalenceClass> ComputeIgnoreCaseEquivalenceClasses(CharSetSolver solver, CultureInfo culture)
{
CultureInfo originalCulture = CultureInfo.CurrentCulture;
try
{
CultureInfo.CurrentCulture = culture;
var ignoreCase = new Dictionary<char, EquivalenceClass>();
var sets = new List<EquivalenceClass>();

var ignoreCase = new Dictionary<char, BDD>();
for (uint i = 65; i <= 0xFFFF; i++)
{
char C = (char)i;
char c = char.ToLower(C, culture);

for (uint i = 0; i <= 0xFFFF; i++)
if (c == C)
{
char c = (char)i;
char cUpper = char.ToUpper(c);
char cLower = char.ToLower(c);

if (cUpper == cLower)
{
continue;
}

// c may be different from both cUpper as well as cLower.
// Make sure that the regex engine considers c as being equivalent to cUpper and cLower, else ignore c.
// In some cases c != cU but the regex engine does not consider the chacarters equivalent wrt the ignore-case option.
if (Regex.IsMatch($"{cUpper}{cLower}", $"^(?i:\\u{i:X4}\\u{i:X4})$"))
{
BDD equiv = solver.False;

if (ignoreCase.ContainsKey(c))
equiv = solver.Or(equiv, ignoreCase[c]);

if (ignoreCase.ContainsKey(cUpper))
equiv = solver.Or(equiv, ignoreCase[cUpper]);

if (ignoreCase.ContainsKey(cLower))
equiv = solver.Or(equiv, ignoreCase[cLower]);

// Make sure all characters are included initially or when some is still missing
equiv = solver.Or(equiv, solver.Or(solver.CreateCharSetFromRange(c, c), solver.Or(solver.CreateCharSetFromRange(cUpper, cUpper), solver.CreateCharSetFromRange(cLower, cLower))));

// Update all the members with their case-invariance equivalence classes
foreach (char d in solver.GenerateAllCharacters(equiv))
ignoreCase[d] = equiv;
}
continue;
}

return ignoreCase;
EquivalenceClass? ec;
if (!ignoreCase.TryGetValue(c, out ec))
{
ec = new EquivalenceClass(solver.CharConstraint(c));
ignoreCase[c] = ec;
sets.Add(ec);
}
ec._set = solver.Or(ec._set, solver.CharConstraint(C));
}
finally
return sets;
}

private class EquivalenceClass
{
public BDD _set;
public EquivalenceClass(BDD set)
{
CultureInfo.CurrentCulture = originalCulture;
_set = set;
}
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ private IgnoreCaseRelation EnsureDefault()
if (_relationDefault is null)
{
BDD instance = BDD.Deserialize(Unicode.IgnoreCaseRelation.IgnoreCaseEnUsSerializedBDD, _solver);
byte[] tmp = instance.SerializeToBytes();
BDD instance2 = BDD.Deserialize(tmp, _solver);
if (instance != instance2)
throw new Exception();
BDD instanceDomain = _solver.ShiftRight(instance, 16); // represents the set of all case-sensitive characters in the default culture.
_relationDefault = new IgnoreCaseRelation(instance, instanceDomain);
}
Expand Down
Loading

0 comments on commit b67f978

Please sign in to comment.