Skip to content

Commit

Permalink
Add tests for CharacterClass, fix character escaping bugs, optimize f…
Browse files Browse the repository at this point in the history
…or duplicates, and error if ] is not escaped.
  • Loading branch information
lilith committed Jun 12, 2024
1 parent f8faf12 commit a7efb1a
Show file tree
Hide file tree
Showing 2 changed files with 172 additions and 15 deletions.
120 changes: 105 additions & 15 deletions src/Imazen.Routing/Matching/CharacterClass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System.Diagnostics.CodeAnalysis;
using System.Text;
using Imazen.Abstractions.Internal;
using Imazen.Routing.Helpers;

namespace Imazen.Routing.Matching;

Expand All @@ -14,6 +15,50 @@ public record CharacterClass(
ReadOnlyCollection<CharacterClass.CharRange> Ranges,
ReadOnlyCollection<char> Characters)
{

/// <summary>
/// Builds a character class that matches the same characters as this one, plus the
/// opposite-case counterpart of every ASCII letter (A-Z / a-z) it contains.
/// Non-ASCII characters are carried over unchanged; <see cref="IsNegated"/> is preserved.
/// </summary>
/// <returns>A new <see cref="CharacterClass"/>; this instance is not modified.</returns>
public CharacterClass ToOrdinalCaseInsensitive()
{
    // Distance between an ASCII uppercase letter and its lowercase form ('a' - 'A' == 32).
    const int caseOffset = 'a' - 'A';

    var newRanges = new List<CharRange>(Ranges.Count * 2);
    foreach (var range in Ranges)
    {
        // Always keep the original range: case-insensitivity widens the class, it never
        // swaps one case for the other.
        newRanges.AddIfUnique(range);

        // Add the lowercase counterpart of whatever part of the range overlaps 'A'..'Z'.
        // Clamping handles ranges that only partially overlap the letters (e.g. 'X'-'c').
        var upperStart = range.Start > 'A' ? range.Start : 'A';
        var upperEnd = range.End < 'Z' ? range.End : 'Z';
        if (upperStart <= upperEnd)
        {
            newRanges.AddIfUnique(new CharRange((char)(upperStart + caseOffset), (char)(upperEnd + caseOffset)));
        }

        // Add the uppercase counterpart of whatever part of the range overlaps 'a'..'z'.
        var lowerStart = range.Start > 'a' ? range.Start : 'a';
        var lowerEnd = range.End < 'z' ? range.End : 'z';
        if (lowerStart <= lowerEnd)
        {
            newRanges.AddIfUnique(new CharRange((char)(lowerStart - caseOffset), (char)(lowerEnd - caseOffset)));
        }
    }

    var newChars = new List<char>(Characters.Count * 2);
    foreach (var c in Characters)
    {
        // Keep the original character unless one of the (expanded) ranges already covers it.
        if (!WithinRanges(c, newRanges))
        {
            newChars.AddIfUnique(c);
        }

        // Work out the opposite-case counterpart, if the character is an ASCII letter.
        char counterpart;
        if (c is >= 'A' and <= 'Z')
        {
            counterpart = (char)(c + caseOffset);
        }
        else if (c is >= 'a' and <= 'z')
        {
            counterpart = (char)(c - caseOffset);
        }
        else
        {
            continue;
        }

        if (!WithinRanges(counterpart, newRanges))
        {
            newChars.AddIfUnique(counterpart);
        }
    }

    return new CharacterClass(IsNegated, new ReadOnlyCollection<CharRange>(newRanges), new ReadOnlyCollection<char>(newChars));
}
public override string ToString()
{
// 5 is our guess on escaped chars
Expand Down Expand Up @@ -58,6 +103,15 @@ private bool WithinSet(char c)
}
return false;
}
/// <summary>
/// Reports whether <paramref name="c"/> lies inside at least one of the supplied ranges.
/// Both ends of each range are inclusive.
/// </summary>
/// <param name="c">The character to look for.</param>
/// <param name="ranges">Ranges to search; <c>null</c> is treated as "no ranges".</param>
/// <returns><c>true</c> when some range contains <paramref name="c"/>; otherwise <c>false</c>.</returns>
private static bool WithinRanges(char c, ICollection<CharRange>? ranges)
{
    if (ranges is not null)
    {
        foreach (var candidate in ranges)
        {
            // Skip ranges that start after c or end before it.
            if (c < candidate.Start || c > candidate.End)
            {
                continue;
            }

            return true;
        }
    }

    return false;
}


private static ulong HashSpan(ReadOnlySpan<char> span)
Expand Down Expand Up @@ -87,8 +141,15 @@ private record ParseResult(bool Success, string Input, CharacterClass? Result, s
}
return default;
}



/// <summary>
/// Throwing counterpart of <see cref="TryParseInterned"/>: parses <paramref name="syntax"/>
/// (passing <c>true</c> for <c>storeIfMissing</c>) and returns the resulting class.
/// </summary>
/// <param name="syntax">The character class syntax to parse.</param>
/// <returns>The parsed <see cref="CharacterClass"/>.</returns>
/// <exception cref="ArgumentException">The syntax could not be parsed; the message carries the parser's error.</exception>
public static CharacterClass ParseInterned(string syntax)
{
    if (TryParseInterned(syntax.AsMemory(), storeIfMissing: true, out var parsed, out var error))
    {
        return parsed;
    }

    throw new ArgumentException($"Failed to parse character class '{syntax}': {error}");
}
public static bool TryParseInterned(ReadOnlyMemory<char> syntax, bool storeIfMissing,
[NotNullWhen(true)] out CharacterClass? result,
[NotNullWhen(false)] out string? error)
Expand Down Expand Up @@ -172,7 +233,21 @@ private readonly record struct LexToken(LexTokenType Type, char Value)
// Letters that, when preceded by a backslash, look like regex shorthand classes/anchors
// (\d \D \s \S \w \W \b \B).
// NOTE(review): the lexer code that consults this list is not visible here. The tests reject
// "[\d]" and "[\s]" but accept "[\w]", so \w is presumably recognised as a predefined class
// before this list is checked — confirm against LexInner.
private static readonly char[] SuspiciousCharsToEscape = ['d', 'D', 's', 'S', 'w', 'W', 'b', 'B'];

private static readonly char[] ValidCharsToEscape =
['t', 'n', 'r', 'f', 'v', '0', '[', ']', '\\', '-', '^', ',', '(', ')', '{', '}', '|'];
['t', 'n', 'r', 'f', 'v', '0', '[', ']', '\\', '-', '^', ',', '(', ')', '{', '}', '|', '.','+','*', '/', '?'];

/// <summary>
/// Translates the character that follows a backslash into the character it stands for.
/// The letters t, n, r, f, v and the digit 0 become the matching control character;
/// every other character is returned as-is.
/// </summary>
/// <param name="c">The character that followed the backslash.</param>
/// <returns>The character the escape sequence represents.</returns>
private static char MapEscapedChar(char c)
{
    switch (c)
    {
        case 't':
            return '\t';
        case 'n':
            return '\n';
        case 'r':
            return '\r';
        case 'f':
            return '\f';
        case 'v':
            return '\v';
        case '0':
            return '\0';
        default:
            // Punctuation escapes such as \[ or \- simply stand for themselves.
            return c;
    }
}

// Reusable token of type ControlDash with value '-'. TryParseInner finds range separators
// by searching the lexed token list for this value (tokens.IndexOf(ControlDashToken)).
private static readonly LexToken ControlDashToken = new LexToken(LexTokenType.ControlDash, '-');

Expand All @@ -184,9 +259,11 @@ private static IEnumerable<LexToken> LexInner(ReadOnlyMemory<char> syntax)
var c = syntax.Span[i];
if (c == '\\')
{
if (i == syntax.Length - 1)
if (i + 1 >= syntax.Length)
{
yield return new LexToken(LexTokenType.DanglingEscape, '\\');
i++;
continue;
}
var c2 = syntax.Span[i + 1];

Expand All @@ -199,7 +276,8 @@ private static IEnumerable<LexToken> LexInner(ReadOnlyMemory<char> syntax)

if (ValidCharsToEscape.Contains(c2))
{
yield return new LexToken(LexTokenType.EscapedCharacter, c2);

yield return new LexToken(LexTokenType.EscapedCharacter, MapEscapedChar(c2));
i += 2;
continue;
}
Expand Down Expand Up @@ -245,6 +323,7 @@ private static bool TryParseInner(bool negated, ReadOnlyMemory<char> syntax,

var tokens = LexInner(syntax).ToList();
// Reject if we have dangling escape, incorrectly escaped character, or suspicious escaped character

if (tokens.Any(t => t.Type is LexTokenType.DanglingEscape))
{
error = "Dangling backslash in character class";
Expand All @@ -268,7 +347,13 @@ private static bool TryParseInner(bool negated, ReadOnlyMemory<char> syntax,
result = default;
return false;
}

// Also if we have a SingleCharacter ] in the middle of the syntax
if (tokens.Any(t => t.Type is LexTokenType.SingleCharacter && t.Value == ']'))
{
error = "Character ']' cannot be used in the middle of a character class unless escaped";
result = default;
return false;
}
// Search for ranges
int indexOfDash = tokens.IndexOf(ControlDashToken);
while (indexOfDash != -1)
Expand All @@ -295,7 +380,7 @@ private static bool TryParseInner(bool negated, ReadOnlyMemory<char> syntax,
}

ranges ??= new List<CharRange>();
ranges.Add(new CharRange(start, end));
ranges.AddIfUnique(new CharRange(start, end));
// Mutate the collection and re-search
tokens.RemoveRange(indexOfDash - 1, 3);
indexOfDash = tokens.IndexOf(ControlDashToken);
Expand All @@ -306,21 +391,20 @@ private static bool TryParseInner(bool negated, ReadOnlyMemory<char> syntax,
{
if (token.Type is LexTokenType.SingleCharacter or LexTokenType.EscapedCharacter)
{
characters ??= [];
characters.Add(token.Value);
characters ??= new List<char>();
characters.AddIfUnique(token.Value);

}
else if (token.Type is LexTokenType.PredefinedClass)
{
if (token.Value == 'w')
{
ranges ??= [];
ranges.AddRange(new[]
{
new CharRange('a', 'z'), new CharRange('A', 'Z'),
new CharRange('0', '9')
});
ranges.AddIfUnique(new CharRange('a', 'z'));
ranges.AddIfUnique(new CharRange('A', 'Z'));
ranges.AddIfUnique(new CharRange('0', '9'));
characters ??= [];
characters.Add('_');
characters.AddIfUnique('_');
}
else
{
Expand All @@ -332,6 +416,12 @@ private static bool TryParseInner(bool negated, ReadOnlyMemory<char> syntax,
throw new InvalidOperationException($"Unexpected token type {token.Type}");
}
}
// We remove chars that are within ranges
if (characters is not null)
{
characters.RemoveAll(c => WithinRanges(c, ranges));
}


characters ??= [];
ranges ??= [];
Expand Down
67 changes: 67 additions & 0 deletions tests/ImazenShared.Tests/Routing/Matching/CharacterClassTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
using System;
using Imazen.Routing.Matching;
using Xunit;

/// <summary>
/// Parsing tests for <see cref="CharacterClass"/>: syntaxes that must parse (and which
/// characters they must or must not contain) and syntaxes that must be rejected.
/// </summary>
public class CharacterClassTests
{
    /// <summary>
    /// Each syntax must parse without error, and <c>Contains</c> must return
    /// <paramref name="shouldMatch"/> for every character in <paramref name="testChars"/>.
    /// </summary>
    [Theory]
    [InlineData("[0-9]", true, "0123456789")]
    [InlineData("[a-zA-Z]", true, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")]
    [InlineData("[^/]", false, "/")]
    [InlineData(@"[\t\n\r]", true, "\t\n\r")]
    [InlineData(@"[\[\]\{\}\,]", true, "[]{},")]
    [InlineData("[0-9a-fA-F]", true, "0123456789abcdefABCDEF")]
    [InlineData("[\\w]", true, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")]
    [InlineData("[a-z0-9_]", true, "abcdefghijklmnopqrstuvwxyz0123456789_")]
    [InlineData("[^a-z]", false, "abcdefghijklmnopqrstuvwxyz")]
    [InlineData("[^0-9]", false, "0123456789")]
    [InlineData("[a-zA-Z0-9]", true, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")]
    [InlineData("[^\\w]", false, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")]
    [InlineData("[\\.]", true, ".")]
    [InlineData(@"[\+\-\*\/]", true, "+-*/")]
    [InlineData(@"[\(\)\[\]\{\}]", true, "()[]{}")]
    public void ValidCharacterClass_ShouldParseSuccessfully(string syntax, bool shouldMatch, string testChars)
    {
        // Act
        var success = CharacterClass.TryParse(syntax.AsMemory(), out var result, out var error);

        // Assert
        // Surface the parser's own error text so a failure explains itself.
        Assert.True(success, $"Expected '{syntax}' to parse, but it failed with: {error}");
        Assert.Null(error);
        Assert.NotNull(result);

        foreach (var c in testChars)
        {
            // Report the code point too: several test characters are control characters
            // (\t, \n, \r) that are invisible when printed directly.
            Assert.True(
                shouldMatch == result!.Contains(c),
                $"Expected Contains('{c}') (U+{(int)c:X4}) to be {shouldMatch} for '{syntax}'");
        }
    }

    /// <summary>
    /// Each syntax must be rejected: parsing returns false, reports an error, and yields no result.
    /// </summary>
    [Theory]
    [InlineData("[]")]
    [InlineData("[a-]")]
    [InlineData("[-a]")]
    [InlineData("[a--b]")]
    [InlineData("[z-a]")]
    [InlineData("[\\d]")]
    [InlineData("[\\s]")]
    [InlineData("[a\\]")]
    [InlineData("[a\\q]")]
    [InlineData("[a-z-A-Z]")]
    [InlineData("[0-9-a-z]")]
    [InlineData("[^]")]
    [InlineData("[^a-z-0-9]")]
    [InlineData("[a-z-^]")]
    [InlineData("[\\w-\\d]")]
    [InlineData("[a-z\\d-\\w]")]
    [InlineData("[a-z]|[a-z]")]
    public void InvalidCharacterClass_ShouldFailParsing(string syntax)
    {
        // Act
        var success = CharacterClass.TryParse(syntax.AsMemory(), out var result, out var error);

        // Assert
        Assert.False(success, $"Expected '{syntax}' to be rejected, but it parsed successfully");
        Assert.NotNull(error);
        Assert.Null(result);
    }
}

0 comments on commit a7efb1a

Please sign in to comment.