Skip to content

Commit

Permalink
Fix bug with captures groups.
Browse files Browse the repository at this point in the history
When extracting the data matching capture groups we'd take it from the
beginning of the stream, not the beginning of the current view, even
though the latter is what we are matching against.

Closes #1846.

(cherry picked from commit 5f14c36)

# Conflicts:
#	spicy/toolchain/src/compiler/codegen/parsers/literals.cc
  • Loading branch information
rsmmr authored and bbannier committed Oct 1, 2024
1 parent 2222444 commit ca5ded1
Show file tree
Hide file tree
Showing 21 changed files with 266 additions and 233 deletions.
2 changes: 1 addition & 1 deletion hilti/lib/hilti.hlt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public type Captures = vector<bytes>;
public type Profiler = __library_type("hilti::rt::Profiler");

public type MatchState = struct {
method Captures captures(stream data);
method Captures captures(view<stream> data);
} &cxxname="hilti::rt::regexp::MatchState";

public type StreamStatistics = struct {
Expand Down
2 changes: 1 addition & 1 deletion hilti/runtime/include/types/regexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ class MatchState {
* support for that, or when matching has not finished
* successfully), the return vector will be empty.
*/
Captures captures(const Stream& data) const;
Captures captures(const stream::View& data) const;

private:
// Returns (rc, bytes-consumed). Note that the latter can be negative if
Expand Down
16 changes: 8 additions & 8 deletions hilti/runtime/src/tests/regexp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ TEST_CASE("advance") {
{
auto ms_std_1 = re_std.tokenMatcher();
CHECK_EQ(ms_std_1.advance("Xa"_b, false), std::make_tuple(0, 0));
CHECK_EQ(ms_std_1.captures(Stream("XabbbcdefgX"_b)), Vector<Bytes>());
CHECK_EQ(ms_std_1.captures(Stream("XabbbcdefgX"_b).view()), Vector<Bytes>());
}

{
Expand All @@ -247,13 +247,13 @@ TEST_CASE("advance") {
CHECK_EQ(ms_std_2.advance("bc"_b, false), std::make_tuple(-1, 2));
CHECK_EQ(ms_std_2.advance("de"_b, false), std::make_tuple(-1, 2));
CHECK_EQ(ms_std_2.advance("fgX"_b, true), std::make_tuple(1, 2));
CHECK_EQ(ms_std_2.captures(Stream("abbbcdefg"_b)), Vector<Bytes>({"abbbcdefg"_b, "bbb"_b, "def"_b}));
CHECK_EQ(ms_std_2.captures(Stream("abbbcdefg"_b).view()), Vector<Bytes>({"abbbcdefg"_b, "bbb"_b, "def"_b}));
}

{
auto ms_no_sub_1 = re_no_sub.tokenMatcher();
CHECK_EQ(ms_no_sub_1.advance("Xa"_b, false), std::make_tuple(0, 0));
CHECK_EQ(ms_no_sub_1.captures(Stream("XabbbcdefgX"_b)), Vector<Bytes>());
CHECK_EQ(ms_no_sub_1.captures(Stream("XabbbcdefgX"_b).view()), Vector<Bytes>());
}

{
Expand All @@ -263,7 +263,7 @@ TEST_CASE("advance") {
CHECK_EQ(ms_no_sub_2.advance("bc"_b, false), std::make_tuple(-1, 2));
CHECK_EQ(ms_no_sub_2.advance("de"_b, false), std::make_tuple(-1, 2));
CHECK_EQ(ms_no_sub_2.advance("fgX"_b, true), std::make_tuple(1, 2));
CHECK_EQ(ms_no_sub_2.captures(Stream("XabbbcdefgX"_b)), Vector<Bytes>());
CHECK_EQ(ms_no_sub_2.captures(Stream("XabbbcdefgX"_b).view()), Vector<Bytes>());
}

// Check that patterns stop when current match cannot be possible expanded anymore.
Expand All @@ -284,27 +284,27 @@ TEST_CASE("advance") {
{
auto ms_std_1 = re_std.tokenMatcher();
CHECK_EQ(ms_std_1.advance("Xabbc"_b, false), std::make_tuple(0, 0));
CHECK_EQ(ms_std_1.captures(Stream("XabbcyX"_b)), Vector<Bytes>({}));
CHECK_EQ(ms_std_1.captures(Stream("XabbcyX"_b).view()), Vector<Bytes>({}));
}

{
auto ms_std_2 = re_std.tokenMatcher();
CHECK_EQ(ms_std_2.advance("abbc"_b, false), std::make_tuple(-1, 4));
CHECK_EQ(ms_std_2.advance("yX"_b, true), std::make_tuple(20, 1));
CHECK_EQ(ms_std_2.captures(Stream("abbcyX"_b)), Vector<Bytes>({"abbcy"_b, "bbcy"_b}));
CHECK_EQ(ms_std_2.captures(Stream("abbcyX"_b).view()), Vector<Bytes>({"abbcy"_b, "bbcy"_b}));
}

{
auto ms_no_sub_1 = re_no_sub.tokenMatcher();
CHECK_EQ(ms_no_sub_1.advance("Xabbc"_b, false), std::make_tuple(0, 0));
CHECK_EQ(ms_no_sub_1.captures(Stream("XabbcyX"_b)), Vector<Bytes>({}));
CHECK_EQ(ms_no_sub_1.captures(Stream("XabbcyX"_b).view()), Vector<Bytes>({}));
}

{
auto ms_no_sub_2 = re_no_sub.tokenMatcher();
CHECK_EQ(ms_no_sub_2.advance("abbc"_b, false), std::make_tuple(-1, 4));
CHECK_EQ(ms_no_sub_2.advance("yX"_b, true), std::make_tuple(20, 1));
CHECK_EQ(ms_no_sub_2.captures(Stream("abbcyX"_b)), Vector<Bytes>({}));
CHECK_EQ(ms_no_sub_2.captures(Stream("abbcyX"_b).view()), Vector<Bytes>({}));
}
}

Expand Down
4 changes: 2 additions & 2 deletions hilti/runtime/src/types/regexp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ std::pair<int32_t, int64_t> regexp::MatchState::_advance(const stream::View& dat
return std::make_pair(_pimpl->_acc, _pimpl->_ms.offset - start_ms_offset);
}

regexp::Captures regexp::MatchState::captures(const Stream& data) const {
regexp::Captures regexp::MatchState::captures(const stream::View& data) const {
if ( _pimpl->_re->_flags.no_sub || _pimpl->_acc <= 0 || ! _pimpl->_done )
return Captures();

Expand All @@ -213,7 +213,7 @@ regexp::Captures regexp::MatchState::captures(const Stream& data) const {
// internally as well: if not both are set, just skip (and
// don't count) the group.
if ( groups[i].rm_so >= 0 || groups[i].rm_eo >= 0 )
captures.emplace_back(data.view(false).sub(groups[i].rm_so, groups[i].rm_eo).data());
captures.emplace_back(data.sub(groups[i].rm_so, groups[i].rm_eo).data());
}
}

Expand Down
14 changes: 13 additions & 1 deletion spicy/toolchain/src/compiler/codegen/parser-builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -585,11 +585,19 @@ struct ProductionVisitor : public production::Visitor {
std::optional<Expression*> pre_container_offset;
std::optional<PathTracker> path_tracker;
Expression* profiler = nullptr;
std::shared_ptr<Builder> block;

if ( is_field_owner ) {
path_tracker = PathTracker(&_path, field->id());
auto offset = builder()->memberCall(state().cur, "offset");
profiler = builder()->startProfiler(fmt("spicy/unit/%s", hilti::util::join(_path, "::")), offset);

if ( ! field->isAnonymous() ) {
// Set up a per-field block for scoping.
block = builder()->addBlock();
pushBuilder(block);
}

pre_container_offset = preParseField(*p, meta);
}

Expand Down Expand Up @@ -636,9 +644,13 @@ struct ProductionVisitor : public production::Visitor {

endProduction(*p);

if ( is_field_owner ) {
if ( is_field_owner )
postParseField(*p, meta, pre_container_offset);

if ( block )
popBuilder(); // Per-field block.

if ( is_field_owner ) {
if ( profiler ) {
auto offset = builder()->memberCall(state().cur, "offset");
builder()->stopProfiler(profiler, offset);
Expand Down
2 changes: 1 addition & 1 deletion spicy/toolchain/src/compiler/codegen/parsers/literals.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ struct Visitor : public visitor::PreOrder {
if ( state().literal_mode != LiteralMode::Skip ) {
if ( state().captures )
builder()->addAssign(*state().captures,
builder()->memberCall(builder()->id("ms"), "captures", {state().data}));
builder()->memberCall(builder()->id("ms"), "captures", {state().cur}));

builder()->addAssign(result, builder()->memberCall(state().cur, "sub",
{builder()->begin(builder()->id("ncur"))}));
Expand Down
44 changes: 21 additions & 23 deletions tests/Baseline/hilti.ast.basic-module/debug.log
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@
[debug/compiler] [HILTI] building scopes
[debug/compiler] [HILTI] resolving AST
[debug/compiler] -> modified
[debug/ast-stats] garbage collected 537 nodes in 12 rounds, 5272 left retained
[debug/ast-stats] garbage collected 545 nodes in 12 rounds, 5270 left retained
[debug/compiler] processing ASTs, round 2
[debug/compiler] [HILTI] building scopes
[debug/compiler] [HILTI] resolving AST
[debug/compiler] -> modified
[debug/ast-stats] garbage collected 223 nodes in 11 rounds, 5528 left retained
[debug/ast-stats] garbage collected 223 nodes in 11 rounds, 5526 left retained
[debug/compiler] processing ASTs, round 3
[debug/compiler] [HILTI] building scopes
[debug/compiler] [HILTI] resolving AST
[debug/ast-stats] garbage collected 29 nodes in 2 rounds, 5556 left retained
[debug/ast-stats] garbage collected 29 nodes in 2 rounds, 5552 left retained
[debug/ast-final] # [HILTI] Final AST (round 3)
[debug/ast-final] - ASTRoot [no parent] (<root>) [@a:XXX]
[debug/ast-final] | Foo -> declaration::Module <canonical-id="Foo" declaration="D2" dependencies="" ext=".hlt" fqid="Foo" id="Foo" linkage="public" path="<...>/basic-module.hlt" scope="" skip-implementation="false"> [parent @a:XXX] [@d:XXX] ([@d:XXX])
Expand Down Expand Up @@ -444,23 +444,21 @@
[debug/ast-final] - type::ValueReference <declaration="-" type="-" unified="value_ref(name(hilti::MatchState))" wildcard="false"> [parent @q:XXX] (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] [@q:XXX]
[debug/ast-final] - AttributeSet [parent @d:XXX] [@a:XXX]
[debug/ast-final] - declaration::Field <canonical-id="hilti::captures" cc="<unset>" declaration="-" fqid="hilti::MatchState::captures" id="captures" linkage="struct" linked-type="T9"> [parent @t:XXX] (hilti.hlt:20:5-20:42) (resolved) [@d:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @d:XXX] (hilti.hlt:20:5-20:42) [@q:XXX]
[debug/ast-final] - type::Function <declaration="-" flavor="method" type="-" unified="function(result:vector(bytes), stream)" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:5-20:42) (resolved) [@t:XXX]
[debug/ast-final] - declaration::Field <canonical-id="hilti::captures" cc="<unset>" declaration="-" fqid="hilti::MatchState::captures" id="captures" linkage="struct" linked-type="T9"> [parent @t:XXX] (hilti.hlt:20:5-20:48) (resolved) [@d:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @d:XXX] (hilti.hlt:20:5-20:48) [@q:XXX]
[debug/ast-final] - type::Function <declaration="-" flavor="method" type="-" unified="function(result:vector(bytes), view::stream)" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:5-20:48) (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:20:12-20:19) [@q:XXX]
[debug/ast-final] - type::Name <builtin="false" declaration="-" id="Captures" resolved-type="T11" type="-" unified="vector(bytes)" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:12-20:19) (resolved) [@t:XXX]
[debug/ast-final] - declaration::Parameter <canonical-id="hilti::data" declaration="-" fqid="data" id="data" is_type_param="false" kind="in" linkage="private"> [parent @t:XXX] (hilti.hlt:20:30-20:40) (resolved) [@d:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @d:XXX] (hilti.hlt:20:30-20:35) [@q:XXX]
[debug/ast-final] - type::Stream <declaration="-" type="-" unified="stream" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:30-20:35) (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:20:30-20:35) [@q:XXX]
[debug/ast-final] - type::stream::View <declaration="-" type="-" unified="view::stream" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:30-20:35) (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:20:30-20:35) [@q:XXX]
[debug/ast-final] - type::stream::Iterator <declaration="-" type="-" unified="iterator(stream)" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:30-20:35) (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @t:XXX] (hilti.hlt:20:30-20:35) [@q:XXX]
[debug/ast-final] - type::UnsignedInteger <declaration="-" type="-" unified="uint8" width="8" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:30-20:35) (resolved) [@t:XXX]
[debug/ast-final] - declaration::Parameter <canonical-id="hilti::data" declaration="-" fqid="data" id="data" is_type_param="false" kind="in" linkage="private"> [parent @t:XXX] (hilti.hlt:20:30-20:46) (resolved) [@d:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @d:XXX] (hilti.hlt:20:35-20:40) [@q:XXX]
[debug/ast-final] - type::stream::View <declaration="-" type="-" unified="view::stream" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:35-20:40) (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:20:35-20:40) [@q:XXX]
[debug/ast-final] - type::stream::Iterator <declaration="-" type="-" unified="iterator(stream)" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:35-20:40) (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:20:35-20:40) [@q:XXX]
[debug/ast-final] - type::UnsignedInteger <declaration="-" type="-" unified="uint8" width="8" wildcard="false"> [parent @q:XXX] (hilti.hlt:20:35-20:40) (resolved) [@t:XXX]
[debug/ast-final] - <empty>
[debug/ast-final] - AttributeSet [parent @d:XXX] (hilti.hlt:20:41-20:40) [@a:XXX]
[debug/ast-final] - AttributeSet [parent @d:XXX] (hilti.hlt:20:42-20:41) [@a:XXX]
[debug/ast-final] - AttributeSet [parent @d:XXX] (hilti.hlt:20:47-20:46) [@a:XXX]
[debug/ast-final] - AttributeSet [parent @d:XXX] (hilti.hlt:20:48-20:47) [@a:XXX]
[debug/ast-final] - <empty>
[debug/ast-final] - AttributeSet [parent @d:XXX] (hilti.hlt:21:2-21:1) [@a:XXX]
[debug/ast-final] - Attribute <tag="&cxxname"> [parent @a:XXX] (hilti.hlt:21:2-21:1) [@a:XXX]
Expand Down Expand Up @@ -970,16 +968,16 @@
[debug/ast-final] [T15] hilti::RecoverableFailure [type::Exception] (hilti.hlt:56:34-56:56)
[debug/ast-stats] # [HILTI] AST statistics:
[debug/ast-stats] - # AST rounds 3
[debug/ast-stats] - max tree depth: 16
[debug/ast-stats] - max tree depth: 14
[debug/ast-stats] - # context declarations: 23
[debug/ast-stats] - # context types: 16
[debug/ast-stats] - # context modules: 2
[debug/ast-stats] - # nodes reachable in AST: 705
[debug/ast-stats] - # nodes live: 5556
[debug/ast-stats] - # nodes retained: 5556
[debug/ast-stats] - # nodes reachable in AST: 703
[debug/ast-stats] - # nodes live: 5552
[debug/ast-stats] - # nodes retained: 5552
[debug/ast-stats] - # nodes live > 1%:
[debug/ast-stats] - AttributeSet: 64
[debug/ast-stats] - QualifiedType: 1944
[debug/ast-stats] - QualifiedType: 1942
[debug/ast-stats] - expression::Ctor: 69
[debug/ast-stats] - type::Bool: 140
[debug/ast-stats] - type::Bytes: 80
Expand All @@ -1000,4 +998,4 @@
[debug/compiler] codegen module Foo to C++
[debug/compiler] generating C++ for module Foo
[debug/compiler] finalizing module Foo
[debug/ast-stats] garbage collected 5557 nodes in 17 rounds, 0 left retained
[debug/ast-stats] garbage collected 5553 nodes in 13 rounds, 0 left retained
Loading

0 comments on commit ca5ded1

Please sign in to comment.