Skip to content

Commit

Permalink
misc: compile regexes only once
Browse files Browse the repository at this point in the history
This is a big change. The previous design compiled a regex every time the
script reached it, meaning that if my script was s/foo/bar/ and my input
was a million lines, we would compile foo a million times. Now each regex
is compiled only once, which has improved performance tremendously.

The generated code is now much harder to understand, but there is
room for improvement.
  • Loading branch information
lhoursquentin committed Apr 14, 2020
1 parent 41c1df5 commit 503041a
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 60 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and GCC (9.2.1)*

Say you want to compile the following sed script called `binary-add.sed` (see
the `samples` directory):

```sed
s/[[:blank:]]//g
h
Expand Down Expand Up @@ -112,17 +113,16 @@ Not much practical use to this, here are some thoughts:
- One might find this useful for obfuscation or maybe to limit the scope of sed?
- Better speed? Since the generated code is specific to a script, one might
expect it to be much faster than using `sed`, since we can skip parsing,
walking the AST etc. Though with the current implementation a compiled script
is roughly 4 times slower than GNU sed, this is mostly due to having to
compile all regexes each time, instead of once, which I'm still working on.
walking the AST etc. I didn't do any serious measurements yet, but so far it
seems slightly faster than GNU sed, and much faster than busybox sed.

# Translating the translator

The basic idea of this project is to translate **sed** code to **C** code, to
compile it and have a resulting binary with the same behavior as the original
script.

Now since the translator from sed to C is written is sed, we should be able to
Now since the translator from sed to C is written in sed, we should be able to
translate the translator, compile it and then be able to use the compiled
version to translate other sed scripts.

Expand Down
32 changes: 15 additions & 17 deletions address.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@

bool addr_rr(
Status *const status,
const char *const start,
const char *const end,
Regex *const start,
Regex *const end,
const int id
) {
int *const range_ids = status->range_ids;
Expand Down Expand Up @@ -61,7 +61,7 @@ bool addr_rr(

bool addr_rn(
Status *const status,
const char *const start,
Regex *const start,
const int end,
const int id
) {
Expand Down Expand Up @@ -98,7 +98,7 @@ bool addr_rn(
bool addr_nr(
Status *const status,
const int start,
const char *const end,
Regex *const end,
const int id
) {
/*
Expand Down Expand Up @@ -177,23 +177,21 @@ bool addr_nn(
return false;
}

bool addr_r(Status *const status, const char *const regex) {
status->last_pattern = regex;
const char *const pattern_space = status->pattern_space;
regex_t regex_obj;
bool addr_r(Status *const status, Regex *const regex) {
status->last_regex = regex;
regex_t *const regex_obj = &regex->obj;

if (regcomp(&regex_obj, regex, 0)) {
regfree(&regex_obj);
assert(false);
if (!regex->compiled) {
if (regcomp(regex_obj, regex->str, 0)) {
assert(false);
} else {
regex->compiled = true;
}
}

if (regexec(&regex_obj, pattern_space, 0, NULL, 0)) {
regfree(&regex_obj);
return false;
}
const char *const pattern_space = status->pattern_space;

regfree(&regex_obj);
return true;
return !regexec(regex_obj, pattern_space, 0, NULL, 0);
}

bool addr_n(const Status *status, const int line_nb) {
Expand Down
8 changes: 4 additions & 4 deletions address.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@

#include "status.h"

bool addr_rr(Status *const status, const char *const start, const char *const end, const int id);
bool addr_rn(Status *const status, const char *const start, const int end, const int id);
bool addr_nr(Status *const status, const int start, const char *const end, const int id);
bool addr_rr(Status *const status, Regex *const start, Regex *const end, const int id);
bool addr_rn(Status *const status, Regex *const start, const int end, const int id);
bool addr_nr(Status *const status, const int start, Regex *const end, const int id);
bool addr_nn(Status *const status, const int start, const int end, const int id);
bool addr_r(Status *const status, const char * const regex);
bool addr_r(Status *const status, Regex *const regex);
bool addr_n(const Status *const status, const int line);

#endif /* ADDRESS_H */
24 changes: 11 additions & 13 deletions operations.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ static int substitution(
pmatch,
first_sub_done ? REG_NOTBOL : 0
)) {
regfree(regex);
return -1;
}

Expand Down Expand Up @@ -165,18 +164,19 @@ static int substitution(

void s(
Status *const status,
const char *const pattern,
Regex *const regex,
const char *const replace,
const int opts)
{
status->last_pattern = pattern;
regex_t regex;

// FIXME we should compile only once, both loops and each line processed can
// lead to compiling the same regex many times
if (regcomp(&regex, pattern, 0)) {
regfree(&regex);
assert(false);
status->last_regex = regex;
regex_t *const regex_obj = &regex->obj;

if (!regex->compiled) {
if (regcomp(regex_obj, regex->str, 0)) {
assert(false);
} else {
regex->compiled = true;
}
}

// TODO nth/w opts
Expand All @@ -188,7 +188,7 @@ void s(
bool first_sub_done = false;
do {
pattern_offset = substitution(
&regex,
regex_obj,
pattern_space,
replace,
first_sub_done
Expand All @@ -202,8 +202,6 @@ void s(
pattern_space += pattern_offset;
} while (opt_g && pattern_space[0] && pattern_offset);

regfree(&regex);

if (first_sub_done) {
status->sub_success = true;
if (opt_p) {
Expand Down
2 changes: 1 addition & 1 deletion operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void P(const Status *const status);
void q(const Status *const status);
void s(
Status *const status,
const char *const pattern,
Regex *const regex,
const char *const replace,
const int opts
);
Expand Down
71 changes: 52 additions & 19 deletions par.sed
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

# The first line of the hold space is used for temporary storage in this script,
# never use it to store data longer than a single command.
# Second line will act as an id to create unique variable names, though this
# line should be located from the bottom.

1{
x
s/^/\
/
reg_x/
x
}

Expand Down Expand Up @@ -40,7 +42,8 @@ t start
s|^#|//|; t comment
s/^b[[:blank:]]*\([^;}][^[:blank:];}]*\)/goto \1;\n/; t label_cmds
s/^t[[:blank:]]*\([^;}][^[:blank:];}]*\)/if (status.sub_success) { status.sub_success = false; goto \1; }\n/; t label_cmds
s/^:[[:blank:]]*\([^;}][^[:blank:];}]*\)/\1:\n/; t label_cmds
# semi-colon needed since declarations cannot directly follow a label in C
s/^:[[:blank:]]*\([^;}][^[:blank:];}]*\)/\1:;\n/; t label_cmds
s/^s//; t s_cmd
s/^[hHgGlpPqx]/&(\&status);\
/
Expand Down Expand Up @@ -136,6 +139,15 @@ s/^/r/
t regex_start_process

: s_cmd
# The s command needs its own scope. Consider the case:
# /foo/s/bar/baz/
# which, without a scope, would generate:
# if addr("foo") static reg = ...; s(reg);
# Here the static declaration ends up alone in the if, which is no good, so we
# add a scope:
# if addr("foo") { static reg = ...; s(reg); }
# This issue cannot happen with addresses since they cannot be chained without
# braces: /foo//bar/p -> invalid but /foo/{/bar/p} -> valid and not an issue.
i \
{

x
# at the top of the hold, track the number of delimiters encountered:
Expand All @@ -151,15 +163,15 @@ t regex_start_process
# If we are processing the second address in a range, we want to avoid adding a
# newline since we have the beginning of the C code for this range at the bottom
# of the hold.
/^.[^rn]/s/$/\
/
/^.[^rn]/s/$/\n/
s/$/\n/

# check if this is an empty pattern, in which case we want to use the last one
x
/^\(.\)\1/{
s//\1/
x
s/$/status.last_pattern/
s/\n$/status.last_regex/
t regex_valid_delim_eaten
}
x
Expand Down Expand Up @@ -233,25 +245,44 @@ t regex_eat_next

: regex_valid_delim_eaten

# Found end of second regex addr, swap chars since we insert from the beginning
s/^r\([nr]\)/\1r/
t addr_regex_handle_end

# Found end of single regex addr, a second address might follow
/^r[^nr]/b addr_regex_handle_end

# Found second delim for the s cmd
s/^s1\(.*\)$/s\1, 0/
t s_cmd_handle_options

# Found first delim for the s cmd
s/^s0\(.*\)$/s1\1, "/
x
t regex_eat_next
# case of regex closing a range: swap chars since we insert from the beginning
s/^r\([rn]\)/\1r/

b fail
# At this point if we do not have a string on the last line then that means
# we're in the last_regex case, skip regex creation
/"$/!b skip_regex_creation
s/\(.*\)\
\(.*\)\
\(.*\)\
\(.*\)/\1\
\2x\
\3\&\2\
static Regex \2 = {.compiled = false, .str = \4};/
# save current line we are working on
G
# save everything to hold
h
# only keep regex declaration and print it
s/.*\n\(.*\)\n.*/\1/p
# restore everything
g
# cleanup line we were working on
s/.*\n//
x
# get rid of regex declaration and saved current line
s/\(.*\)\n.*\n.*/\1/
: skip_regex_creation
# Found first delim for the s cmd
/^s0/{
s/^s0\(.*\)$/s1\1, "/
x
t regex_eat_next
}

: addr_regex_handle_end
x
# remove delim, we don't need to keep it anymore
s/.//
Expand Down Expand Up @@ -296,6 +327,8 @@ t s_cmd_eat_options
# why using the "=" command is not an option)
/^.[rn]/{
s/$/, __LINE__/
# TODO jump to next label here to avoid having the regex check the block
# below
}
/^.[rn]/!{
# single address, we need to check if another one follows
Expand All @@ -315,7 +348,7 @@ t s_cmd_eat_options
: s_or_addr_close_function
# close C function call + add ";" if not an address
s/$/)/
/^s/s/$/;/
/^s/s/$/;}/
x
# negative address
/^[[:blank:]]*!/{
Expand Down
2 changes: 1 addition & 1 deletion sed-bin.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ int main(int argc, char **argv) {
.line_nb = 0,
.last_line_nb = INT_MAX, // TODO UINT_MAX after cleaning up signed usage
.skip_read = false,
.last_pattern = NULL,
.last_regex = NULL,
.range_ids = (int [MAX_ACTIVE_RANGES]){},
.suppressed_range_ids = (int [MAX_ACTIVE_RANGES]){},
.pending_output = (const char *[MAX_PENDING_OUTPUT]){},
Expand Down
11 changes: 10 additions & 1 deletion status.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@ typedef enum {
} operation_ret;

#include <stdbool.h>
#include <regex.h>

/*
 * A regex that is compiled lazily, on first use.
 *
 * A Regex starts out with `compiled` false and `str` pointing at the pattern
 * source. The addr_r() and s() implementations shown in this change check
 * `compiled`; when it is false they call regcomp() with `obj` as destination
 * and `str` as pattern, then set `compiled` to true so later calls reuse `obj`.
 */
typedef struct {
// false while only `str` is set; true once regcomp() has filled `obj`
bool compiled;
// The two members share storage: compiling into `obj` overwrites `str`, so
// `str` must only be read while `compiled` is false, and `obj` only after.
union {
// pattern source, valid before compilation
const char *str;
// compiled form, valid after compilation
regex_t obj;
};
} Regex;

typedef struct {
char *pattern_space;
Expand All @@ -20,7 +29,7 @@ typedef struct {
unsigned int line_nb;
unsigned int last_line_nb;
bool skip_read;
const char *last_pattern;
Regex *last_regex;
int *const range_ids;
int *const suppressed_range_ids;
const char **const pending_output;
Expand Down

0 comments on commit 503041a

Please sign in to comment.