misc: compile regexes only once

This is a big change. The previous design compiled a regex every time the script executed a command that used it, meaning that if my script was s/foo/bar/ and my input was a million lines, then we would compile foo a million times. Now we compile each regex only once, which has improved performance tremendously. The generated code is now much harder to understand, though there is room for improvement.
lhoursquentin · Apr 14, 2020 · 503041a · 503041a
1 parent 41c1df5
commit 503041a
Show file tree

Hide file tree

Showing 8 changed files with 98 additions and 60 deletions.
diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ and GCC (9.2.1)*
 
 Say you want to compile the following sed script called `binary-add.sed` (see
 the `samples` directory):
+
 ```sed
 s/[[:blank:]]//g
 h
@@ -112,17 +113,16 @@ Not much practical use to this, here are some thoughts:
 - One might find this useful for obfuscation or maybe to limit the scope of sed?
 - Better speed? Since the generated code is specific to a script, one might
   expect it to be much faster than using `sed`, since we can skip parsing,
-  walking the AST etc. Though with the current implementation a compiled script
-  is roughly 4 times slower than GNU sed, this is mostly due to having to
-  compile all regexes each time, instead of once, which I'm still working on.
+  walking the AST etc. I didn't do any serious measurements yet, but so far it
+  seems slightly faster than GNU sed, and much faster than busybox sed.
 
 # Translating the translator
 
 The basic idea of this project is to translate **sed** code to **C** code, to
 compile it and have a resulting binary with the same behavior as the original
 script.
 
-Now since the translator from sed to C is written is sed, we should be able to
+Now since the translator from sed to C is written in sed, we should be able to
 translate the translator, compile it and then be able to use the compiled
 version to translate other sed scripts.
 

diff --git a/address.c b/address.c
@@ -28,8 +28,8 @@
 
 bool addr_rr(
   Status *const status,
-  const char *const start,
-  const char *const end,
+  Regex *const start,
+  Regex *const end,
   const int id
 ) {
   int *const range_ids = status->range_ids;
@@ -61,7 +61,7 @@ bool addr_rr(
 
 bool addr_rn(
   Status *const status,
-  const char *const start,
+  Regex *const start,
   const int end,
   const int id
 ) {
@@ -98,7 +98,7 @@ bool addr_rn(
 bool addr_nr(
   Status *const status,
   const int start,
-  const char *const end,
+  Regex *const end,
   const int id
 ) {
   /*
@@ -177,23 +177,21 @@ bool addr_nn(
   return false;
 }
 
-bool addr_r(Status *const status, const char *const regex) {
-  status->last_pattern = regex;
-  const char *const pattern_space = status->pattern_space;
-  regex_t regex_obj;
+bool addr_r(Status *const status, Regex *const regex) {
+  status->last_regex = regex;
+  regex_t *const regex_obj = &regex->obj;
 
-  if (regcomp(&regex_obj, regex, 0)) {
-    regfree(&regex_obj);
-    assert(false);
+  if (!regex->compiled) {
+    if (regcomp(regex_obj, regex->str, 0)) {
+      assert(false);
+    } else {
+      regex->compiled = true;
+    }
   }
 
-  if (regexec(&regex_obj, pattern_space, 0, NULL, 0)) {
-    regfree(&regex_obj);
-    return false;
-  }
+  const char *const pattern_space = status->pattern_space;
 
-  regfree(&regex_obj);
-  return true;
+  return !regexec(regex_obj, pattern_space, 0, NULL, 0);
 }
 
 bool addr_n(const Status *status, const int line_nb) {

diff --git a/address.h b/address.h
@@ -5,11 +5,11 @@
 
 #include "status.h"
 
-bool addr_rr(Status *const status, const char *const start, const char *const end, const int id);
-bool addr_rn(Status *const status, const char *const start, const int end, const int id);
-bool addr_nr(Status *const status, const int start, const char *const end, const int id);
+bool addr_rr(Status *const status, Regex *const start, Regex *const end, const int id);
+bool addr_rn(Status *const status, Regex *const start, const int end, const int id);
+bool addr_nr(Status *const status, const int start, Regex *const end, const int id);
 bool addr_nn(Status *const status, const int start, const int end, const int id);
-bool addr_r(Status *const status, const char * const regex);
+bool addr_r(Status *const status, Regex *const regex);
 bool addr_n(const Status *const status, const int line);
 
 #endif /* ADDRESS_H */
diff --git a/operations.c b/operations.c
@@ -97,7 +97,6 @@ static int substitution(
         pmatch,
         first_sub_done ? REG_NOTBOL : 0
   )) {
-    regfree(regex);
     return -1;
   }
 
@@ -165,18 +164,19 @@ static int substitution(
 
 void s(
   Status *const status,
-  const char *const pattern,
+  Regex *const regex,
   const char *const replace,
   const int opts)
 {
-  status->last_pattern = pattern;
-  regex_t regex;
-
-  // FIXME we should compile only once, both loops and each line processed can
-  // lead to compiling the same regex many times
-  if (regcomp(&regex, pattern, 0)) {
-    regfree(&regex);
-    assert(false);
+  status->last_regex = regex;
+  regex_t *const regex_obj = &regex->obj;
+
+  if (!regex->compiled) {
+    if (regcomp(regex_obj, regex->str, 0)) {
+      assert(false);
+    } else {
+      regex->compiled = true;
+    }
   }
 
   // TODO nth/w opts
@@ -188,7 +188,7 @@ void s(
   bool first_sub_done = false;
   do {
     pattern_offset = substitution(
-      &regex,
+      regex_obj,
       pattern_space,
       replace,
       first_sub_done
@@ -202,8 +202,6 @@ void s(
     pattern_space += pattern_offset;
   } while (opt_g && pattern_space[0] && pattern_offset);
 
-  regfree(&regex);
-
   if (first_sub_done) {
     status->sub_success = true;
     if (opt_p) {

diff --git a/operations.h b/operations.h
@@ -25,7 +25,7 @@ void P(const Status *const status);
 void q(const Status *const status);
 void s(
   Status *const status,
-  const char *const pattern,
+  Regex *const regex,
   const char *const replace,
   const int opts
 );

diff --git a/par.sed b/par.sed
@@ -2,11 +2,13 @@
 
 # The first line of the hold space is used for temporary storage in this script,
 # never use it to store data longer than a single command.
+# Second line will act as an id to create unique variable names, though this
+# line should be located from the bottom.
 
 1{
   x
   s/^/\
-/
+reg_x/
   x
 }
 
@@ -40,7 +42,8 @@ t start
 s|^#|//|; t comment
 s/^b[[:blank:]]*\([^;}][^[:blank:];}]*\)/goto \1;\n/; t label_cmds
 s/^t[[:blank:]]*\([^;}][^[:blank:];}]*\)/if (status.sub_success) { status.sub_success = false; goto \1; }\n/; t label_cmds
-s/^:[[:blank:]]*\([^;}][^[:blank:];}]*\)/\1:\n/; t label_cmds
+# semi-colon needed since declarations cannot directly follow a label in C
+s/^:[[:blank:]]*\([^;}][^[:blank:];}]*\)/\1:;\n/; t label_cmds
 s/^s//; t s_cmd
 s/^[hHgGlpPqx]/&(\&status);\
 /
@@ -136,6 +139,15 @@ s/^/r/
 t regex_start_process
 
 : s_cmd
+# s cmd needs a scope since for the case:
+#   /foo/s/bar/baz/
+#   if addr("foo") static reg = ...; s(reg);
+# Here static ends up alone in the if, which is no good, so we add a scope:
+#   if addr("foo") { static reg = ...; s(reg); }
+# This issue cannot happen with addresses since they cannot be chained without
+# brackets: /foo//bar/p -> invalid but /foo/{/bar/p} -> valid and not an issue.
+i \
+{
 
 x
 # at the top of the hold, track the number of delimiters encountered:
@@ -151,15 +163,15 @@ t regex_start_process
 # If we are processing the second address in a range, we want to avoid adding a
 # newline since we have the beginning of the C code for this range at the bottom
 # of the hold.
-/^.[^rn]/s/$/\
-/
+/^.[^rn]/s/$/\n/
+s/$/\n/
 
 # check if this is an empty pattern, in which case we want to use the last one
 x
 /^\(.\)\1/{
   s//\1/
   x
-  s/$/status.last_pattern/
+  s/\n$/status.last_regex/
   t regex_valid_delim_eaten
 }
 x
@@ -233,25 +245,44 @@ t regex_eat_next
 
 : regex_valid_delim_eaten
 
-# Found end of second regex addr, swap chars since we insert from the beginning
-s/^r\([nr]\)/\1r/
-t addr_regex_handle_end
-
-# Found end of single regex addr, a second address might follow
-/^r[^nr]/b addr_regex_handle_end
-
 # Found second delim for the s cmd
 s/^s1\(.*\)$/s\1, 0/
 t s_cmd_handle_options
 
-# Found first delim for the s cmd
-s/^s0\(.*\)$/s1\1, "/
-x
-t regex_eat_next
+# case of regex closing a range: swap chars since we insert from the beginning
+s/^r\([rn]\)/\1r/
 
-b fail
+# At this point if we do not have a string on the last line then that means
+# we're in the last_regex case, skip regex creation
+/"$/!b skip_regex_creation
+s/\(.*\)\
+\(.*\)\
+\(.*\)\
+\(.*\)/\1\
+\2x\
+\3\&\2\
+static Regex \2 = {.compiled = false, .str = \4};/
+# save current line we are working on
+G
+# save everything to hold
+h
+# only keep regex declaration and print it
+s/.*\n\(.*\)\n.*/\1/p
+# restore everything
+g
+# cleanup line we were working on
+s/.*\n//
+x
+# get rid of regex declaration and saved current line
+s/\(.*\)\n.*\n.*/\1/
+: skip_regex_creation
+# Found first delim for the s cmd
+/^s0/{
+  s/^s0\(.*\)$/s1\1, "/
+  x
+  t regex_eat_next
+}
 
-: addr_regex_handle_end
 x
 # remove delim, we don't need to keep it anymore
 s/.//
@@ -296,6 +327,8 @@ t s_cmd_eat_options
   # why using the "=" command is not an option)
   /^.[rn]/{
     s/$/, __LINE__/
+    # TODO jump to next label here to avoid having the regex check the block
+    # below
   }
   /^.[rn]/!{
     # single address, we need to check if another one follows
@@ -315,7 +348,7 @@ t s_cmd_eat_options
 : s_or_addr_close_function
 # close C function call + add ";" if not an address
 s/$/)/
-/^s/s/$/;/
+/^s/s/$/;}/
 x
 # negative address
 /^[[:blank:]]*!/{

diff --git a/sed-bin.c b/sed-bin.c
@@ -17,7 +17,7 @@ int main(int argc, char **argv) {
     .line_nb = 0,
     .last_line_nb = INT_MAX, // TODO UINT_MAX after cleaning up signed usage
     .skip_read = false,
-    .last_pattern = NULL,
+    .last_regex = NULL,
     .range_ids = (int [MAX_ACTIVE_RANGES]){},
     .suppressed_range_ids = (int [MAX_ACTIVE_RANGES]){},
     .pending_output = (const char *[MAX_PENDING_OUTPUT]){},

diff --git a/status.h b/status.h
@@ -12,6 +12,15 @@ typedef enum {
 } operation_ret;
 
 #include <stdbool.h>
+#include <regex.h>
+
+typedef struct {
+  bool compiled;
+  union {
+    const char *str;
+    regex_t obj;
+  };
+} Regex;
 
 typedef struct {
   char *pattern_space;
@@ -20,7 +29,7 @@ typedef struct {
   unsigned int line_nb;
   unsigned int last_line_nb;
   bool skip_read;
-  const char *last_pattern;
+  Regex *last_regex;
   int *const range_ids;
   int *const suppressed_range_ids;
   const char **const pending_output;