unicode/fsm: switch to regex-automata 0.3

This changes the Unicode generation script to use 'regex-cli' instead of 'ucd-generate'. Then I generated the new DFAs and fixed the fallout in the code. The new APIs are a little more clunky, but such is life. Amazingly, everything Just Worked. And the generation script now takes 5 seconds instead of ~40 seconds(!).
BurntSushi · Jul 5, 2023 · 9db5584 · 9db5584
1 parent 88a12ae
commit 9db5584
Show file tree

Hide file tree

Showing 31 changed files with 313 additions and 65 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -30,9 +30,14 @@ serde = ["dep:serde"]
 
 [dependencies]
 memchr = { version = "2.4.0", default-features = false }
-regex-automata = { version = "0.1.5", default-features = false, optional = true }
 serde = { version = "1.0.85", default-features = false, optional = true }
 
+[dependencies.regex-automata]
+version = "0.3.0"
+default-features = false
+features = ["dfa-search"]
+optional = true
+
 [dev-dependencies]
 quickcheck = { version = "1", default-features = false }
 ucd-parse = "0.1.3"

diff --git a/scripts/generate-unicode-data b/scripts/generate-unicode-data
@@ -29,81 +29,116 @@ graphemes() {
     regex="$(sh "$D/regex/grapheme.sh")"
 
     echo "generating forward grapheme DFA"
-    ucd-generate dfa \
-        --name GRAPHEME_BREAK_FWD \
-        --sparse --minimize --anchored --state-size 2 \
-        src/unicode/fsm/ \
-        "$regex"
+    regex-cli generate serialize sparse dfa \
+      --minimize \
+      --start-kind anchored \
+      --shrink \
+      --rustfmt \
+      --safe \
+      GRAPHEME_BREAK_FWD \
+      src/unicode/fsm/ \
+      "$regex"
 
     echo "generating reverse grapheme DFA"
-    ucd-generate dfa \
-        --name GRAPHEME_BREAK_REV \
-        --reverse --longest \
-        --sparse --minimize --anchored --state-size 2 \
-        src/unicode/fsm/ \
-        "$regex"
+    regex-cli generate serialize sparse dfa \
+      --minimize \
+      --start-kind anchored \
+      --reverse \
+      --match-kind all \
+      --no-captures \
+      --shrink \
+      --rustfmt \
+      --safe \
+      GRAPHEME_BREAK_REV \
+      src/unicode/fsm/ \
+      "$regex"
 }
 
 words() {
     regex="$(sh "$D/regex/word.sh")"
 
     echo "generating forward word DFA (this can take a while)"
-    ucd-generate dfa \
-        --name WORD_BREAK_FWD \
-        --sparse --minimize --anchored --state-size 4 \
-        src/unicode/fsm/ \
-        "$regex"
+    regex-cli generate serialize sparse dfa \
+      --minimize \
+      --start-kind anchored \
+      --shrink \
+      --rustfmt \
+      --safe \
+      WORD_BREAK_FWD \
+      src/unicode/fsm/ \
+      "$regex"
 }
 
 sentences() {
     regex="$(sh "$D/regex/sentence.sh")"
 
     echo "generating forward sentence DFA (this can take a while)"
-    ucd-generate dfa \
-        --name SENTENCE_BREAK_FWD \
-        --minimize \
-        --sparse --anchored --state-size 4 \
-        src/unicode/fsm/ \
-        "$regex"
+    regex-cli generate serialize sparse dfa \
+      --minimize \
+      --start-kind anchored \
+      --shrink \
+      --rustfmt \
+      --safe \
+      SENTENCE_BREAK_FWD \
+      src/unicode/fsm/ \
+      "$regex"
 }
 
 regional_indicator() {
     # For finding all occurrences of regional indicators. This is used to handle
     # regional indicators as a special case for the reverse grapheme iterator
     # and the reverse word iterator.
     echo "generating regional indicator DFA"
-    ucd-generate dfa \
-        --name REGIONAL_INDICATOR_REV \
-        --reverse \
-        --classes --minimize --anchored --premultiply --state-size 1 \
-        src/unicode/fsm/ \
-        "\p{gcb=Regional_Indicator}"
+    regex-cli generate serialize dense dfa \
+      --minimize \
+      --start-kind anchored \
+      --reverse \
+      --no-captures \
+      --shrink \
+      --rustfmt \
+      --safe \
+      REGIONAL_INDICATOR_REV \
+      src/unicode/fsm/ \
+      "\p{gcb=Regional_Indicator}"
 }
 
 simple_word() {
     echo "generating forward simple word DFA"
-    ucd-generate dfa \
-        --name SIMPLE_WORD_FWD \
-        --sparse --minimize --state-size 2 \
-        src/unicode/fsm/ \
-        "\w"
+    regex-cli generate serialize sparse dfa \
+      --minimize \
+      --start-kind anchored \
+      --shrink \
+      --rustfmt \
+      --safe \
+      SIMPLE_WORD_FWD \
+      src/unicode/fsm/ \
+      "\w"
 }
 
 whitespace() {
     echo "generating forward whitespace DFA"
-    ucd-generate dfa \
-        --name WHITESPACE_ANCHORED_FWD \
-        --anchored --classes --premultiply --minimize --state-size 1 \
-        src/unicode/fsm/ \
-        "\s+"
+    regex-cli generate serialize dense dfa \
+      --minimize \
+      --start-kind anchored \
+      --shrink \
+      --rustfmt \
+      --safe \
+      WHITESPACE_ANCHORED_FWD \
+      src/unicode/fsm/ \
+      "\s+"
 
     echo "generating reverse whitespace DFA"
-    ucd-generate dfa \
-        --name WHITESPACE_ANCHORED_REV \
-        --reverse \
-        --anchored --classes --premultiply --minimize --state-size 2 \
-        src/unicode/fsm/ \
-        "\s+"
+    regex-cli generate serialize dense dfa \
+      --minimize \
+      --start-kind anchored \
+      --reverse \
+      --no-captures \
+      --shrink \
+      --rustfmt \
+      --safe \
+      WHITESPACE_ANCHORED_REV \
+      src/unicode/fsm/ \
+      "\s+"
 }
 
 main() {
@@ -127,8 +162,14 @@ main() {
         exit
     fi
 
-    # ucd-generate is used to compile regexes into DFAs.
-    requires ucd-generate
+    # regex-cli is used to compile regexes into DFAs.
+    # To get regex-cli, run:
+    #
+    #     cargo install --git https://github.com/rust-lang/regex regex-cli
+    #
+    # regex-cli will build DFAs, serialize them to big endian and little endian
+    # files, and then generate the Rust code to deserialize them.
+    requires regex-cli
 
     mkdir -p src/unicode/fsm/
 

diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs
@@ -0,0 +1,19 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe GRAPHEME_BREAK_FWD src/unicode/fsm/ <snip: arg too long>
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
+
+pub static GRAPHEME_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
+    #[cfg(target_endian = "big")]
+    static BYTES: &'static [u8] =
+        include_bytes!("grapheme_break_fwd.bigendian.dfa");
+    #[cfg(target_endian = "little")]
+    static BYTES: &'static [u8] =
+        include_bytes!("grapheme_break_fwd.littleendian.dfa");
+    let (dfa, _) =
+        DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
+    dfa
+});
diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs
@@ -0,0 +1,19 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize sparse dfa --minimize --start-kind anchored --reverse --match-kind all --no-captures --shrink --rustfmt --safe GRAPHEME_BREAK_REV src/unicode/fsm/ <snip: arg too long>
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
+
+pub static GRAPHEME_BREAK_REV: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
+    #[cfg(target_endian = "big")]
+    static BYTES: &'static [u8] =
+        include_bytes!("grapheme_break_rev.bigendian.dfa");
+    #[cfg(target_endian = "little")]
+    static BYTES: &'static [u8] =
+        include_bytes!("grapheme_break_rev.littleendian.dfa");
+    let (dfa, _) =
+        DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
+    dfa
+});
diff --git a/src/unicode/fsm/mod.rs b/src/unicode/fsm/mod.rs
@@ -0,0 +1,8 @@
+pub mod grapheme_break_fwd;
+pub mod grapheme_break_rev;
+pub mod regional_indicator_rev;
+pub mod sentence_break_fwd;
+pub mod simple_word_fwd;
+pub mod whitespace_anchored_fwd;
+pub mod whitespace_anchored_rev;
+pub mod word_break_fwd;
diff --git a/src/unicode/fsm/regional_indicator_rev.bigendian.dfa b/src/unicode/fsm/regional_indicator_rev.bigendian.dfa
diff --git a/src/unicode/fsm/regional_indicator_rev.littleendian.dfa b/src/unicode/fsm/regional_indicator_rev.littleendian.dfa
diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs
@@ -0,0 +1,24 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe REGIONAL_INDICATOR_REV src/unicode/fsm/ \p{gcb=Regional_Indicator}
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{
+    dfa::dense::DFA,
+    util::{lazy::Lazy, wire::AlignAs},
+};
+
+pub static REGIONAL_INDICATOR_REV: Lazy<DFA<&'static [u32]>> =
+    Lazy::new(|| {
+        static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+            _align: [],
+            #[cfg(target_endian = "big")]
+            bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
+            #[cfg(target_endian = "little")]
+            bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
+        };
+        let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
+            .expect("serialized DFA should be valid");
+        dfa
+    });
diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs
@@ -0,0 +1,19 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SENTENCE_BREAK_FWD src/unicode/fsm/ <snip: arg too long>
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
+
+pub static SENTENCE_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
+    #[cfg(target_endian = "big")]
+    static BYTES: &'static [u8] =
+        include_bytes!("sentence_break_fwd.bigendian.dfa");
+    #[cfg(target_endian = "little")]
+    static BYTES: &'static [u8] =
+        include_bytes!("sentence_break_fwd.littleendian.dfa");
+    let (dfa, _) =
+        DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
+    dfa
+});
diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs
@@ -0,0 +1,19 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SIMPLE_WORD_FWD src/unicode/fsm/ \w
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
+
+pub static SIMPLE_WORD_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
+    #[cfg(target_endian = "big")]
+    static BYTES: &'static [u8] =
+        include_bytes!("simple_word_fwd.bigendian.dfa");
+    #[cfg(target_endian = "little")]
+    static BYTES: &'static [u8] =
+        include_bytes!("simple_word_fwd.littleendian.dfa");
+    let (dfa, _) =
+        DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
+    dfa
+});
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa b/src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa b/src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs
@@ -0,0 +1,24 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize dense dfa --minimize --start-kind anchored --shrink --rustfmt --safe WHITESPACE_ANCHORED_FWD src/unicode/fsm/ \s+
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{
+    dfa::dense::DFA,
+    util::{lazy::Lazy, wire::AlignAs},
+};
+
+pub static WHITESPACE_ANCHORED_FWD: Lazy<DFA<&'static [u32]>> =
+    Lazy::new(|| {
+        static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+            _align: [],
+            #[cfg(target_endian = "big")]
+            bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
+            #[cfg(target_endian = "little")]
+            bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
+        };
+        let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
+            .expect("serialized DFA should be valid");
+        dfa
+    });
diff --git a/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa b/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa
diff --git a/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa b/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa
diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs
@@ -0,0 +1,24 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe WHITESPACE_ANCHORED_REV src/unicode/fsm/ \s+
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{
+    dfa::dense::DFA,
+    util::{lazy::Lazy, wire::AlignAs},
+};
+
+pub static WHITESPACE_ANCHORED_REV: Lazy<DFA<&'static [u32]>> =
+    Lazy::new(|| {
+        static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+            _align: [],
+            #[cfg(target_endian = "big")]
+            bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
+            #[cfg(target_endian = "little")]
+            bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
+        };
+        let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
+            .expect("serialized DFA should be valid");
+        dfa
+    });
diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa
diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa
diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs
@@ -0,0 +1,19 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe WORD_BREAK_FWD src/unicode/fsm/ <snip: arg too long>
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
+
+pub static WORD_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
+    #[cfg(target_endian = "big")]
+    static BYTES: &'static [u8] =
+        include_bytes!("word_break_fwd.bigendian.dfa");
+    #[cfg(target_endian = "little")]
+    static BYTES: &'static [u8] =
+        include_bytes!("word_break_fwd.littleendian.dfa");
+    let (dfa, _) =
+        DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
+    dfa
+});