Skip to content

Commit

Permalink
unicode/fsm: switch to regex-automata 0.3
Browse files Browse the repository at this point in the history
This changes the Unicode generation script to use 'regex-cli' instead of
'ucd-generate'. Then I generated the new DFAs and fixed the fallout in
the code. The new APIs are a little more clunky, but such is life.

Amazingly, everything Just Worked. And the generation script now takes 5
seconds instead of ~40 seconds(!).
  • Loading branch information
BurntSushi committed Jul 5, 2023
1 parent 88a12ae commit 9db5584
Show file tree
Hide file tree
Showing 31 changed files with 313 additions and 65 deletions.
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,14 @@ serde = ["dep:serde"]

[dependencies]
memchr = { version = "2.4.0", default-features = false }
regex-automata = { version = "0.1.5", default-features = false, optional = true }
serde = { version = "1.0.85", default-features = false, optional = true }

[dependencies.regex-automata]
version = "0.3.0"
default-features = false
features = ["dfa-search"]
optional = true

[dev-dependencies]
quickcheck = { version = "1", default-features = false }
ucd-parse = "0.1.3"
Expand Down
133 changes: 87 additions & 46 deletions scripts/generate-unicode-data
Original file line number Diff line number Diff line change
Expand Up @@ -29,81 +29,116 @@ graphemes() {
regex="$(sh "$D/regex/grapheme.sh")"

echo "generating forward grapheme DFA"
ucd-generate dfa \
--name GRAPHEME_BREAK_FWD \
--sparse --minimize --anchored --state-size 2 \
src/unicode/fsm/ \
"$regex"
regex-cli generate serialize sparse dfa \
--minimize \
--start-kind anchored \
--shrink \
--rustfmt \
--safe \
GRAPHEME_BREAK_FWD \
src/unicode/fsm/ \
"$regex"

echo "generating reverse grapheme DFA"
ucd-generate dfa \
--name GRAPHEME_BREAK_REV \
--reverse --longest \
--sparse --minimize --anchored --state-size 2 \
src/unicode/fsm/ \
"$regex"
regex-cli generate serialize sparse dfa \
--minimize \
--start-kind anchored \
--reverse \
--match-kind all \
--no-captures \
--shrink \
--rustfmt \
--safe \
GRAPHEME_BREAK_REV \
src/unicode/fsm/ \
"$regex"
}

# Generate the forward word break DFA (WORD_BREAK_FWD).
#
# The pattern is whatever "$D/regex/word.sh" prints, and the output is
# written under src/unicode/fsm/.
#
# NOTE(review): both a ucd-generate call and a regex-cli call appear in this
# function as shown here; presumably only the regex-cli call is meant to
# remain -- confirm against the checked-in script.
words() {
regex="$(sh "$D/regex/word.sh")"

echo "generating forward word DFA (this can take a while)"
ucd-generate dfa \
--name WORD_BREAK_FWD \
--sparse --minimize --anchored --state-size 4 \
src/unicode/fsm/ \
"$regex"
regex-cli generate serialize sparse dfa \
--minimize \
--start-kind anchored \
--shrink \
--rustfmt \
--safe \
WORD_BREAK_FWD \
src/unicode/fsm/ \
"$regex"
}

# Generate the forward sentence break DFA (SENTENCE_BREAK_FWD).
#
# The pattern is whatever "$D/regex/sentence.sh" prints, and the output is
# written under src/unicode/fsm/.
#
# NOTE(review): both a ucd-generate call and a regex-cli call appear in this
# function as shown here; presumably only the regex-cli call is meant to
# remain -- confirm against the checked-in script.
sentences() {
regex="$(sh "$D/regex/sentence.sh")"

echo "generating forward sentence DFA (this can take a while)"
ucd-generate dfa \
--name SENTENCE_BREAK_FWD \
--minimize \
--sparse --anchored --state-size 4 \
src/unicode/fsm/ \
"$regex"
regex-cli generate serialize sparse dfa \
--minimize \
--start-kind anchored \
--shrink \
--rustfmt \
--safe \
SENTENCE_BREAK_FWD \
src/unicode/fsm/ \
"$regex"
}

# Generate the reverse regional indicator DFA (REGIONAL_INDICATOR_REV).
#
# Unlike the grapheme/word/sentence generators, the pattern is given inline
# (\p{gcb=Regional_Indicator}) and a dense DFA is requested from regex-cli.
# Output is written under src/unicode/fsm/.
#
# NOTE(review): both a ucd-generate call and a regex-cli call appear in this
# function as shown here; presumably only the regex-cli call is meant to
# remain -- confirm against the checked-in script.
regional_indicator() {
# For finding all occurrences of region indicators. This is used to handle
# regional indicators as a special case for the reverse grapheme iterator
# and the reverse word iterator.
echo "generating regional indicator DFA"
ucd-generate dfa \
--name REGIONAL_INDICATOR_REV \
--reverse \
--classes --minimize --anchored --premultiply --state-size 1 \
src/unicode/fsm/ \
"\p{gcb=Regional_Indicator}"
regex-cli generate serialize dense dfa \
--minimize \
--start-kind anchored \
--reverse \
--no-captures \
--shrink \
--rustfmt \
--safe \
REGIONAL_INDICATOR_REV \
src/unicode/fsm/ \
"\p{gcb=Regional_Indicator}"
}

# Generate the forward simple word DFA (SIMPLE_WORD_FWD) for the pattern \w.
# Output is written under src/unicode/fsm/.
#
# NOTE(review): the ucd-generate call below has no --anchored flag while the
# regex-cli call passes --start-kind anchored -- confirm the anchored start
# is intended. Both calls appear in this function as shown here; presumably
# only the regex-cli call is meant to remain.
simple_word() {
echo "generating forward simple word DFA"
ucd-generate dfa \
--name SIMPLE_WORD_FWD \
--sparse --minimize --state-size 2 \
src/unicode/fsm/ \
"\w"
regex-cli generate serialize sparse dfa \
--minimize \
--start-kind anchored \
--shrink \
--rustfmt \
--safe \
SIMPLE_WORD_FWD \
src/unicode/fsm/ \
"\w"
}

# Generate the anchored whitespace DFAs for the pattern \s+, one forward
# (WHITESPACE_ANCHORED_FWD) and one reverse (WHITESPACE_ANCHORED_REV).
# Output is written under src/unicode/fsm/.
#
# NOTE(review): for each direction both a ucd-generate call and a regex-cli
# call appear in this function as shown here; presumably only the regex-cli
# calls are meant to remain -- confirm against the checked-in script.
whitespace() {
echo "generating forward whitespace DFA"
ucd-generate dfa \
--name WHITESPACE_ANCHORED_FWD \
--anchored --classes --premultiply --minimize --state-size 1 \
src/unicode/fsm/ \
"\s+"
regex-cli generate serialize dense dfa \
--minimize \
--start-kind anchored \
--shrink \
--rustfmt \
--safe \
WHITESPACE_ANCHORED_FWD \
src/unicode/fsm/ \
"\s+"

echo "generating reverse whitespace DFA"
ucd-generate dfa \
--name WHITESPACE_ANCHORED_REV \
--reverse \
--anchored --classes --premultiply --minimize --state-size 2 \
src/unicode/fsm/ \
"\s+"
regex-cli generate serialize dense dfa \
--minimize \
--start-kind anchored \
--reverse \
--no-captures \
--shrink \
--rustfmt \
--safe \
WHITESPACE_ANCHORED_REV \
src/unicode/fsm/ \
"\s+"
}

main() {
Expand All @@ -127,8 +162,14 @@ main() {
exit
fi

# ucd-generate is used to compile regexes into DFAs.
requires ucd-generate
# regex-cli is used to compile regexes into DFAs.
# To get regex-cli, run:
#
# cargo install --git https://github.com/rust-lang/regex regex-cli
#
# regex-cli will build DFAs, serialize them to big endian and little endian
# files, and then generate the Rust code to deserialize them.
requires regex-cli

mkdir -p src/unicode/fsm/

Expand Down
Binary file added src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
Binary file not shown.
Binary file not shown.
19 changes: 19 additions & 0 deletions src/unicode/fsm/grapheme_break_fwd.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe GRAPHEME_BREAK_FWD src/unicode/fsm/ <snip: arg too long>
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};

/// Sparse DFA `GRAPHEME_BREAK_FWD`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time; the file matching
/// the target's endianness is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static GRAPHEME_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    #[cfg(target_endian = "little")]
    static SERIALIZED: &[u8] =
        include_bytes!("grapheme_break_fwd.littleendian.dfa");
    #[cfg(target_endian = "big")]
    static SERIALIZED: &[u8] =
        include_bytes!("grapheme_break_fwd.bigendian.dfa");
    // `from_bytes` also reports how many bytes it read; only the DFA is kept.
    DFA::from_bytes(SERIALIZED).expect("serialized DFA should be valid").0
});
Binary file added src/unicode/fsm/grapheme_break_rev.bigendian.dfa
Binary file not shown.
Binary file not shown.
19 changes: 19 additions & 0 deletions src/unicode/fsm/grapheme_break_rev.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --reverse --match-kind all --no-captures --shrink --rustfmt --safe GRAPHEME_BREAK_REV src/unicode/fsm/ <snip: arg too long>
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};

/// Sparse DFA `GRAPHEME_BREAK_REV`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time; the file matching
/// the target's endianness is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static GRAPHEME_BREAK_REV: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    #[cfg(target_endian = "little")]
    static SERIALIZED: &[u8] =
        include_bytes!("grapheme_break_rev.littleendian.dfa");
    #[cfg(target_endian = "big")]
    static SERIALIZED: &[u8] =
        include_bytes!("grapheme_break_rev.bigendian.dfa");
    // `from_bytes` also reports how many bytes it read; only the DFA is kept.
    DFA::from_bytes(SERIALIZED).expect("serialized DFA should be valid").0
});
8 changes: 8 additions & 0 deletions src/unicode/fsm/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pub mod grapheme_break_fwd;
pub mod grapheme_break_rev;
pub mod regional_indicator_rev;
pub mod sentence_break_fwd;
pub mod simple_word_fwd;
pub mod whitespace_anchored_fwd;
pub mod whitespace_anchored_rev;
pub mod word_break_fwd;
Binary file not shown.
Binary file not shown.
24 changes: 24 additions & 0 deletions src/unicode/fsm/regional_indicator_rev.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe REGIONAL_INDICATOR_REV src/unicode/fsm/ \p{gcb=Regional_Indicator}
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{
dfa::dense::DFA,
util::{lazy::Lazy, wire::AlignAs},
};

/// Dense DFA `REGIONAL_INDICATOR_REV`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time inside an
/// `AlignAs<[u8], u32>` wrapper; the file matching the target's endianness
/// is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static REGIONAL_INDICATOR_REV: Lazy<DFA<&'static [u32]>> =
    Lazy::new(|| {
        static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
            _align: [],
            #[cfg(target_endian = "little")]
            bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
            #[cfg(target_endian = "big")]
            bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
        };
        let wire: &'static [u8] = &ALIGNED.bytes;
        // `from_bytes` also reports how many bytes it read; only the DFA
        // is kept.
        DFA::from_bytes(wire).expect("serialized DFA should be valid").0
    });
Binary file added src/unicode/fsm/sentence_break_fwd.bigendian.dfa
Binary file not shown.
Binary file not shown.
19 changes: 19 additions & 0 deletions src/unicode/fsm/sentence_break_fwd.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SENTENCE_BREAK_FWD src/unicode/fsm/ <snip: arg too long>
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};

/// Sparse DFA `SENTENCE_BREAK_FWD`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time; the file matching
/// the target's endianness is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static SENTENCE_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    #[cfg(target_endian = "little")]
    static SERIALIZED: &[u8] =
        include_bytes!("sentence_break_fwd.littleendian.dfa");
    #[cfg(target_endian = "big")]
    static SERIALIZED: &[u8] =
        include_bytes!("sentence_break_fwd.bigendian.dfa");
    // `from_bytes` also reports how many bytes it read; only the DFA is kept.
    DFA::from_bytes(SERIALIZED).expect("serialized DFA should be valid").0
});
Binary file added src/unicode/fsm/simple_word_fwd.bigendian.dfa
Binary file not shown.
Binary file added src/unicode/fsm/simple_word_fwd.littleendian.dfa
Binary file not shown.
19 changes: 19 additions & 0 deletions src/unicode/fsm/simple_word_fwd.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SIMPLE_WORD_FWD src/unicode/fsm/ \w
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};

/// Sparse DFA `SIMPLE_WORD_FWD`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time; the file matching
/// the target's endianness is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static SIMPLE_WORD_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    #[cfg(target_endian = "little")]
    static SERIALIZED: &[u8] =
        include_bytes!("simple_word_fwd.littleendian.dfa");
    #[cfg(target_endian = "big")]
    static SERIALIZED: &[u8] =
        include_bytes!("simple_word_fwd.bigendian.dfa");
    // `from_bytes` also reports how many bytes it read; only the DFA is kept.
    DFA::from_bytes(SERIALIZED).expect("serialized DFA should be valid").0
});
Binary file not shown.
Binary file not shown.
24 changes: 24 additions & 0 deletions src/unicode/fsm/whitespace_anchored_fwd.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize dense dfa --minimize --start-kind anchored --shrink --rustfmt --safe WHITESPACE_ANCHORED_FWD src/unicode/fsm/ \s+
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{
dfa::dense::DFA,
util::{lazy::Lazy, wire::AlignAs},
};

/// Dense DFA `WHITESPACE_ANCHORED_FWD`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time inside an
/// `AlignAs<[u8], u32>` wrapper; the file matching the target's endianness
/// is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static WHITESPACE_ANCHORED_FWD: Lazy<DFA<&'static [u32]>> =
    Lazy::new(|| {
        static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
            _align: [],
            #[cfg(target_endian = "little")]
            bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
            #[cfg(target_endian = "big")]
            bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
        };
        let wire: &'static [u8] = &ALIGNED.bytes;
        // `from_bytes` also reports how many bytes it read; only the DFA
        // is kept.
        DFA::from_bytes(wire).expect("serialized DFA should be valid").0
    });
Binary file not shown.
Binary file not shown.
24 changes: 24 additions & 0 deletions src/unicode/fsm/whitespace_anchored_rev.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe WHITESPACE_ANCHORED_REV src/unicode/fsm/ \s+
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{
dfa::dense::DFA,
util::{lazy::Lazy, wire::AlignAs},
};

/// Dense DFA `WHITESPACE_ANCHORED_REV`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time inside an
/// `AlignAs<[u8], u32>` wrapper; the file matching the target's endianness
/// is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static WHITESPACE_ANCHORED_REV: Lazy<DFA<&'static [u32]>> =
    Lazy::new(|| {
        static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
            _align: [],
            #[cfg(target_endian = "little")]
            bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
            #[cfg(target_endian = "big")]
            bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
        };
        let wire: &'static [u8] = &ALIGNED.bytes;
        // `from_bytes` also reports how many bytes it read; only the DFA
        // is kept.
        DFA::from_bytes(wire).expect("serialized DFA should be valid").0
    });
Binary file added src/unicode/fsm/word_break_fwd.bigendian.dfa
Binary file not shown.
Binary file added src/unicode/fsm/word_break_fwd.littleendian.dfa
Binary file not shown.
19 changes: 19 additions & 0 deletions src/unicode/fsm/word_break_fwd.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe WORD_BREAK_FWD src/unicode/fsm/ <snip: arg too long>
//
// regex-cli 0.0.1 is available on crates.io.

use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};

/// Sparse DFA `WORD_BREAK_FWD`, deserialized on first use.
///
/// The serialized automaton is embedded at compile time; the file matching
/// the target's endianness is selected via `cfg(target_endian)`.
///
/// # Panics
///
/// Panics on first access if the embedded bytes are not a valid DFA.
pub static WORD_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    #[cfg(target_endian = "little")]
    static SERIALIZED: &[u8] =
        include_bytes!("word_break_fwd.littleendian.dfa");
    #[cfg(target_endian = "big")]
    static SERIALIZED: &[u8] =
        include_bytes!("word_break_fwd.bigendian.dfa");
    // `from_bytes` also reports how many bytes it read; only the DFA is kept.
    DFA::from_bytes(SERIALIZED).expect("serialized DFA should be valid").0
});
Loading

0 comments on commit 9db5584

Please sign in to comment.