-
-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
unicode/fsm: switch to regex-automata 0.3
This changes the Unicode generation script to use 'regex-cli' instead of 'ucd-generate'. Then I generated the new DFAs and fixed the fallout in the code. The new APIs are a little more clunky, but such is life. Amazingly, everything Just Worked. And the generation script now takes 5 seconds instead of ~40 seconds(!).
- Loading branch information
1 parent
88a12ae
commit 9db5584
Showing
31 changed files
with
313 additions
and
65 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe GRAPHEME_BREAK_FWD src/unicode/fsm/ <snip: arg too long> | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy}; | ||
|
||
/// Sparse DFA `GRAPHEME_BREAK_FWD`, built inside the `Lazy` initializer closure | ||
/// (presumably run once, on first access). | ||
/// | ||
/// The serialized automaton is embedded at compile time with `include_bytes!`. | ||
/// The `.bigendian.dfa` or `.littleendian.dfa` file is selected by the | ||
/// `target_endian` cfg. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static GRAPHEME_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { | ||
// Only one of the two `BYTES` statics below is compiled in, depending on | ||
// the target's endianness. | ||
#[cfg(target_endian = "big")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("grapheme_break_fwd.bigendian.dfa"); | ||
#[cfg(target_endian = "little")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("grapheme_break_fwd.littleendian.dfa"); | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = | ||
DFA::from_bytes(BYTES).expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --reverse --match-kind all --no-captures --shrink --rustfmt --safe GRAPHEME_BREAK_REV src/unicode/fsm/ <snip: arg too long> | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy}; | ||
|
||
/// Sparse DFA `GRAPHEME_BREAK_REV`, built inside the `Lazy` initializer closure | ||
/// (presumably run once, on first access). | ||
/// | ||
/// Per the generation command in this file's header, the automaton was built | ||
/// with `--reverse --match-kind all`. The serialized bytes are embedded with | ||
/// `include_bytes!`, and the `target_endian` cfg picks the matching file. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static GRAPHEME_BREAK_REV: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { | ||
// Only one of the two `BYTES` statics below is compiled in, depending on | ||
// the target's endianness. | ||
#[cfg(target_endian = "big")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("grapheme_break_rev.bigendian.dfa"); | ||
#[cfg(target_endian = "little")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("grapheme_break_rev.littleendian.dfa"); | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = | ||
DFA::from_bytes(BYTES).expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
// Submodules holding the generated DFA statics, one module per serialized | ||
// automaton. | ||
pub mod grapheme_break_fwd; | ||
pub mod grapheme_break_rev; | ||
pub mod regional_indicator_rev; | ||
pub mod sentence_break_fwd; | ||
pub mod simple_word_fwd; | ||
pub mod whitespace_anchored_fwd; | ||
pub mod whitespace_anchored_rev; | ||
pub mod word_break_fwd; |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe REGIONAL_INDICATOR_REV src/unicode/fsm/ \p{gcb=Regional_Indicator} | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{ | ||
dfa::dense::DFA, | ||
util::{lazy::Lazy, wire::AlignAs}, | ||
}; | ||
|
||
/// Dense DFA `REGIONAL_INDICATOR_REV`, built inside the `Lazy` initializer | ||
/// closure (presumably run once, on first access). | ||
/// | ||
/// Per the generation command in this file's header, the automaton was built | ||
/// with `--reverse` for the pattern `\p{gcb=Regional_Indicator}`. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static REGIONAL_INDICATOR_REV: Lazy<DFA<&'static [u32]>> = | ||
Lazy::new(|| { | ||
// The embedded bytes are wrapped in `AlignAs<[u8], u32>`. | ||
// NOTE(review): presumably this forces u32 alignment, which a DFA backed | ||
// by `&[u32]` would need; confirm against the regex-automata `wire` docs. | ||
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { | ||
_align: [], | ||
// Only one `bytes` initializer is compiled in, chosen by target endianness. | ||
#[cfg(target_endian = "big")] | ||
bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"), | ||
#[cfg(target_endian = "little")] | ||
bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"), | ||
}; | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) | ||
.expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SENTENCE_BREAK_FWD src/unicode/fsm/ <snip: arg too long> | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy}; | ||
|
||
/// Sparse DFA `SENTENCE_BREAK_FWD`, built inside the `Lazy` initializer closure | ||
/// (presumably run once, on first access). | ||
/// | ||
/// The serialized automaton is embedded at compile time with `include_bytes!`. | ||
/// The `.bigendian.dfa` or `.littleendian.dfa` file is selected by the | ||
/// `target_endian` cfg. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static SENTENCE_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { | ||
// Only one of the two `BYTES` statics below is compiled in, depending on | ||
// the target's endianness. | ||
#[cfg(target_endian = "big")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("sentence_break_fwd.bigendian.dfa"); | ||
#[cfg(target_endian = "little")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("sentence_break_fwd.littleendian.dfa"); | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = | ||
DFA::from_bytes(BYTES).expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SIMPLE_WORD_FWD src/unicode/fsm/ \w | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy}; | ||
|
||
/// Sparse DFA `SIMPLE_WORD_FWD`, built inside the `Lazy` initializer closure | ||
/// (presumably run once, on first access). | ||
/// | ||
/// Per the generation command in this file's header, the pattern is `\w`. | ||
/// The serialized bytes are embedded with `include_bytes!`, and the | ||
/// `target_endian` cfg picks the matching file. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static SIMPLE_WORD_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { | ||
// Only one of the two `BYTES` statics below is compiled in, depending on | ||
// the target's endianness. | ||
#[cfg(target_endian = "big")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("simple_word_fwd.bigendian.dfa"); | ||
#[cfg(target_endian = "little")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("simple_word_fwd.littleendian.dfa"); | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = | ||
DFA::from_bytes(BYTES).expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize dense dfa --minimize --start-kind anchored --shrink --rustfmt --safe WHITESPACE_ANCHORED_FWD src/unicode/fsm/ \s+ | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{ | ||
dfa::dense::DFA, | ||
util::{lazy::Lazy, wire::AlignAs}, | ||
}; | ||
|
||
/// Dense DFA `WHITESPACE_ANCHORED_FWD`, built inside the `Lazy` initializer | ||
/// closure (presumably run once, on first access). | ||
/// | ||
/// Per the generation command in this file's header, the automaton was built | ||
/// with `--start-kind anchored` for the pattern `\s+`. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static WHITESPACE_ANCHORED_FWD: Lazy<DFA<&'static [u32]>> = | ||
Lazy::new(|| { | ||
// The embedded bytes are wrapped in `AlignAs<[u8], u32>`. | ||
// NOTE(review): presumably this forces u32 alignment, which a DFA backed | ||
// by `&[u32]` would need; confirm against the regex-automata `wire` docs. | ||
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { | ||
_align: [], | ||
// Only one `bytes` initializer is compiled in, chosen by target endianness. | ||
#[cfg(target_endian = "big")] | ||
bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"), | ||
#[cfg(target_endian = "little")] | ||
bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"), | ||
}; | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) | ||
.expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe WHITESPACE_ANCHORED_REV src/unicode/fsm/ \s+ | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{ | ||
dfa::dense::DFA, | ||
util::{lazy::Lazy, wire::AlignAs}, | ||
}; | ||
|
||
/// Dense DFA `WHITESPACE_ANCHORED_REV`, built inside the `Lazy` initializer | ||
/// closure (presumably run once, on first access). | ||
/// | ||
/// Per the generation command in this file's header, the automaton was built | ||
/// with `--start-kind anchored --reverse` for the pattern `\s+`. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static WHITESPACE_ANCHORED_REV: Lazy<DFA<&'static [u32]>> = | ||
Lazy::new(|| { | ||
// The embedded bytes are wrapped in `AlignAs<[u8], u32>`. | ||
// NOTE(review): presumably this forces u32 alignment, which a DFA backed | ||
// by `&[u32]` would need; confirm against the regex-automata `wire` docs. | ||
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { | ||
_align: [], | ||
// Only one `bytes` initializer is compiled in, chosen by target endianness. | ||
#[cfg(target_endian = "big")] | ||
bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"), | ||
#[cfg(target_endian = "little")] | ||
bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"), | ||
}; | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) | ||
.expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: | ||
// | ||
// regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe WORD_BREAK_FWD src/unicode/fsm/ <snip: arg too long> | ||
// | ||
// regex-cli 0.0.1 is available on crates.io. | ||
|
||
use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy}; | ||
|
||
/// Sparse DFA `WORD_BREAK_FWD`, built inside the `Lazy` initializer closure | ||
/// (presumably run once, on first access). | ||
/// | ||
/// The serialized automaton is embedded at compile time with `include_bytes!`. | ||
/// The `.bigendian.dfa` or `.littleendian.dfa` file is selected by the | ||
/// `target_endian` cfg. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The initializer panics with "serialized DFA should be valid" if | ||
/// `DFA::from_bytes` returns an error for the embedded bytes. | ||
pub static WORD_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { | ||
// Only one of the two `BYTES` statics below is compiled in, depending on | ||
// the target's endianness. | ||
#[cfg(target_endian = "big")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("word_break_fwd.bigendian.dfa"); | ||
#[cfg(target_endian = "little")] | ||
static BYTES: &'static [u8] = | ||
include_bytes!("word_break_fwd.littleendian.dfa"); | ||
// `from_bytes` yields a pair; only the DFA is kept, the second element is | ||
// discarded. | ||
let (dfa, _) = | ||
DFA::from_bytes(BYTES).expect("serialized DFA should be valid"); | ||
dfa | ||
}); |
Oops, something went wrong.