Skip to content

Commit

Permalink
Merge pull request #1198 from epage/generic
Browse files Browse the repository at this point in the history
perf(dict)!: Switch to PHF Map
  • Loading branch information
epage authored Dec 31, 2024
2 parents 086f9d1 + 7457534 commit 44cf2f8
Show file tree
Hide file tree
Showing 20 changed files with 909,326 additions and 705,237 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

168,511 changes: 84,256 additions & 84,255 deletions crates/codespell-dict/src/dict_codegen.rs

Large diffs are not rendered by default.

16 changes: 14 additions & 2 deletions crates/dictgen/src/gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,19 @@ impl<'g> DictGen<'g> {

#[cfg(feature = "map")]
pub fn map(self) -> crate::MapGen<'g> {
crate::MapGen { gen: self }
crate::MapGen {
gen: self,
unicode: true,
unicase: true,
}
}

pub fn ordered_map(self) -> crate::OrderedMapGen<'g> {
crate::OrderedMapGen { gen: self }
crate::OrderedMapGen {
gen: self,
unicode: true,
unicase: true,
}
}

pub fn trie(self) -> crate::TrieGen<'g> {
Expand All @@ -49,6 +57,10 @@ impl<'g> DictGen<'g> {
limit: 64,
}
}

/// Consumes this configuration and wraps it in a [`crate::MatchGen`].
pub fn r#match(self) -> crate::MatchGen<'g> {
crate::MatchGen { gen: self }
}
}

impl Default for DictGen<'static> {
Expand Down
106 changes: 106 additions & 0 deletions crates/dictgen/src/insensitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,18 @@ impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {

impl Eq for InsensitiveStr<'_> {}

impl PartialOrd for InsensitiveStr<'_> {
    /// Always returns `Some`: the result is taken directly from the `Ord` impl.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Ord for InsensitiveStr<'_> {
    /// Orders two keys by comparing the values returned from `convert()`.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let lhs = self.convert();
        let rhs = other.convert();
        lhs.cmp(&rhs)
    }
}

impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
Expand Down Expand Up @@ -101,3 +113,97 @@ impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a
self
}
}

/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
///
/// Wraps a borrowed `&str`; equality, ordering, and hashing are all delegated
/// to [`unicase::Ascii`] via `convert()`.
#[derive(Copy, Clone)]
pub struct InsensitiveAscii<'s>(pub &'s str);

impl<'s> InsensitiveAscii<'s> {
pub fn convert(self) -> unicase::Ascii<&'s str> {
unicase::Ascii::new(self.0)
}

pub fn into_inner(self) -> &'s str {
self.0
}

pub fn is_empty(self) -> bool {
self.0.is_empty()
}

pub fn len(self) -> usize {
self.0.len()
}
}

impl<'s> From<unicase::Ascii<&'s str>> for InsensitiveAscii<'s> {
fn from(other: unicase::Ascii<&'s str>) -> Self {
Self(other.into_inner())
}
}

impl<'s2> PartialEq<InsensitiveAscii<'s2>> for InsensitiveAscii<'_> {
    /// Two keys are equal when their `unicase::Ascii` forms are equal; the
    /// operands may borrow from strings with different lifetimes.
    #[inline]
    fn eq(&self, other: &InsensitiveAscii<'s2>) -> bool {
        let lhs = self.convert();
        let rhs = other.convert();
        lhs == rhs
    }
}

// Marker impl: equality is defined by the `PartialEq` impl, which compares the `unicase::Ascii` forms.
impl Eq for InsensitiveAscii<'_> {}

impl PartialOrd for InsensitiveAscii<'_> {
    /// Always returns `Some`: the result is taken directly from the `Ord` impl.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Ord for InsensitiveAscii<'_> {
    /// Orders two keys by comparing their `unicase::Ascii` forms.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let lhs = self.convert();
        let rhs = other.convert();
        lhs.cmp(&rhs)
    }
}

impl core::hash::Hash for InsensitiveAscii<'_> {
    /// Hashes the `unicase::Ascii` form rather than the raw bytes, the same
    /// form the `PartialEq` impl compares.
    #[inline]
    fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
        let folded = self.convert();
        core::hash::Hash::hash(&folded, hasher);
    }
}

impl core::fmt::Debug for InsensitiveAscii<'_> {
    /// Formats exactly like the wrapped `str`'s `Debug` output; the wrapper
    /// itself adds nothing.
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let text: &str = self.into_inner();
        <str as core::fmt::Debug>::fmt(text, fmt)
    }
}

impl core::fmt::Display for InsensitiveAscii<'_> {
    /// Formats exactly like the wrapped `str`'s `Display` output, honoring any
    /// formatter flags the same way `str` does.
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let text: &str = self.into_inner();
        <str as core::fmt::Display>::fmt(text, fmt)
    }
}

#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveAscii<'_> {
    /// Reuses this type's `core::hash::Hash` impl as its PHF hash.
    #[inline]
    fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
        <Self as core::hash::Hash>::hash(self, state);
    }
}

#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveAscii<'_> {
    /// Writes `dictgen::InsensitiveAscii(`, then the wrapped string's own
    /// `fmt_const` output, then `)`.
    fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str("dictgen::InsensitiveAscii(")?;
        let inner = self.into_inner();
        inner.fmt_const(f)?;
        f.write_str(")")
    }
}

// Lets a key with the longer lifetime `'a` be borrowed as one with any
// shorter lifetime `'b` (`'a: 'b`); no conversion happens, `self` is returned as-is.
#[cfg(feature = "map")]
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveAscii<'b>> for InsensitiveAscii<'a> {
fn borrow(&self) -> &InsensitiveAscii<'b> {
self
}
}
4 changes: 4 additions & 0 deletions crates/dictgen/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ mod gen;
mod insensitive;
#[cfg(feature = "map")]
mod map;
#[cfg(feature = "codegen")]
mod r#match;
mod ordered_map;
mod trie;

Expand All @@ -16,4 +18,6 @@ pub use insensitive::*;
#[cfg(feature = "map")]
pub use map::*;
pub use ordered_map::*;
#[cfg(feature = "codegen")]
pub use r#match::*;
pub use trie::*;
127 changes: 102 additions & 25 deletions crates/dictgen/src/map.rs
Original file line number Diff line number Diff line change
@@ -1,65 +1,120 @@
#[cfg(feature = "codegen")]
pub struct MapGen<'g> {
pub(crate) gen: crate::DictGen<'g>,
pub(crate) unicase: bool,
pub(crate) unicode: bool,
}

#[cfg(feature = "codegen")]
impl MapGen<'_> {
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
pub fn unicase(mut self, yes: bool) -> Self {
self.unicase = yes;
self
}

pub fn unicode(mut self, yes: bool) -> Self {
self.unicode = yes;
self
}

pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));

let name = self.gen.name;
let key_type = self.key_type();
let value_type = self.gen.value_type;

let mut smallest = usize::MAX;
let mut largest = usize::MIN;
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
for (key, _) in data.iter() {
let key = key.as_ref();
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
builder.entry(key, value.as_str());
}
let builder = builder.build();
if largest == 0 {
smallest = 0;
}

writeln!(
file,
"pub static {name}: dictgen::Map<{value_type}> = dictgen::Map {{"
"pub static {name}: dictgen::Map<{key_type}, {value_type}> = dictgen::Map {{"
)?;
writeln!(file, " map: {builder},")?;

match (self.unicase, self.unicode) {
(true, true) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
let key = key.as_ref();
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key, value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
(true, false) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| (crate::InsensitiveAscii(key.as_ref()), value.to_string()))
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key, value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
(false, _) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| (key, value.to_string()))
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key.as_ref(), value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
}

writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;

Ok(())
}

fn key_type(&self) -> &'static str {
match (self.unicase, self.unicode) {
(true, true) => "dictgen::InsensitiveStr<'static>",
(true, false) => "dictgen::InsensitiveAscii<'static>",
(false, _) => "&'static str",
}
}
}

pub struct Map<V: 'static> {
pub map: phf::Map<crate::InsensitiveStr<'static>, V>,
pub struct Map<K: 'static, V: 'static> {
pub map: phf::Map<K, V>,
pub range: std::ops::RangeInclusive<usize>,
}

impl<V> Map<V> {
impl<V> Map<crate::InsensitiveStr<'_>, V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
Expand All @@ -69,3 +124,25 @@ impl<V> Map<V> {
}
}
}

impl<V> Map<crate::InsensitiveAscii<'_>, V> {
#[inline]
pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(&(*word).into())
} else {
None
}
}
}

impl<V> Map<&str, V> {
    /// Looks up `word` in the map using the map's plain `&str` keys.
    ///
    /// Returns `None` without touching the map when the length of `word`
    /// falls outside `range`.
    #[inline]
    pub fn find(&self, word: &'_ &str) -> Option<&V> {
        if !self.range.contains(&word.len()) {
            return None;
        }
        self.map.get(word)
    }
}
37 changes: 37 additions & 0 deletions crates/dictgen/src/match.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/// Generator that writes the dictionary as a `find` method built on a `match`
/// over string literals.
#[cfg(feature = "codegen")]
pub struct MatchGen<'g> {
// Shared generator settings; `write` reads `name` and `value_type` from it.
pub(crate) gen: crate::DictGen<'g>,
}

#[cfg(feature = "codegen")]
impl MatchGen<'_> {
    /// Writes a unit struct named after the generator's `name`, with a `find`
    /// method that looks a word up through a `match` over string literals.
    ///
    /// `data` yields `(key, value)` pairs. Each key becomes a string-literal
    /// match arm (written with `{:?}` so it is escaped), and each value is
    /// written via `Display` as the expression inside
    /// `Some(&<value>.as_slice())`. Entries are sorted by their `UniCase`
    /// ordering before being written.
    ///
    /// # Errors
    ///
    /// Returns any I/O error produced while writing to `file`.
    pub fn write<W: std::io::Write, V: std::fmt::Display>(
        &self,
        file: &mut W,
        data: impl Iterator<Item = (impl AsRef<str>, V)>,
    ) -> Result<(), std::io::Error> {
        let mut data: Vec<_> = data.collect();
        // Compare borrowed keys directly: a key-extraction closure cannot hand
        // back a borrow of the element, which is why the by-key form had to
        // allocate an owned `String` on every comparison. The resulting order
        // is the same.
        data.sort_unstable_by(|a, b| {
            unicase::UniCase::new(a.0.as_ref()).cmp(&unicase::UniCase::new(b.0.as_ref()))
        });

        let name = self.gen.name;
        let value_type = self.gen.value_type;

        writeln!(file, "pub struct {name};")?;
        writeln!(file, "impl {name} {{")?;
        writeln!(
            file,
            "    pub fn find(&self, word: &&str) -> Option<&'static {value_type}> {{"
        )?;
        writeln!(file, "        match *word {{")?;
        for (key, value) in data.iter() {
            let key = key.as_ref();
            writeln!(file, "            {key:?} => Some(&{value}.as_slice()),")?;
        }
        writeln!(file, "            _ => None,")?;
        writeln!(file, "        }}")?;
        writeln!(file, "    }}")?;
        writeln!(file, "}}")?;

        Ok(())
    }
}
Loading

0 comments on commit 44cf2f8

Please sign in to comment.