Skip to content

Commit

Permalink
Improve basic_string_clean features and speed
Browse files Browse the repository at this point in the history
  • Loading branch information
aarranz committed Jul 19, 2024
1 parent 87abdd3 commit 5677a3f
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 125 deletions.
127 changes: 18 additions & 109 deletions src/emoji.rs
Original file line number Diff line number Diff line change
@@ -1,57 +1,19 @@
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
("Emoji_Modifier", EMOJI_MODIFIER),
("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE),
("Emoji_Presentation", EMOJI_PRESENTATION),
/// Inclusive `(first, last)` code-point ranges of characters that should be
/// dropped from cleaned text.
///
/// `IsEmoji::is_char_to_avoid` reports `true` for any `char` that falls inside
/// one of these ranges.
pub const CHAR_TO_AVOID: &[(char, char)] = &[
    // C0 control characters.
    ('\u{0000}', '\u{001F}'),
    // DEL plus the C1 control characters.
    ('\u{007F}', '\u{009F}'),
    // Variation Selectors.
    ('\u{FE00}', '\u{FE0F}'),
    // Combining Diacritical Marks for Symbols.
    ('\u{20D0}', '\u{20FF}'),
    // Braille Patterns.
    ('\u{2800}', '\u{28FF}'),
    // Private Use Area. The surrogate blocks that precede it (U+D800..=U+DFFF)
    // are not valid `char` values, so the range starts at U+E000.
    ('\u{E000}', '\u{F8FF}'),
    // Supplementary planes: everything above the Basic Multilingual Plane.
    ('\u{10000}', '\u{10FFFF}'),
];

/// Inclusive range covering the five emoji skin-tone modifiers
/// (U+1F3FB..=U+1F3FF).
pub const EMOJI_MODIFIER: &[(char, char)] = &[
    ('🏻', '🏿'),
];
/// Extra inclusive ranges scanned by `IsEmoji::is_emoji`: the text/emoji
/// presentation selectors (U+FE0E, U+FE0F) and the combining enclosing marks
/// U+20E2..=U+20E4.
pub const EXTRA_CHARS: &[(char, char)] = &[
    ('\u{FE0E}', '\u{FE0F}'),
    ('\u{20E2}', '\u{20E4}'),
];

pub const EMOJI_MODIFIER_BASE: &'static [(char, char)] = &[
pub const EMOJI: &'static [(char, char)] = &[
('🏻', '🏿'),
('☝', '☝'),
('⛹', '⛹'),
('✊', '✍'),
('🎅', '🎅'),
('🏂', '🏄'),
('🏇', '🏇'),
('🏊', '🏌'),
('👂', '👃'),
('👆', '👐'),
('👦', '👸'),
('👼', '👼'),
('💁', '💃'),
('💅', '💇'),
('💏', '💏'),
('💑', '💑'),
('💪', '💪'),
('🕴', '🕵'),
('🕺', '🕺'),
('🖐', '🖐'),
('🖕', '🖖'),
('🙅', '🙇'),
('🙋', '🙏'),
('🚣', '🚣'),
('🚴', '🚶'),
('🛀', '🛀'),
('🛌', '🛌'),
('🤌', '🤌'),
('🤏', '🤏'),
('🤘', '🤟'),
('🤦', '🤦'),
('🤰', '🤹'),
('🤼', '🤾'),
('🥷', '🥷'),
('🦵', '🦶'),
('🦸', '🦹'),
('🦻', '🦻'),
('🧍', '🧏'),
('🧑', '🧝'),
('🫃', '🫅'),
('🫰', '🫸'),
];

pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[
('⌚', '⌛'),
('⏩', '⏬'),
('⏰', '⏰'),
Expand Down Expand Up @@ -85,77 +47,24 @@ pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[
('⬛', '⬜'),
('⭐', '⭐'),
('⭕', '⭕'),
('🀄', '🀄'),
('🃏', '🃏'),
('🆎', '🆎'),
('🆑', '🆚'),
('🇦', '🇿'),
('🈁', '🈁'),
('🈚', '🈚'),
('🈯', '🈯'),
('🈲', '🈶'),
('🈸', '🈺'),
('🉐', '🉑'),
('🌀', '🌠'),
('🌭', '🌵'),
('🌷', '🍼'),
('🍾', '🎓'),
('🎠', '🏊'),
('🏏', '🏓'),
('🏠', '🏰'),
('🏴', '🏴'),
('🏸', '🐾'),
('👀', '👀'),
('👂', '📼'),
('📿', '🔽'),
('🕋', '🕎'),
('🕐', '🕧'),
('🕺', '🕺'),
('🖕', '🖖'),
('🖤', '🖤'),
('🗻', '🙏'),
('🚀', '🛅'),
('🛌', '🛌'),
('🛐', '🛒'),
('🛕', '🛗'),
('🛜', '🛟'),
('🛫', '🛬'),
('🛴', '🛼'),
('🟠', '🟫'),
('🟰', '🟰'),
('🤌', '🤺'),
('🤼', '🥅'),
('🥇', '🧿'),
('🩰', '🩼'),
('🪀', '🪈'),
('🪐', '🪽'),
('🪿', '🫅'),
('🫎', '🫛'),
('🫠', '🫨'),
('🫰', '🫸'),
];

/// Character-class predicates backed by the `(first, last)` range tables
/// declared in this module.
pub trait IsEmoji {
/// For the `char` implementation: returns `true` as soon as the character is
/// found inside one of the inclusive emoji ranges, `false` otherwise.
fn is_emoji(&self) -> bool;
/// For the `char` implementation: returns `true` when the character lies
/// inside one of the inclusive `CHAR_TO_AVOID` ranges.
fn is_char_to_avoid(&self) -> bool;
}
impl IsEmoji for char {
fn is_emoji(&self) -> bool {
for (lc, hc) in EMOJI_PRESENTATION {
if self >= lc && self <= hc {
return true;
}
}
for (lc, hc) in EMOJI_MODIFIER {
for (lc, hc) in EMOJI {
if self >= lc && self <= hc {
return true;
}
}
for (lc, hc) in EMOJI_MODIFIER_BASE {
if self >= lc && self <= hc {
return true;
}
}
for (lc, hc) in EXTRA_CHARS {
false
}

fn is_char_to_avoid(&self) -> bool {
for (lc, hc) in CHAR_TO_AVOID {
if self >= lc && self <= hc {
return true;
}
Expand Down
49 changes: 33 additions & 16 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@ fn custom_normalization(
let mut result = String::with_capacity(str.len());
let mut previous_whitespace = false;
for c in str.chars() {
custom_character_normalization(
previous_whitespace = custom_character_normalization(
&mut result,
c,
&allow_chars,
collapse_whitespace,
previous_whitespace,
remove_emojis,
);
previous_whitespace = c.is_whitespace();
}
result.nfc().collect::<String>()
}
Expand All @@ -42,30 +41,48 @@ fn custom_character_normalization(
collapse_whitespace: bool,
previous_whitespace: bool,
remove_emojis: bool,
) {
) -> bool {
if allow_chars.contains(&c) {
str.push(c)
str.push(c);
return false;
} else if c.is_whitespace() {
if collapse_whitespace && previous_whitespace {
return;
} else {
if !collapse_whitespace || !previous_whitespace {
str.push(' ')
}
return true;
} else if remove_emojis && c.is_emoji() {
return;
return previous_whitespace;
}

let mut pushed = false;
decompose_compatible(c, |r| {
// Ignore characters outside the Basic Multilingual Plane, Control chars, etc
if !r.is_char_to_avoid() {
str.push(r);
pushed = true;
}
});

if pushed {
false
} else {
decompose_compatible(c, |r| {
// Ignore characters outside the Basic Multilingual Plane and in the disallow_chars set
if r <= '\u{FFFF}' {
str.push(r)
}
})
previous_whitespace
}
}

/// Cleans `value` by running it through `custom_normalization` and trimming
/// the result; always returns `Ok`.
///
/// Arguments (defaults come from the `#[pyo3(signature = ...)]` attribute):
/// * `value` - the text to clean.
/// * `allow_tab` (default `false`) - adds `'\t'` to the allow list.
/// * `allow_eol` (default `true`) - adds `'\n'` and `'\r'` to the allow list.
/// * `collapse_whitespace` (default `false`) - forwarded to `custom_normalization`.
/// * `remove_emojis` (default `false`) - forwarded to `custom_normalization`.
///
/// NOTE(review): this span is a rendered diff, so the previous one-argument
/// signature and call appear alongside the new five-argument ones.
#[pyfunction]
fn basic_string_clean(value: String) -> PyResult<String> {
Ok(custom_normalization(value, vec!['º', 'ª'], false, false)
#[pyo3(signature = (value, allow_tab=false, allow_eol=true, collapse_whitespace=false, remove_emojis=false))]
fn basic_string_clean(value: String, allow_tab: bool, allow_eol: bool, collapse_whitespace: bool, remove_emojis: bool) -> PyResult<String> {
// The ordinal indicators are always in the allow list; allow-listed
// characters are pushed unchanged by `custom_character_normalization`.
let mut allowed_chars = vec!['º', 'ª'];
if allow_tab {
allowed_chars.push('\t');
}
if allow_eol {
allowed_chars.push('\n');
allowed_chars.push('\r');
}

Ok(custom_normalization(value, allowed_chars, collapse_whitespace, remove_emojis)
// Strip leading and trailing whitespace from the normalized text.
.trim()
.to_string())
}
Expand Down

0 comments on commit 5677a3f

Please sign in to comment.