diff --git a/src/emoji.rs b/src/emoji.rs index 5a5400b..1c003e4 100644 --- a/src/emoji.rs +++ b/src/emoji.rs @@ -1,57 +1,19 @@ -pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ - ("Emoji_Modifier", EMOJI_MODIFIER), - ("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE), - ("Emoji_Presentation", EMOJI_PRESENTATION), +pub const CHAR_TO_AVOID: &'static [(char, char)] = &[ + ('\u{0000}', '\u{001F}'), // Control chars 1 + ('\u{007F}', '\u{009F}'), // Control chars 2 + ('\u{FE00}', '\u{FE0F}'), // Variation Selectors + ('\u{20D0}', '\u{20FF}'), // Combining Diacritical Marks for Symbols + ('\u{2800}', '\u{28FF}'), // Braille Patterns + // ('\u{D800}', '\u{F8FF}'), // High Surrogates, High Private Use Surrogates, Low Surrogates and Private Use Area blocks + ('\u{E000}', '\u{F8FF}'), // Private Use Area blocks + ('\u{10000}', '\u{10FFFF}'), // Extra planes ]; -pub const EMOJI_MODIFIER: &'static [(char, char)] = &[('🏻', '🏿')]; -pub const EXTRA_CHARS: &'static [(char, char)] = - &[('\u{FE0E}', '\u{FE0F}'), ('\u{20E2}', '\u{20E4}')]; - -pub const EMOJI_MODIFIER_BASE: &'static [(char, char)] = &[ +pub const EMOJI: &'static [(char, char)] = &[ + ('🏻', '🏿'), ('☝', '☝'), ('β›Ή', 'β›Ή'), ('✊', '✍'), - ('πŸŽ…', 'πŸŽ…'), - ('πŸ‚', 'πŸ„'), - ('πŸ‡', 'πŸ‡'), - ('🏊', '🏌'), - ('πŸ‘‚', 'πŸ‘ƒ'), - ('πŸ‘†', 'πŸ‘'), - ('πŸ‘¦', 'πŸ‘Έ'), - ('πŸ‘Ό', 'πŸ‘Ό'), - ('πŸ’', 'πŸ’ƒ'), - ('πŸ’…', 'πŸ’‡'), - ('πŸ’', 'πŸ’'), - ('πŸ’‘', 'πŸ’‘'), - ('πŸ’ͺ', 'πŸ’ͺ'), - ('πŸ•΄', 'πŸ•΅'), - ('πŸ•Ί', 'πŸ•Ί'), - ('πŸ–', 'πŸ–'), - ('πŸ–•', 'πŸ––'), - ('πŸ™…', 'πŸ™‡'), - ('πŸ™‹', 'πŸ™'), - ('🚣', '🚣'), - ('🚴', '🚢'), - ('πŸ›€', 'πŸ›€'), - ('πŸ›Œ', 'πŸ›Œ'), - ('🀌', '🀌'), - ('🀏', '🀏'), - ('🀘', '🀟'), - ('🀦', '🀦'), - ('🀰', '🀹'), - ('🀼', '🀾'), - ('πŸ₯·', 'πŸ₯·'), - ('🦡', '🦢'), - ('🦸', '🦹'), - ('🦻', '🦻'), - ('🧍', '🧏'), - ('πŸ§‘', '🧝'), - ('πŸ«ƒ', 'πŸ«…'), - ('🫰', '🫸'), -]; - -pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[ ('⌚', 'βŒ›'), ('⏩', '⏬'), ('⏰', '⏰'), @@ -85,77 +47,24 @@ pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[ ('⬛', '⬜'), ('⭐', '⭐'), ('β­•', 'β­•'), - ('πŸ€„', 'πŸ€„'), - ('πŸƒ', 'πŸƒ'), - ('πŸ†Ž', 'πŸ†Ž'), - ('πŸ†‘', 'πŸ†š'), - ('πŸ‡¦', 'πŸ‡Ώ'), - ('🈁', '🈁'), - ('🈚', '🈚'), - ('🈯', '🈯'), - ('🈲', '🈢'), - ('🈸', '🈺'), - ('πŸ‰', 'πŸ‰‘'), - ('πŸŒ€', '🌠'), - ('🌭', '🌡'), - ('🌷', '🍼'), - ('🍾', 'πŸŽ“'), - ('🎠', '🏊'), - ('🏏', 'πŸ“'), - ('🏠', '🏰'), - ('🏴', '🏴'), - ('🏸', '🐾'), - ('πŸ‘€', 'πŸ‘€'), - ('πŸ‘‚', 'πŸ“Ό'), - ('πŸ“Ώ', 'πŸ”½'), - ('πŸ•‹', 'πŸ•Ž'), - ('πŸ•', 'πŸ•§'), - ('πŸ•Ί', 'πŸ•Ί'), - ('πŸ–•', 'πŸ––'), - ('πŸ–€', 'πŸ–€'), - ('πŸ—»', 'πŸ™'), - ('πŸš€', 'πŸ›…'), - ('πŸ›Œ', 'πŸ›Œ'), - ('πŸ›', 'πŸ›’'), - ('πŸ›•', 'πŸ›—'), - ('πŸ›œ', 'πŸ›Ÿ'), - ('πŸ›«', 'πŸ›¬'), - ('πŸ›΄', 'πŸ›Ό'), - ('🟠', '🟫'), - ('🟰', '🟰'), - ('🀌', '🀺'), - ('🀼', 'πŸ₯…'), - ('πŸ₯‡', '🧿'), - ('🩰', '🩼'), - ('πŸͺ€', 'πŸͺˆ'), - ('πŸͺ', 'πŸͺ½'), - ('πŸͺΏ', 'πŸ«…'), - ('🫎', 'πŸ«›'), - ('🫠', '🫨'), - ('🫰', '🫸'), ]; pub trait IsEmoji { fn is_emoji(&self) -> bool; + fn is_char_to_avoid(&self) -> bool; } impl IsEmoji for char { fn is_emoji(&self) -> bool { - for (lc, hc) in EMOJI_PRESENTATION { - if self >= lc && self <= hc { - return true; - } - } - for (lc, hc) in EMOJI_MODIFIER { + for (lc, hc) in EMOJI { if self >= lc && self <= hc { return true; } } - for (lc, hc) in EMOJI_MODIFIER_BASE { - if self >= lc && self <= hc { - return true; - } - } - for (lc, hc) in EXTRA_CHARS { + false + } + + fn is_char_to_avoid(&self) -> bool { + for (lc, hc) in CHAR_TO_AVOID { if self >= lc && self <= hc { return true; } diff --git a/src/lib.rs b/src/lib.rs index 0518b98..3a25989 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,7 +22,7 @@ fn custom_normalization( let mut result = String::with_capacity(str.len()); let mut previous_whitespace = false; for c in str.chars() { - custom_character_normalization( + previous_whitespace = custom_character_normalization( &mut result, c, &allow_chars, @@ -30,7 +30,6 @@ fn custom_normalization( previous_whitespace, remove_emojis, ); - previous_whitespace = c.is_whitespace(); } result.nfc().collect::() } @@ -42,30 +41,48 @@ fn custom_character_normalization( collapse_whitespace: bool, previous_whitespace: bool, remove_emojis: bool, -) { +) -> bool { if allow_chars.contains(&c) { - str.push(c) + str.push(c); + return false; } else if c.is_whitespace() { - if collapse_whitespace && previous_whitespace { - return; - } else { + if !collapse_whitespace || !previous_whitespace { str.push(' ') } + return true; } else if remove_emojis && c.is_emoji() { - return; + return previous_whitespace; + } + + let mut pushed = false; + decompose_compatible(c, |r| { + // Ignore characters outside the Basic Multilingual Plane, Control chars, etc + if !r.is_char_to_avoid() { + str.push(r); + pushed = true; + } + }); + + if pushed { + false } else { - decompose_compatible(c, |r| { - // Ignore characters outside the Basic Multilingual Plane and in the disallow_chars set - if r <= '\u{FFFF}' { - str.push(r) - } - }) + previous_whitespace } } #[pyfunction] -fn basic_string_clean(value: String) -> PyResult { - Ok(custom_normalization(value, vec!['ΒΊ', 'Βͺ'], false, false) +#[pyo3(signature = (value, allow_tab=false, allow_eol=true, collapse_whitespace=false, remove_emojis=false))] +fn basic_string_clean(value: String, allow_tab: bool, allow_eol: bool, collapse_whitespace: bool, remove_emojis: bool) -> PyResult { + let mut allowed_chars = vec!['ΒΊ', 'Βͺ']; + if allow_tab { + allowed_chars.push('\t'); + } + if allow_eol { + allowed_chars.push('\n'); + allowed_chars.push('\r'); + } + + Ok(custom_normalization(value, allowed_chars, collapse_whitespace, remove_emojis) .trim() .to_string()) }