Skip to content

Commit

Permalink
More casemapping fixes (#3544)
Browse files Browse the repository at this point in the history
* Add Writeable methods

* Proper docs for ExceptionBitsULE

* Remove usages of std

* Boilerplate lints and stuff

* Expose add_case_closure

* allow(unused) for validate()

* fix

* clippy
  • Loading branch information
Manishearth authored Jun 16, 2023
1 parent 0840ff2 commit 244a499
Show file tree
Hide file tree
Showing 13 changed files with 248 additions and 102 deletions.
3 changes: 2 additions & 1 deletion experimental/casemapping/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ readme = "README.md"
repository = "https://github.com/unicode-org/icu4x"
homepage = "https://icu4x.unicode.org"
license = "Unicode-DFS-2016"
categories = ["internationalization"]
# Keep this in sync with other crates unless there are exceptions
include = [
"src/**/*",
Expand Down Expand Up @@ -47,5 +48,5 @@ icu_casemapping_data = { path = "data", optional = true }
[features]
std = ["icu_collections/std", "icu_provider/std"]
serde = ["dep:serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde"]
datagen = ["serde", "std", "dep:databake", "zerovec/databake", "icu_collections/databake"]
datagen = ["serde", "dep:databake", "zerovec/databake", "icu_collections/databake"]
data = ["dep:icu_casemapping_data"]
116 changes: 105 additions & 11 deletions experimental/casemapping/src/casemapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
use crate::internals::{CaseMapLocale, FoldOptions};
use crate::provider::data::MappingKind;
use crate::provider::CaseMappingV1Marker;
use crate::set::ClosureSet;
use alloc::string::String;
use icu_locid::Locale;
use icu_provider::prelude::*;
use writeable::Writeable;
Expand All @@ -18,7 +20,7 @@ use writeable::Writeable;
/// of the icu meta-crate. Use with caution.
/// <a href="https://github.com/unicode-org/icu4x/issues/2535">#2535</a>
/// </div>
#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct CaseMapping {
data: DataPayload<CaseMappingV1Marker>,
locale: CaseMapLocale,
Expand Down Expand Up @@ -139,53 +141,145 @@ impl CaseMapping {
.simple_fold(c, FoldOptions::with_turkic_mappings())
}

/// Returns the full lowercase mapping of the given string.
/// Returns the full lowercase mapping of the given string as a [`Writeable`].
/// This function is context and locale sensitive.
pub fn to_full_lowercase(&self, src: &str) -> String {
///
/// See [`Self::to_full_lowercase_string()`] for the equivalent convenience function that returns a String
pub fn to_full_lowercase<'a>(&'a self, src: &'a str) -> impl Writeable + 'a {
self.data
.get()
.full_helper_writeable(src, self.locale, MappingKind::Lower)
}

/// Maps `src` to its full uppercase form, handing the result back as a [`Writeable`].
/// The mapping is sensitive to both context and locale.
///
/// If an owned String is wanted instead, use the convenience function
/// [`Self::to_full_uppercase_string()`].
pub fn to_full_uppercase<'a>(&'a self, src: &'a str) -> impl Writeable + 'a {
    let mappings = self.data.get();
    mappings.full_helper_writeable(src, self.locale, MappingKind::Upper)
}

/// Maps `src` to its full titlecase form, handing the result back as a [`Writeable`].
/// The mapping is sensitive to both context and locale.
///
/// If an owned String is wanted instead, use the convenience function
/// [`Self::to_full_titlecase_string()`].
pub fn to_full_titlecase<'a>(&'a self, src: &'a str) -> impl Writeable + 'a {
    let mappings = self.data.get();
    mappings.full_helper_writeable(src, self.locale, MappingKind::Title)
}

/// Applies case folding to the characters of `src`, handing the result back
/// as a [`Writeable`].
/// Folding does not depend on the locale and is insensitive to context.
///
/// If an owned String is wanted instead, use the convenience function
/// [`Self::full_fold_string()`].
pub fn full_fold<'a>(&'a self, src: &'a str) -> impl Writeable + 'a {
    let mappings = self.data.get();
    // Folding always runs with the root locale, regardless of `self.locale`.
    mappings.full_helper_writeable(src, CaseMapLocale::Root, MappingKind::Fold)
}

/// Applies case folding to the characters of `src` with the Turkic (T) mappings
/// for dotted/dotless I, handing the result back as a [`Writeable`].
/// Folding does not depend on the locale and is insensitive to context.
///
/// If an owned String is wanted instead, use the convenience function
/// [`Self::full_fold_turkic_string()`].
pub fn full_fold_turkic<'a>(&'a self, src: &'a str) -> impl Writeable + 'a {
    let mappings = self.data.get();
    // The Turkic variant is selected by passing the Turkish locale to the helper.
    mappings.full_helper_writeable(src, CaseMapLocale::Turkish, MappingKind::Fold)
}

/// Returns the full lowercase mapping of the given string as a String.
/// This function is context and locale sensitive.
///
/// See [`Self::to_full_lowercase()`] for the equivalent lower-level function that returns a [`Writeable`]
pub fn to_full_lowercase_string(&self, src: &str) -> String {
self.data
.get()
.full_helper_writeable(src, self.locale, MappingKind::Lower)
.write_to_string()
.into_owned()
}

/// Returns the full uppercase mapping of the given string.
/// Returns the full uppercase mapping of the given string as a String.
/// This function is context and locale sensitive.
pub fn to_full_uppercase(&self, src: &str) -> String {
///
/// See [`Self::to_full_uppercase()`] for the equivalent lower-level function that returns a [`Writeable`]
pub fn to_full_uppercase_string(&self, src: &str) -> String {
self.data
.get()
.full_helper_writeable(src, self.locale, MappingKind::Upper)
.write_to_string()
.into_owned()
}

/// Returns the full titlecase mapping of the given string.
/// Returns the full titlecase mapping of the given string as a String.
/// This function is context and locale sensitive.
pub fn to_full_titlecase(&self, src: &str) -> String {
///
/// See [`Self::to_full_titlecase()`] for the equivalent lower-level function that returns a [`Writeable`]
pub fn to_full_titlecase_string(&self, src: &str) -> String {
self.data
.get()
.full_helper_writeable(src, self.locale, MappingKind::Title)
.write_to_string()
.into_owned()
}

/// Case-folds the characters in the given string.
/// Case-folds the characters in the given string as a String.
/// This function is locale-independent and context-insensitive.
pub fn full_fold(&self, src: &str) -> String {
///
/// See [`Self::full_fold()`] for the equivalent lower-level function that returns a [`Writeable`]
pub fn full_fold_string(&self, src: &str) -> String {
self.data
.get()
.full_helper_writeable(src, CaseMapLocale::Root, MappingKind::Fold)
.write_to_string()
.into_owned()
}

/// Case-folds the characters in the given string, using Turkic (T) mappings for dotted/dotless I.
/// Case-folds the characters in the given string as a String,
/// using Turkic (T) mappings for dotted/dotless I.
/// This function is locale-independent and context-insensitive.
pub fn full_fold_turkic(&self, src: &str) -> String {
///
/// See [`Self::full_fold_turkic()`] for the equivalent lower-level function that returns a [`Writeable`]
pub fn full_fold_turkic_string(&self, src: &str) -> String {
self.data
.get()
.full_helper_writeable(src, CaseMapLocale::Turkish, MappingKind::Fold)
.write_to_string()
.into_owned()
}

/// Collects the case closure of `c` into `set`: all simple case mappings and
/// the full case folding of `c`, together with the special case closure mappings.
///
/// Put differently, `set` receives all strings/characters that `c` casemaps to,
/// as well as all characters that may casemap to `c`.
///
/// `c` itself is not added.
///
/// For example, the mappings
/// - for s include long s
/// - for sharp s include ss
/// - for k include the Kelvin sign
pub fn add_case_closure<S: ClosureSet>(&self, c: char, set: &mut S) {
    let mappings = self.data.get();
    mappings.add_case_closure(c, set);
}

/// Maps `s` to single code points and, where such code points exist, adds
/// their case closure mappings to `set`.
///
/// `s` is mapped to code points when it is their full case folding string;
/// that is, a reverse full case folding is performed and the case closure
/// items of the resulting code points are then added.
/// When `s` is found and its closure applied, `s` itself is added too, as
/// part of its code points' closure.
///
/// Returns true if the string was found
pub fn add_string_case_closure<S: ClosureSet>(&self, s: &str, set: &mut S) -> bool {
    let mappings = self.data.get();
    mappings.add_string_case_closure(s, set)
}
}
1 change: 1 addition & 0 deletions experimental/casemapping/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use icu_collections::codepointtrie::CodePointTrieError;
/// <a href="https://github.com/unicode-org/icu4x/issues/2535">#2535</a>
/// </div>
#[derive(Clone, Display, Debug, PartialEq)]
#[non_exhaustive]
pub enum Error {
/// An error occurred while building and validating the data
#[displaydoc("Failed to validate: {0}")]
Expand Down
68 changes: 22 additions & 46 deletions experimental/casemapping/src/internals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
use crate::provider::data::{DotType, MappingKind};
use crate::provider::exception_helpers::ExceptionSlot;
use crate::provider::CaseMappingV1;
use icu_collections::codepointinvlist::CodePointInversionListBuilder;
use crate::set::ClosureSet;
use core::fmt;
use icu_locid::Locale;
use std::fmt;
use writeable::Writeable;

// Used to control the behavior of CaseMapping::fold.
Expand Down Expand Up @@ -340,14 +340,15 @@ impl<'data> CaseMappingV1<'data> {
}

impl<'a> Writeable for FullCaseWriteable<'a> {
#[allow(clippy::indexing_slicing)] // last_uncopied_index and i are known to be in bounds
fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
// To speed up the copying of long runs where nothing changes, we keep track
// of the start of the uncopied chunk, and don't copy it until we have to.
let mut last_uncopied_idx = 0;

let src = self.src;
for (i, c) in src.char_indices() {
let context = ContextIterator::new(src, i);
let context = ContextIterator::new(&src[..i], &src[i..]);
match self.data.full_helper(c, context, self.locale, self.mapping) {
FullMappingResult::CodePoint(c2) => {
if c == c2 {
Expand Down Expand Up @@ -386,14 +387,14 @@ impl<'data> CaseMappingV1<'data> {
}
}

// Adds all simple case mappings and the full case folding for `c` to `set`.
// Also adds special case closure mappings.
// The character itself is not added.
// For example, the mappings
// - for s include long s
// - for sharp s include ss
// - for k include the Kelvin sign
fn add_case_closure<S: ClosureSet>(&self, c: char, set: &mut S) {
/// Adds all simple case mappings and the full case folding for `c` to `set`.
/// Also adds special case closure mappings.
/// The character itself is not added.
/// For example, the mappings
/// - for s include long s
/// - for sharp s include ss
/// - for k include the Kelvin sign
pub(crate) fn add_case_closure<S: ClosureSet>(&self, c: char, set: &mut S) {
// Hardcode the case closure of i and its relatives and ignore the
// data file data for these characters.
// The Turkic dotless i and dotted I with their case mapping conditions
Expand Down Expand Up @@ -461,17 +462,11 @@ impl<'data> CaseMappingV1<'data> {
exception.add_full_and_closure_mappings(set);
}

// Maps the string to single code points and adds the associated case closure
// mappings.
// The string is mapped to code points if it is their full case folding string.
// In other words, this performs a reverse full case folding and then
// adds the case closure items of the resulting code points.
// If the string is found and its closure applied, then
// the string itself is added as well as part of its code points' closure.
//
// Returns true if the string was found
#[allow(dead_code)]
fn add_string_case_closure<S: ClosureSet>(&self, s: &str, set: &mut S) -> bool {
/// Maps the string to single code points and adds the associated case closure
/// mappings.
///
/// (see docs on CaseMapping::add_string_case_closure)
pub(crate) fn add_string_case_closure<S: ClosureSet>(&self, s: &str, set: &mut S) -> bool {
if s.chars().count() <= 1 {
// The string is too short to find any match.
return false;
Expand All @@ -492,7 +487,7 @@ impl<'data> CaseMappingV1<'data> {
// An internal representation of locale. Non-Root values of this
// enumeration imply that hard-coded special cases exist for this
// language.
#[derive(Copy, Clone, Eq, PartialEq)]
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum CaseMapLocale {
Root,
Turkish,
Expand Down Expand Up @@ -539,36 +534,17 @@ impl<'a> FullMappingResult<'a> {
}
}

// Interface for adding items to a closure set.
// The case closure routines are generic over this trait and report each
// character or string they find through the two methods below.
pub trait ClosureSet {
/// Add a character to the set
fn add_char(&mut self, c: char);
/// Add a string to the set
fn add_string(&mut self, string: &str);
}

// Lets a CodePointInversionListBuilder be used as the target of a case closure.
impl ClosureSet for CodePointInversionListBuilder {
// NOTE(review): presumably this call resolves to the builder's inherent
// `add_char` (inherent methods take precedence over trait methods) rather
// than recursing into this trait method — confirm against the builder's API.
fn add_char(&mut self, c: char) {
self.add_char(c)
}

// The current version of CodePointInversionList doesn't include strings.
// Trying to add a string is a no-op that will be optimized away.
#[inline]
fn add_string(&mut self, _string: &str) {}
}

// The text surrounding one character of a string, split into the part that
// precedes the character and the part that follows it.
pub(crate) struct ContextIterator<'a> {
// Text preceding the character.
before: &'a str,
// Text following the character; the constructor skips the character itself.
after: &'a str,
}

impl<'a> ContextIterator<'a> {
// Returns a context iterator with the characters before
// and after the character at a given index.
pub fn new(s: &'a str, idx: usize) -> Self {
let before = &s[..idx];
let mut char_and_after = s[idx..].chars();
// and after the character at a given index, given the preceding
// string and the succeeding string including the character itself
pub fn new(before: &'a str, char_and_after: &'a str) -> Self {
let mut char_and_after = char_and_after.chars();
char_and_after.next(); // skip the character itself
let after = char_and_after.as_str();
Self { before, after }
Expand Down
18 changes: 18 additions & 0 deletions experimental/casemapping/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,33 @@
//!
//! [`ICU4X`]: ../icu/index.html

// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations_
#![cfg_attr(not(any(test, feature = "std")), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]

extern crate alloc;

mod casemapping;
pub mod provider;
mod set;

mod error;
mod internals;

pub use casemapping::CaseMapping;
pub use error::Error as CaseMappingError;
pub use set::ClosureSet;
#[doc(no_inline)]
pub use CaseMappingError as Error;
6 changes: 3 additions & 3 deletions experimental/casemapping/src/provider/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

//! The primary per-codepoint casefolding data

#[cfg(feature = "datagen")]
use alloc::collections::BTreeMap;
use core::num::TryFromIntError;
use icu_collections::codepointtrie::TrieValue;
#[cfg(feature = "datagen")]
use std::collections::HashMap;
use zerovec::ule::{AsULE, RawBytesULE, ULE};
use zerovec::ZeroVecError;

Expand Down Expand Up @@ -258,7 +258,7 @@ impl CaseMappingData {
// a mapping from old to new, this function updates the exception
// index if necessary.
#[cfg(feature = "datagen")]
pub(crate) fn with_updated_exception(self, updates: &HashMap<u16, u16>) -> Self {
pub(crate) fn with_updated_exception(self, updates: &BTreeMap<u16, u16>) -> Self {
let kind = if let CaseMappingDataKind::Exception(ty, index) = self.kind {
if let Some(updated_exception) = updates.get(&index) {
CaseMappingDataKind::Exception(ty, *updated_exception)
Expand Down
Loading

0 comments on commit 244a499

Please sign in to comment.