diff --git a/Cargo.lock b/Cargo.lock index 6e90be1c440..2f9ec8cebc4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1882,6 +1882,7 @@ name = "icu_provider" version = "1.4.0" dependencies = [ "bincode", + "criterion", "databake", "displaydoc", "erased-serde", diff --git a/components/locid/src/extensions/unicode/keywords.rs b/components/locid/src/extensions/unicode/keywords.rs index c2839fa44f8..9c425acf07a 100644 --- a/components/locid/src/extensions/unicode/keywords.rs +++ b/components/locid/src/extensions/unicode/keywords.rs @@ -6,10 +6,12 @@ use core::borrow::Borrow; use core::cmp::Ordering; use core::iter::FromIterator; use litemap::LiteMap; +use writeable::Writeable; use super::Key; use super::Value; use crate::helpers::ShortSlice; +#[allow(deprecated)] use crate::ordering::SubtagOrderingResult; /// A list of [`Key`]-[`Value`] pairs representing functional information @@ -303,7 +305,7 @@ impl Keywords { /// } /// ``` pub fn strict_cmp(&self, other: &[u8]) -> Ordering { - self.strict_cmp_iter(other.split(|b| *b == b'-')).end() + self.write_cmp_bytes(other) } /// Compare this [`Keywords`] with an iterator of BCP-47 subtags. @@ -340,6 +342,8 @@ impl Keywords { /// kwds.strict_cmp_iter(subtags.iter().copied()).end() /// ); /// ``` + #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")] + #[allow(deprecated)] pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult where I: Iterator, diff --git a/components/locid/src/langid.rs b/components/locid/src/langid.rs index eac8c83713e..f0fdc50320a 100644 --- a/components/locid/src/langid.rs +++ b/components/locid/src/langid.rs @@ -5,6 +5,7 @@ use core::cmp::Ordering; use core::str::FromStr; +#[allow(deprecated)] use crate::ordering::SubtagOrderingResult; use crate::parser::{ parse_language_identifier, parse_language_identifier_with_single_variant, ParserError, @@ -199,7 +200,7 @@ impl LanguageIdentifier { /// } /// ``` pub fn strict_cmp(&self, other: &[u8]) -> Ordering { - self.strict_cmp_iter(other.split(|b| *b == b'-')).end() + self.write_cmp_bytes(other) } /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags. @@ -235,6 +236,8 @@ impl LanguageIdentifier { /// loc.strict_cmp_iter(subtags.iter().copied()).end() /// ); /// ``` + #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")] + #[allow(deprecated)] pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult where I: Iterator, diff --git a/components/locid/src/lib.rs b/components/locid/src/lib.rs index 9c6c46ca512..89e93545773 100644 --- a/components/locid/src/lib.rs +++ b/components/locid/src/lib.rs @@ -75,6 +75,7 @@ mod parser; pub use langid::LanguageIdentifier; pub use locale::Locale; +#[allow(deprecated)] pub use ordering::SubtagOrderingResult; pub use parser::errors::ParserError; diff --git a/components/locid/src/locale.rs b/components/locid/src/locale.rs index e87cdf1a205..4963fa76725 100644 --- a/components/locid/src/locale.rs +++ b/components/locid/src/locale.rs @@ -2,6 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +#[allow(deprecated)] use crate::ordering::SubtagOrderingResult; use crate::parser::{ parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, @@ -192,7 +193,7 @@ impl Locale { /// } /// ``` pub fn strict_cmp(&self, other: &[u8]) -> Ordering { - self.strict_cmp_iter(other.split(|b| *b == b'-')).end() + self.write_cmp_bytes(other) } /// Compare this [`Locale`] with an iterator of BCP-47 subtags. @@ -229,6 +230,8 @@ impl Locale { /// loc.strict_cmp_iter(subtags.iter().copied()).end() /// ); /// ``` + #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")] + #[allow(deprecated)] pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult where I: Iterator, diff --git a/components/locid/src/ordering.rs b/components/locid/src/ordering.rs index c877c60c395..982d7b9f93f 100644 --- a/components/locid/src/ordering.rs +++ b/components/locid/src/ordering.rs @@ -36,13 +36,17 @@ use core::cmp::Ordering; /// [`Locale::strict_cmp_iter`]: crate::Locale::strict_cmp_iter #[allow(clippy::exhaustive_enums)] // well-defined exhaustive enum semantics #[derive(Debug)] +#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")] pub enum SubtagOrderingResult { /// Potentially remaining subtags after the comparison operation. + #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")] Subtags(I), /// Resolved ordering between the locale object and the subtags. + #[deprecated(since = "1.5.0", note = "if you need this, please file an issue")] Ordering(Ordering), } +#[allow(deprecated)] impl SubtagOrderingResult where I: Iterator, diff --git a/provider/core/Cargo.toml b/provider/core/Cargo.toml index 32902d7aee0..f7f7d91cdfd 100644 --- a/provider/core/Cargo.toml +++ b/provider/core/Cargo.toml @@ -50,7 +50,11 @@ serde_json = "1.0" icu_provider_adapters = { path = "../../provider/adapters" } icu_locid_transform = { path = "../../components/locid_transform" } +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] +criterion = "0.4" + [features] +bench = [] std = ["icu_locid/std"] sync = [] experimental = [] @@ -75,3 +79,10 @@ datagen = ["serde", "dep:erased-serde", "dep:databake", "std", "sync"] denylist = ["macros"] # We have tons of features here, limit the amount of tests we run max_combination_size = 3 + +[lib] +bench = false # This option is required for Benchmark CI + +[[bench]] +name = "data_locale_bench" +harness = false diff --git a/provider/core/benches/data_locale_bench.rs b/provider/core/benches/data_locale_bench.rs new file mode 100644 index 00000000000..4a0fa0aad8c --- /dev/null +++ b/provider/core/benches/data_locale_bench.rs @@ -0,0 +1,86 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +extern crate alloc; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use icu_provider::prelude::*; +use std::str::FromStr; +use writeable::Writeable; + +static BCP47_STRINGS: &[&str] = &[ + "ca", + "ca-ES", + "ca-ES-u-ca-buddhist", + "ca-ES-valencia", + "ca-ES-x-gbp", + "ca-ES-x-gbp-short", + "ca-ES-x-usd", + "ca-ES-xyzabc", + "ca-x-eur", + "cat", + "pl-Latn-PL", + "und", + "und-fonipa", + "und-u-ca-hebrew", + "und-u-ca-japanese", + "und-x-mxn", + "zh", +]; + +fn overview_bench(c: &mut Criterion) { + c.bench_function("data_locale/overview", |b| { + b.iter(|| { + for s in black_box(BCP47_STRINGS).iter() { + let loc = DataLocale::from_str(s).unwrap(); + let loc = loc.clone(); + let s = loc.write_to_string(); + loc.strict_cmp(s.as_bytes()); + } + }); + }); + + #[cfg(feature = "bench")] + data_locale_bench(c); +} + +#[cfg(feature = "bench")] +fn data_locale_bench(c: &mut Criterion) { + c.bench_function("data_locale/parse", |b| { + b.iter(|| { + for s in black_box(BCP47_STRINGS).iter() { + DataLocale::from_str(s).unwrap(); + } + }); + }); + + let data_locales: Vec = BCP47_STRINGS.iter().map(|s| s.parse().unwrap()).collect(); + + c.bench_function("data_locale/write_to_string", |b| { + b.iter(|| { + for loc in black_box(&data_locales).iter() { + loc.write_to_string(); + } + }); + }); + c.bench_function("data_locale/clone", |b| { + b.iter(|| { + for loc in black_box(&data_locales).iter() { + let _ = loc.clone(); + } + }); + }); + c.bench_function("data_locale/strict_cmp", |b| { + b.iter(|| { + for loc in black_box(&data_locales).iter() { + for s in black_box(BCP47_STRINGS).iter() { + loc.strict_cmp(s.as_bytes()); + } + } + }); + }); +} + +criterion_group!(benches, overview_bench,); +criterion_main!(benches); diff --git a/provider/core/src/request.rs b/provider/core/src/request.rs index 99808a8a9d0..848503669c6 100644 --- a/provider/core/src/request.rs +++ b/provider/core/src/request.rs @@ -11,7 +11,7 @@ use core::hash::Hash; use core::str::FromStr; use icu_locid::extensions::unicode as unicode_ext; use icu_locid::subtags::{Language, Region, Script, Variants}; -use icu_locid::{LanguageIdentifier, Locale, SubtagOrderingResult}; +use icu_locid::{LanguageIdentifier, Locale}; use writeable::{LengthHint, Writeable}; #[cfg(feature = "experimental")] @@ -342,34 +342,7 @@ impl DataLocale { /// } /// ``` pub fn strict_cmp(&self, other: &[u8]) -> Ordering { - let subtags = other.split(|b| *b == b'-'); - let mut subtag_result = self.langid.strict_cmp_iter(subtags); - if self.has_unicode_ext() { - let mut subtags = match subtag_result { - SubtagOrderingResult::Subtags(s) => s, - SubtagOrderingResult::Ordering(o) => return o, - }; - match subtags.next() { - Some(b"u") => (), - Some(s) => return s.cmp(b"u").reverse(), - None => return Ordering::Greater, - } - subtag_result = self.keywords.strict_cmp_iter(subtags); - } - #[cfg(feature = "experimental")] - if let Some(aux) = self.get_aux() { - let mut subtags = match subtag_result { - SubtagOrderingResult::Subtags(s) => s, - SubtagOrderingResult::Ordering(o) => return o, - }; - match subtags.next() { - Some(b"x") => (), - Some(s) => return s.cmp(b"x").reverse(), - None => return Ordering::Greater, - } - subtag_result = aux.strict_cmp_iter(subtags); - } - subtag_result.end() + self.write_cmp_bytes(other) } } @@ -987,23 +960,6 @@ impl AuxiliaryKeys { }) } - pub(crate) fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult - where - I: Iterator, - { - for subtag in self.value.split(Self::separator()) { - if let Some(other) = subtags.next() { - match subtag.as_bytes().cmp(other) { - Ordering::Equal => (), - not_equal => return SubtagOrderingResult::Ordering(not_equal), - } - } else { - return SubtagOrderingResult::Ordering(Ordering::Greater); - } - } - SubtagOrderingResult::Subtags(subtags) - } - /// Returns the internal separator byte used for auxiliary keys in data locales. /// /// This is, according to BCP-47, an ASCII hyphen. diff --git a/utils/writeable/src/cmp.rs b/utils/writeable/src/cmp.rs new file mode 100644 index 00000000000..aa5e5e4efd8 --- /dev/null +++ b/utils/writeable/src/cmp.rs @@ -0,0 +1,99 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use core::cmp::Ordering; +use core::fmt; + +pub(crate) struct WriteComparator<'a> { + string: &'a [u8], + result: Ordering, +} + +/// This is an infallible impl. Functions always return Ok, not Err. +impl<'a> fmt::Write for WriteComparator<'a> { + #[inline] + fn write_str(&mut self, other: &str) -> fmt::Result { + if self.result != Ordering::Equal { + return Ok(()); + } + let cmp_len = core::cmp::min(other.len(), self.string.len()); + let (this, remainder) = self.string.split_at(cmp_len); + self.string = remainder; + self.result = this.cmp(other.as_bytes()); + Ok(()) + } +} + +impl<'a> WriteComparator<'a> { + #[inline] + pub fn new(string: &'a (impl AsRef<[u8]> + ?Sized)) -> Self { + Self { + string: string.as_ref(), + result: Ordering::Equal, + } + } + + #[inline] + pub fn finish(self) -> Ordering { + if matches!(self.result, Ordering::Equal) && !self.string.is_empty() { + // Self is longer than Other + Ordering::Greater + } else { + self.result + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use core::fmt::Write; + + mod data { + include!("../tests/data/data.rs"); + } + + #[test] + fn test_write_char() { + for a in data::KEBAB_CASE_STRINGS { + for b in data::KEBAB_CASE_STRINGS { + let mut wc = WriteComparator::new(a); + for ch in b.chars() { + wc.write_char(ch).unwrap(); + } + assert_eq!(a.cmp(b), wc.finish(), "{a} <=> {b}"); + } + } + } + + #[test] + fn test_write_str() { + for a in data::KEBAB_CASE_STRINGS { + for b in data::KEBAB_CASE_STRINGS { + let mut wc = WriteComparator::new(a); + wc.write_str(b).unwrap(); + assert_eq!(a.cmp(b), wc.finish(), "{a} <=> {b}"); + } + } + } + + #[test] + fn test_mixed() { + for a in data::KEBAB_CASE_STRINGS { + for b in data::KEBAB_CASE_STRINGS { + let mut wc = WriteComparator::new(a); + let mut first = true; + for substr in b.split('-') { + if first { + first = false; + } else { + wc.write_char('-').unwrap(); + } + wc.write_str(substr).unwrap(); + } + assert_eq!(a.cmp(b), wc.finish(), "{a} <=> {b}"); + } + } + } +} diff --git a/utils/writeable/src/impls.rs b/utils/writeable/src/impls.rs index a1410fb9486..4f0bc1abc65 100644 --- a/utils/writeable/src/impls.rs +++ b/utils/writeable/src/impls.rs @@ -116,6 +116,11 @@ impl Writeable for str { fn write_to_string(&self) -> Cow { Cow::Borrowed(self) } + + #[inline] + fn write_cmp_bytes(&self, other: &[u8]) -> core::cmp::Ordering { + self.as_bytes().cmp(other) + } } impl Writeable for String { @@ -133,6 +138,11 @@ impl Writeable for String { fn write_to_string(&self) -> Cow { Cow::Borrowed(self) } + + #[inline] + fn write_cmp_bytes(&self, other: &[u8]) -> core::cmp::Ordering { + self.as_bytes().cmp(other) + } } impl Writeable for &T { @@ -155,6 +165,11 @@ impl Writeable for &T { fn write_to_string(&self) -> Cow { (*self).write_to_string() } + + #[inline] + fn write_cmp_bytes(&self, other: &[u8]) -> core::cmp::Ordering { + (*self).write_cmp_bytes(other) + } } #[test] diff --git a/utils/writeable/src/lib.rs b/utils/writeable/src/lib.rs index 0eb6be8d6df..5f1e67f7880 100644 --- a/utils/writeable/src/lib.rs +++ b/utils/writeable/src/lib.rs @@ -66,6 +66,7 @@ extern crate alloc; +mod cmp; mod impls; mod ops; @@ -270,6 +271,52 @@ pub trait Writeable { let _ = self.write_to(&mut output); Cow::Owned(output) } + + /// Compares the contents of this `Writeable` to the given bytes + /// without allocating a String to hold the `Writeable` contents. + /// + /// This returns a lexicographical comparison, the same as if the Writeable + /// were first converted to a String and then compared with `Ord`. For a + /// locale-sensitive string ordering, use an ICU4X Collator. + /// + /// # Examples + /// + /// ``` + /// use core::cmp::Ordering; + /// use core::fmt; + /// use writeable::Writeable; + /// + /// struct WelcomeMessage<'s> { + /// pub name: &'s str, + /// } + /// + /// impl<'s> Writeable for WelcomeMessage<'s> { + /// // see impl in Writeable docs + /// # fn write_to(&self, sink: &mut W) -> fmt::Result { + /// # sink.write_str("Hello, ")?; + /// # sink.write_str(self.name)?; + /// # sink.write_char('!')?; + /// # Ok(()) + /// # } + /// } + /// + /// let message = WelcomeMessage { name: "Alice" }; + /// let message_str = message.write_to_string(); + /// + /// assert_eq!(Ordering::Equal, message.write_cmp_bytes(b"Hello, Alice!")); + /// + /// assert_eq!(Ordering::Greater, message.write_cmp_bytes(b"Alice!")); + /// assert_eq!(Ordering::Greater, (*message_str).cmp("Alice!")); + /// + /// assert_eq!(Ordering::Less, message.write_cmp_bytes(b"Hello, Bob!")); + /// assert_eq!(Ordering::Less, (*message_str).cmp("Hello, Bob!")); + /// ``` + fn write_cmp_bytes(&self, other: &[u8]) -> core::cmp::Ordering { + let mut wc = cmp::WriteComparator::new(other); + #[allow(clippy::unwrap_used)] // infallible impl + self.write_to(&mut wc).unwrap(); + wc.finish().reverse() + } } /// Implements [`Display`](core::fmt::Display) for types that implement [`Writeable`]. diff --git a/utils/writeable/tests/data/data.rs b/utils/writeable/tests/data/data.rs new file mode 100644 index 00000000000..bc9dab6cc0d --- /dev/null +++ b/utils/writeable/tests/data/data.rs @@ -0,0 +1,26 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#[allow(dead_code)] +pub const KEBAB_CASE_STRINGS: &[&str] = &[ + "ca", + "ca-ÉS", + "ca-ÉS-u-ca-buddhist", + "ca-ÉS-valencia", + "ca-ÉS-x-gbp", + "ca-ÉS-x-gbp-short", + "ca-ÉS-x-usd", + "ca-ÉS-xyzabc", + "ca-x-eur", + "cat", + "cat-bus", + "cat-🚐", + "pl-Latn-PL", + "und", + "und-fonipa", + "und-u-ca-hebrew", + "und-u-ca-japanese", + "und-x-mxn", + "zh", +];