Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Writeable::write_cmp_bytes and use it in DataLocale and Locale #4402

Merged
merged 5 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion components/locid/src/extensions/unicode/keywords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ use core::borrow::Borrow;
use core::cmp::Ordering;
use core::iter::FromIterator;
use litemap::LiteMap;
use writeable::Writeable;

use super::Key;
use super::Value;
use crate::helpers::ShortSlice;
#[allow(deprecated)]
use crate::ordering::SubtagOrderingResult;

/// A list of [`Key`]-[`Value`] pairs representing functional information
Expand Down Expand Up @@ -303,7 +305,7 @@ impl Keywords {
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
self.write_cmp_bytes(other)
}

/// Compare this [`Keywords`] with an iterator of BCP-47 subtags.
Expand Down Expand Up @@ -340,6 +342,8 @@ impl Keywords {
/// kwds.strict_cmp_iter(subtags.iter().copied()).end()
/// );
/// ```
#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
#[allow(deprecated)]
pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
where
I: Iterator<Item = &'l [u8]>,
Expand Down
5 changes: 4 additions & 1 deletion components/locid/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use core::cmp::Ordering;
use core::str::FromStr;

#[allow(deprecated)]
use crate::ordering::SubtagOrderingResult;
use crate::parser::{
parse_language_identifier, parse_language_identifier_with_single_variant, ParserError,
Expand Down Expand Up @@ -199,7 +200,7 @@ impl LanguageIdentifier {
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
self.write_cmp_bytes(other)
}

/// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
Expand Down Expand Up @@ -235,6 +236,8 @@ impl LanguageIdentifier {
/// loc.strict_cmp_iter(subtags.iter().copied()).end()
/// );
/// ```
#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
#[allow(deprecated)]
pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
where
I: Iterator<Item = &'l [u8]>,
Expand Down
1 change: 1 addition & 0 deletions components/locid/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ mod parser;

pub use langid::LanguageIdentifier;
pub use locale::Locale;
#[allow(deprecated)]
pub use ordering::SubtagOrderingResult;
pub use parser::errors::ParserError;

Expand Down
5 changes: 4 additions & 1 deletion components/locid/src/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#[allow(deprecated)]
use crate::ordering::SubtagOrderingResult;
use crate::parser::{
parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
Expand Down Expand Up @@ -192,7 +193,7 @@ impl Locale {
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
self.write_cmp_bytes(other)
}

/// Compare this [`Locale`] with an iterator of BCP-47 subtags.
Expand Down Expand Up @@ -229,6 +230,8 @@ impl Locale {
/// loc.strict_cmp_iter(subtags.iter().copied()).end()
/// );
/// ```
#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
#[allow(deprecated)]
pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
where
I: Iterator<Item = &'l [u8]>,
Expand Down
4 changes: 4 additions & 0 deletions components/locid/src/ordering.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,17 @@ use core::cmp::Ordering;
/// [`Locale::strict_cmp_iter`]: crate::Locale::strict_cmp_iter
#[allow(clippy::exhaustive_enums)] // well-defined exhaustive enum semantics
#[derive(Debug)]
#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
pub enum SubtagOrderingResult<I> {
/// Potentially remaining subtags after the comparison operation.
#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
Subtags(I),
/// Resolved ordering between the locale object and the subtags.
#[deprecated(since = "1.5.0", note = "if you need this, please file an issue")]
Ordering(Ordering),
}

#[allow(deprecated)]
impl<I> SubtagOrderingResult<I>
where
I: Iterator,
Expand Down
11 changes: 11 additions & 0 deletions provider/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ serde_json = "1.0"
icu_provider_adapters = { path = "../../provider/adapters" }
icu_locid_transform = { path = "../../components/locid_transform" }

[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
criterion = "0.4"

[features]
bench = []
std = ["icu_locid/std"]
sync = []
experimental = []
Expand All @@ -75,3 +79,10 @@ datagen = ["serde", "dep:erased-serde", "dep:databake", "std", "sync"]
denylist = ["macros"]
# We have tons of features here, limit the amount of tests we run
max_combination_size = 3

[lib]
bench = false # This option is required for Benchmark CI

[[bench]]
name = "data_locale_bench"
harness = false
86 changes: 86 additions & 0 deletions provider/core/benches/data_locale_bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

extern crate alloc;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use icu_provider::prelude::*;
use std::str::FromStr;
use writeable::Writeable;

static BCP47_STRINGS: &[&str] = &[
"ca",
"ca-ES",
"ca-ES-u-ca-buddhist",
"ca-ES-valencia",
"ca-ES-x-gbp",
"ca-ES-x-gbp-short",
"ca-ES-x-usd",
"ca-ES-xyzabc",
"ca-x-eur",
"cat",
"pl-Latn-PL",
"und",
"und-fonipa",
"und-u-ca-hebrew",
"und-u-ca-japanese",
"und-x-mxn",
"zh",
];

fn overview_bench(c: &mut Criterion) {
c.bench_function("data_locale/overview", |b| {
b.iter(|| {
for s in black_box(BCP47_STRINGS).iter() {
let loc = DataLocale::from_str(s).unwrap();
let loc = loc.clone();
let s = loc.write_to_string();
loc.strict_cmp(s.as_bytes());
}
});
});

#[cfg(feature = "bench")]
data_locale_bench(c);
}

#[cfg(feature = "bench")]
fn data_locale_bench(c: &mut Criterion) {
c.bench_function("data_locale/parse", |b| {
b.iter(|| {
for s in black_box(BCP47_STRINGS).iter() {
DataLocale::from_str(s).unwrap();
}
});
});

let data_locales: Vec<DataLocale> = BCP47_STRINGS.iter().map(|s| s.parse().unwrap()).collect();

c.bench_function("data_locale/write_to_string", |b| {
b.iter(|| {
for loc in black_box(&data_locales).iter() {
loc.write_to_string();
}
});
});
c.bench_function("data_locale/clone", |b| {
b.iter(|| {
for loc in black_box(&data_locales).iter() {
let _ = loc.clone();
}
});
});
c.bench_function("data_locale/strict_cmp", |b| {
b.iter(|| {
for loc in black_box(&data_locales).iter() {
for s in black_box(BCP47_STRINGS).iter() {
loc.strict_cmp(s.as_bytes());
}
}
});
});
}

criterion_group!(benches, overview_bench,);
criterion_main!(benches);
48 changes: 2 additions & 46 deletions provider/core/src/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use core::hash::Hash;
use core::str::FromStr;
use icu_locid::extensions::unicode as unicode_ext;
use icu_locid::subtags::{Language, Region, Script, Variants};
use icu_locid::{LanguageIdentifier, Locale, SubtagOrderingResult};
use icu_locid::{LanguageIdentifier, Locale};
use writeable::{LengthHint, Writeable};

#[cfg(feature = "experimental")]
Expand Down Expand Up @@ -342,34 +342,7 @@ impl DataLocale {
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
let subtags = other.split(|b| *b == b'-');
let mut subtag_result = self.langid.strict_cmp_iter(subtags);
if self.has_unicode_ext() {
let mut subtags = match subtag_result {
SubtagOrderingResult::Subtags(s) => s,
SubtagOrderingResult::Ordering(o) => return o,
};
match subtags.next() {
Some(b"u") => (),
Some(s) => return s.cmp(b"u").reverse(),
None => return Ordering::Greater,
}
subtag_result = self.keywords.strict_cmp_iter(subtags);
}
#[cfg(feature = "experimental")]
if let Some(aux) = self.get_aux() {
let mut subtags = match subtag_result {
SubtagOrderingResult::Subtags(s) => s,
SubtagOrderingResult::Ordering(o) => return o,
};
match subtags.next() {
Some(b"x") => (),
Some(s) => return s.cmp(b"x").reverse(),
None => return Ordering::Greater,
}
subtag_result = aux.strict_cmp_iter(subtags);
}
subtag_result.end()
self.write_cmp_bytes(other)
}
}

Expand Down Expand Up @@ -987,23 +960,6 @@ impl AuxiliaryKeys {
})
}

pub(crate) fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
where
I: Iterator<Item = &'l [u8]>,
{
for subtag in self.value.split(Self::separator()) {
if let Some(other) = subtags.next() {
match subtag.as_bytes().cmp(other) {
Ordering::Equal => (),
not_equal => return SubtagOrderingResult::Ordering(not_equal),
}
} else {
return SubtagOrderingResult::Ordering(Ordering::Greater);
}
}
SubtagOrderingResult::Subtags(subtags)
}

/// Returns the internal separator byte used for auxiliary keys in data locales.
///
/// This is, according to BCP-47, an ASCII hyphen.
Expand Down
99 changes: 99 additions & 0 deletions utils/writeable/src/cmp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::cmp::Ordering;
use core::fmt;

pub(crate) struct WriteComparator<'a> {
string: &'a [u8],
result: Ordering,
}

/// This is an infallible impl. Functions always return Ok, not Err.
impl<'a> fmt::Write for WriteComparator<'a> {
#[inline]
fn write_str(&mut self, other: &str) -> fmt::Result {
if self.result != Ordering::Equal {
return Ok(());
}
let cmp_len = core::cmp::min(other.len(), self.string.len());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if other is longer than self we should be capping out, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If other is longer, then the only effect is that on the line below the remainder becomes empty and we compare the whole of other to the whole of self.string.

let (this, remainder) = self.string.split_at(cmp_len);
self.string = remainder;
self.result = this.cmp(other.as_bytes());
Ok(())
}
}

impl<'a> WriteComparator<'a> {
#[inline]
pub fn new(string: &'a (impl AsRef<[u8]> + ?Sized)) -> Self {
Self {
string: string.as_ref(),
result: Ordering::Equal,
}
}

#[inline]
pub fn finish(self) -> Ordering {
if matches!(self.result, Ordering::Equal) && !self.string.is_empty() {
// Self is longer than Other
Ordering::Greater
} else {
self.result
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use core::fmt::Write;

mod data {
include!("../tests/data/data.rs");
}

#[test]
fn test_write_char() {
for a in data::KEBAB_CASE_STRINGS {
for b in data::KEBAB_CASE_STRINGS {
let mut wc = WriteComparator::new(a);
for ch in b.chars() {
wc.write_char(ch).unwrap();
}
assert_eq!(a.cmp(b), wc.finish(), "{a} <=> {b}");
}
}
}

#[test]
fn test_write_str() {
for a in data::KEBAB_CASE_STRINGS {
for b in data::KEBAB_CASE_STRINGS {
let mut wc = WriteComparator::new(a);
wc.write_str(b).unwrap();
assert_eq!(a.cmp(b), wc.finish(), "{a} <=> {b}");
}
}
}

#[test]
fn test_mixed() {
for a in data::KEBAB_CASE_STRINGS {
for b in data::KEBAB_CASE_STRINGS {
let mut wc = WriteComparator::new(a);
let mut first = true;
for substr in b.split('-') {
if first {
first = false;
} else {
wc.write_char('-').unwrap();
}
wc.write_str(substr).unwrap();
}
assert_eq!(a.cmp(b), wc.finish(), "{a} <=> {b}");
}
}
}
}
Loading