Skip to content

Commit

Permalink
Remove AsRef and instead introduce Cow-returning canonicalize methods…
Browse files Browse the repository at this point in the history
… on locale/langid
  • Loading branch information
Manishearth committed Oct 23, 2024
1 parent d760116 commit 1ab3fde
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- New crate
- Allow `LocaleDirectionality` to wrap a `LocaleExpander` with user-controlled storage (https://github.com/unicode-org/icu4x/pull/5704)
- Allow `LocaleCanonicalizer` to wrap a `LocaleExpander` with user-controlled storage (https://github.com/unicode-org/icu4x/pull/5718)
- Split `canonicalize()` on `Locale` and `LanguageIdentifier` into `canonicalize()` and `canonicalize_utf8()`, and have both return a `Cow` (https://github.com/unicode-org/icu4x/pull/5727)
- `icu_locale_core`
- New crate, renamed from `icu_locid`
- Removed `Ord` and `PartialOrd` impl from `extensions::unicode::Unicode` (https://github.com/unicode-org/icu4x/pull/5617)
Expand Down
45 changes: 42 additions & 3 deletions components/locale_core/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::parser::{
ParserMode, SubtagIterator,
};
use crate::subtags;
use alloc::string::String;
use alloc::borrow::Cow;
use writeable::Writeable;

/// A core struct representing a [`Unicode BCP47 Language Identifier`].
Expand Down Expand Up @@ -168,6 +168,8 @@ impl LanguageIdentifier {
&& self.variants.is_empty()
}

/// Canonicalize the language identifier (operating on UTF-8 formatted byte slices)
///
/// This is a best-effort operation that performs all available levels of canonicalization.
///
/// At the moment the operation will normalize casing and the separator, but in the future
Expand All @@ -183,9 +185,46 @@ impl LanguageIdentifier {
/// Ok("pl-Latn-PL")
/// );
/// ```
pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParseError> {
pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
    // Parsing validates the input; any syntax error is propagated to the caller.
    let lang_id = Self::try_from_utf8(input)?;
    let canonical = lang_id.write_to_string();

    // If the input is already byte-for-byte canonical, return a borrow of it
    // instead of allocating. The comparison against a `str` already implies
    // the bytes are valid UTF-8, so the `from_utf8` check is expected to
    // succeed; it is kept only to obtain a `&str` without `unsafe`.
    if canonical.as_bytes() == input {
        if let Ok(already_canonical) = core::str::from_utf8(input) {
            return Ok(Cow::Borrowed(already_canonical));
        }
    }

    // `canonical` may borrow from `lang_id`, which is local to this function,
    // so it has to be converted to an owned string before being returned.
    Ok(Cow::Owned(canonical.into_owned()))
}

/// Canonicalize the language identifier (operating on strings)
///
/// This is a best-effort operation that performs all available levels of canonicalization.
///
/// At the moment the operation will normalize casing and the separator, but in the future
/// it may also validate and update from deprecated subtags to canonical ones.
///
/// If `input` is already in canonical form, the returned [`Cow`] borrows from `input`
/// and no allocation for the result takes place; otherwise an owned string is returned.
///
/// # Errors
///
/// Returns a [`ParseError`] if `input` cannot be parsed as a language identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// assert_eq!(
///     LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
///     Ok("pl-Latn-PL")
/// );
/// ```
pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
    let lang_id = Self::try_from_str(input)?;
    let canonical = lang_id.write_to_string();

    if canonical == input {
        // Already canonical: hand the caller's own slice back.
        Ok(Cow::Borrowed(input))
    } else {
        // `canonical` may borrow from the local `lang_id`, so detach it.
        Ok(Cow::Owned(canonical.into_owned()))
    }
}

/// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
Expand Down
44 changes: 40 additions & 4 deletions components/locale_core/src/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::parser::{
};
use crate::subtags::Subtag;
use crate::{extensions, subtags, LanguageIdentifier};
use alloc::string::String;
use alloc::borrow::Cow;
use core::cmp::Ordering;
use core::str::FromStr;
use writeable::Writeable;
Expand Down Expand Up @@ -152,6 +152,8 @@ impl Locale {
}
}

/// Canonicalize the locale (operating on UTF-8 formatted byte slices)
///
/// This is a best-effort operation that performs all available levels of canonicalization.
///
/// At the moment the operation will normalize casing and the separator, but in the future
Expand All @@ -163,13 +165,47 @@ impl Locale {
/// use icu::locale::Locale;
///
/// assert_eq!(
/// Locale::canonicalize_utf8(b"pL_latn_pl-U-HC-H12").as_deref(),
/// Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
    // A parse failure is returned to the caller unchanged.
    let parsed = Self::try_from_utf8(input)?;
    let rendered = parsed.write_to_string();

    // Reuse the caller's bytes only when they already match the canonical
    // rendering exactly and can be viewed as a `&str`.
    let reusable = if rendered.as_bytes() == input {
        core::str::from_utf8(input).ok()
    } else {
        None
    };

    Ok(match reusable {
        Some(unchanged) => Cow::Borrowed(unchanged),
        // Detach the rendering from the local `parsed` value before returning.
        None => Cow::Owned(rendered.into_owned()),
    })
}

/// Canonicalize the locale (operating on strings)
///
/// This is a best-effort operation that performs all available levels of canonicalization.
///
/// At the moment the operation will normalize casing and the separator, but in the future
/// it may also validate and update from deprecated subtags to canonical ones.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// assert_eq!(
/// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
/// Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParseError> {
let locale = Self::try_from_utf8(input.as_ref())?;
Ok(locale.write_to_string().into_owned())
pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
    // A parse failure is returned to the caller unchanged.
    let parsed = Self::try_from_str(input)?;
    let rendered = parsed.write_to_string();

    // Input that is already canonical is returned as a borrow of itself.
    if rendered == input {
        return Ok(Cow::Borrowed(input));
    }

    // Otherwise return an owned copy, detached from the local `parsed` value.
    Ok(Cow::Owned(rendered.into_owned()))
}

/// Compare this [`Locale`] with BCP-47 bytes.
Expand Down

0 comments on commit 1ab3fde

Please sign in to comment.