Remove AsRef and instead introduce Cow-returning canonicalize methods on locale/langid
Manishearth · Oct 23, 2024 · 1ab3fde · 1ab3fde
1 parent d760116
commit 1ab3fde
Show file tree

Hide file tree

Showing 3 changed files with 83 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@
     - New crate
     - Allow `LocaleDirectionality` to wrap a `LocaleExpander` with user-controlled storage (https://github.com/unicode-org/icu4x/pull/5704)
     - Allow `LocaleCanonicalizer` to wrap a `LocaleExpander` with user-controlled storage (https://github.com/unicode-org/icu4x/pull/5718)
+    - Split `canonicalize()` on `Locale` and `LanguageIdentifier` into `canonicalize()` and `canonicalize_utf8()`, both of which return a `Cow` (https://github.com/unicode-org/icu4x/pull/5727)
   - `icu_locale_core`
     - New crate, renamed from `icu_locid`
     - Removed `Ord` and `PartialOrd` impl from `extensions::unicode::Unicode` (https://github.com/unicode-org/icu4x/pull/5617)

diff --git a/components/locale_core/src/langid.rs b/components/locale_core/src/langid.rs
@@ -10,7 +10,7 @@ use crate::parser::{
     ParserMode, SubtagIterator,
 };
 use crate::subtags;
-use alloc::string::String;
+use alloc::borrow::Cow;
 use writeable::Writeable;
 
 /// A core struct representing a [`Unicode BCP47 Language Identifier`].
@@ -168,6 +168,8 @@ impl LanguageIdentifier {
             && self.variants.is_empty()
     }
 
+    /// Canonicalize the language identifier (operating on UTF-8 formatted byte slices)
+    ///
     /// This is a best-effort operation that performs all available levels of canonicalization.
     ///
     /// At the moment the operation will normalize casing and the separator, but in the future
@@ -183,9 +185,46 @@ impl LanguageIdentifier {
     ///     Ok("pl-Latn-PL")
     /// );
     /// ```
-    pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParseError> {
+    pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
         let lang_id = Self::try_from_utf8(input.as_ref())?;
-        Ok(lang_id.write_to_string().into_owned())
+        let cow = lang_id.write_to_string();
+        if cow.as_bytes() == input {
+            if let Ok(s) = core::str::from_utf8(input) {
+                Ok(s.into())
+            } else {
+                Ok(cow.into_owned().into())
+            }
+        } else {
+            Ok(cow.into_owned().into())
+        }
+    }
+
+    /// Canonicalize the language identifier (operating on strings)
+    ///
+    /// This is a best-effort operation that performs all available levels of canonicalization.
+    ///
+    /// At the moment the operation will normalize casing and the separator, but in the future
+    /// it may also validate and update from deprecated subtags to canonical ones.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locale::LanguageIdentifier;
+    ///
+    /// assert_eq!(
+    ///     LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
+    ///     Ok("pl-Latn-PL")
+    /// );
+    /// ```
+    pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
+        let lang_id = Self::try_from_str(input.as_ref())?;
+        let cow = lang_id.write_to_string();
+
+        if cow == input {
+            Ok(input.into())
+        } else {
+            Ok(cow.into_owned().into())
+        }
     }
 
     /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.

diff --git a/components/locale_core/src/locale.rs b/components/locale_core/src/locale.rs
@@ -8,7 +8,7 @@ use crate::parser::{
 };
 use crate::subtags::Subtag;
 use crate::{extensions, subtags, LanguageIdentifier};
-use alloc::string::String;
+use alloc::borrow::Cow;
 use core::cmp::Ordering;
 use core::str::FromStr;
 use writeable::Writeable;
@@ -152,6 +152,8 @@ impl Locale {
         }
     }
 
+    /// Canonicalize the locale (operating on UTF-8 formatted byte slices)
+    ///
     /// This is a best-effort operation that performs all available levels of canonicalization.
     ///
     /// At the moment the operation will normalize casing and the separator, but in the future
@@ -163,13 +165,47 @@ impl Locale {
     /// use icu::locale::Locale;
     ///
     /// assert_eq!(
+    ///     Locale::canonicalize_utf8(b"pL_latn_pl-U-HC-H12").as_deref(),
+    ///     Ok("pl-Latn-PL-u-hc-h12")
+    /// );
+    /// ```
+    pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
+        let locale = Self::try_from_utf8(input)?;
+        let cow = locale.write_to_string();
+        if cow.as_bytes() == input {
+            if let Ok(s) = core::str::from_utf8(input) {
+                Ok(s.into())
+            } else {
+                Ok(cow.into_owned().into())
+            }
+        } else {
+            Ok(cow.into_owned().into())
+        }
+    }
+
+    /// Canonicalize the locale (operating on strings)
+    ///
+    /// This is a best-effort operation that performs all available levels of canonicalization.
+    ///
+    /// At the moment the operation will normalize casing and the separator, but in the future
+    /// it may also validate and update from deprecated subtags to canonical ones.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locale::Locale;
+    ///
+    /// assert_eq!(
     ///     Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
     ///     Ok("pl-Latn-PL-u-hc-h12")
     /// );
     /// ```
-    pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParseError> {
-        let locale = Self::try_from_utf8(input.as_ref())?;
-        Ok(locale.write_to_string().into_owned())
+    pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
+        let locale = Self::try_from_str(input)?;
+        let cow = locale.write_to_string();
+        if cow == input {
+            Ok(input.into())
+        } else {
+            Ok(cow.into_owned().into())
+        }
     }
 
     /// Compare this [`Locale`] with BCP-47 bytes.