diff --git a/unic-langid-impl/Cargo.toml b/unic-langid-impl/Cargo.toml index 66c6107b..3cb9f4a6 100644 --- a/unic-langid-impl/Cargo.toml +++ b/unic-langid-impl/Cargo.toml @@ -9,6 +9,9 @@ repository = "https://github.com/zbraniecki/unic-locale" license = "MIT/Apache-2.0" categories = ["internationalization"] +[dependencies] +tinystr = "0.1" + [dev-dependencies] criterion = "0.2" serde = { version = "1.0", features = ["derive"] } diff --git a/unic-langid-impl/benches/langid.rs b/unic-langid-impl/benches/langid.rs index ab1d2bb7..5812772d 100644 --- a/unic-langid-impl/benches/langid.rs +++ b/unic-langid-impl/benches/langid.rs @@ -1,95 +1,95 @@ use criterion::criterion_group; use criterion::criterion_main; use criterion::Criterion; +use criterion::Fun; +use tinystr::{TinyStr4, TinyStr8}; use unic_langid_impl::LanguageIdentifier; -fn language_identifier_from_str_bench(c: &mut Criterion) { - let strings = &[ - "en-US", - "en-GB", - "es-AR", - "it", - "zh-Hans-CN", - "de-AT", - "pl", - "fr-FR", - "de-AT", - "sr-Cyrl-SR", - "nb-NO", - "fr-FR", - "mk", - "uk", - ]; - c.bench_function("language_identifier_from_str", move |b| { - b.iter(|| { - for s in strings { - let _: Result = s.parse(); - } - }) - }); -} +static STRINGS: &[&str] = &[ + "en-US", + "en-GB", + "es-AR", + "it", + "zh-Hans-CN", + "de-AT", + "pl", + "fr-FR", + "de-AT", + "sr-Cyrl-SR", + "nb-NO", + "fr-FR", + "mk", + "uk", +]; -fn language_identifier_from_parts_bench(c: &mut Criterion) { - let entries: Vec<(Option<&str>, Option<&str>, Option<&str>, Option<&[&&str]>)> = vec![ - (Some("en"), None, Some("US"), None), - (Some("en"), None, Some("GB"), None), - (Some("es"), None, Some("AR"), None), - (Some("it"), None, None, None), - (Some("zh"), Some("Hans"), Some("CN"), None), - (Some("de"), None, Some("AT"), None), - (Some("pl"), None, None, None), - (Some("fr"), None, Some("FR"), None), - (Some("de"), None, Some("AT"), None), - (Some("sr"), Some("Cyrl"), Some("SR"), None), - (Some("nb"), None, Some("NO"), None), - (Some("fr"), None, Some("FR"), None), - (Some("mk"), None, None, None), - (Some("uk"), None, None, None), - ]; - c.bench_function("language_identifier_from_parts", move |b| { - b.iter(|| { - for (language, region, script, variants) in &entries { - let _ = LanguageIdentifier::from_parts( - language.as_ref(), - region.as_ref(), - script.as_ref(), - *variants, - ); - } - }) - }); +fn language_identifier_construct_bench(c: &mut Criterion) { + let langids: Vec = STRINGS + .iter() + .map(|s| -> LanguageIdentifier { s.parse().unwrap() }) + .collect(); - let entries2: Vec<(Option<&str>, Option<&str>, Option<&str>, Option<&[&str]>)> = vec![ - (Some("en"), None, Some("US"), None), - (Some("en"), None, Some("GB"), None), - (Some("es"), None, Some("AR"), None), - (Some("it"), None, None, None), - (Some("zh"), Some("Hans"), Some("CN"), None), - (Some("de"), None, Some("AT"), None), - (Some("pl"), None, None, None), - (Some("fr"), None, Some("FR"), None), - (Some("de"), None, Some("AT"), None), - (Some("sr"), Some("Cyrl"), Some("SR"), None), - (Some("nb"), None, Some("NO"), None), - (Some("fr"), None, Some("FR"), None), - (Some("mk"), None, None, None), - (Some("uk"), None, None, None), + let funcs = vec![ + Fun::new("from_str", |b, _| { + b.iter(|| { + for s in STRINGS { + let _: Result = s.parse(); + } + }) + }), + Fun::new("from_parts", |b, langids: &Vec| { + let entries: Vec<(Option<&str>, Option<&str>, Option<&str>, Vec<&str>)> = langids + .iter() + .map(|langid| { + let lang = Some(langid.get_language()).and_then(|s| { + if s == "und" { + None + } else { + Some(s) + } + }); + ( + lang, + langid.get_script(), + langid.get_region(), + langid.get_variants(), + ) + }) + .collect(); + b.iter(|| { + for (language, script, region, variants) in &entries { + let _ = LanguageIdentifier::from_parts(*language, *script, *region, variants); + } + }) + }), + Fun::new( + "from_parts_unchecked", + |b, langids: &Vec| { + let entries = langids + .iter() + .map(|langid| langid.clone().to_raw_parts()) + .collect::>(); + b.iter(|| { + for (language, script, region, variants) in &entries { + let _ = unsafe { + LanguageIdentifier::from_raw_parts_unchecked( + language.map(|l| TinyStr8::new_unchecked(l)), + script.map(|s| TinyStr4::new_unchecked(s)), + region.map(|r| TinyStr4::new_unchecked(r)), + variants + .into_iter() + .map(|v| TinyStr8::new_unchecked(*v)) + .collect(), + ) + }; + } + }) + }, + ), ]; - c.bench_function("language_identifier_from_parts_unchecked", move |b| { - b.iter(|| { - for (language, region, script, variants) in &entries2 { - let _ = LanguageIdentifier::from_parts_unchecked( - *language, *region, *script, *variants, - ); - } - }) - }); + + c.bench_functions("language_identifier_construct", funcs, langids); } -criterion_group!( - benches, - language_identifier_from_str_bench, - language_identifier_from_parts_bench,, -); +criterion_group!(benches, language_identifier_construct_bench,); criterion_main!(benches); diff --git a/unic-langid-impl/src/lib.rs b/unic-langid-impl/src/lib.rs index 8b37470f..0e3f424b 100644 --- a/unic-langid-impl/src/lib.rs +++ b/unic-langid-impl/src/lib.rs @@ -3,15 +3,16 @@ pub mod parser; pub mod subtags; use crate::errors::LanguageIdentifierError; -use std::borrow::Cow; use std::str::FromStr; +use tinystr::{TinyStr4, TinyStr8}; + #[derive(Default, Debug, PartialEq, Eq, Clone, Hash)] pub struct LanguageIdentifier { - language: Option>, - script: Option>, - region: Option>, - variants: Vec>, + language: Option, + script: Option, + region: Option, + variants: Box<[TinyStr8]>, } impl LanguageIdentifier { @@ -19,7 +20,7 @@ impl LanguageIdentifier { language: Option, script: Option, region: Option, - variants: Option<&[S]>, + variants: &[S], ) -> Result { let language = if let Some(subtag) = language { subtags::parse_language_subtag(subtag.as_ref())? @@ -36,38 +37,42 @@ impl LanguageIdentifier { } else { None }; - let mut variants_field = vec![]; - if let Some(variants) = variants { - for variant in variants { - variants_field.push(subtags::parse_variant_subtag(variant.as_ref())?); - } - variants_field.sort(); + let mut vars = Vec::with_capacity(variants.len()); + for variant in variants { + vars.push(subtags::parse_variant_subtag(variant.as_ref())?); } + vars.sort(); + vars.dedup(); Ok(Self { language, script, region, - variants: variants_field, + variants: vars.into_boxed_slice(), }) } - pub fn from_parts_unchecked( - language: Option<&'static str>, - script: Option<&'static str>, - region: Option<&'static str>, - variants: Option<&[&'static str]>, + pub fn to_raw_parts(self) -> (Option, Option, Option, Box<[u64]>) { + ( + self.language.map(|l| l.into()), + self.script.map(|s| s.into()), + self.region.map(|r| r.into()), + self.variants.into_iter().map(|v| (*v).into()).collect(), + ) + } + + pub const unsafe fn from_raw_parts_unchecked( + language: Option, + script: Option, + region: Option, + variants: Box<[TinyStr8]>, ) -> Self { Self { - language: language.map(|l| l.into()), - script: script.map(|s| s.into()), - region: region.map(|r| r.into()), - variants: variants.map_or(vec![], |v| { - v.iter() - .map(|v| -> Cow<'static, str> { Cow::Borrowed(v) }) - .collect() - }), + language, + script, + region, + variants, } } @@ -137,11 +142,14 @@ impl LanguageIdentifier { } pub fn set_variants(&mut self, variants: &[&str]) -> Result<(), LanguageIdentifierError> { - self.variants.clear(); + let mut result = Vec::with_capacity(variants.len()); for variant in variants { - self.variants.push(subtags::parse_variant_subtag(variant)?); + result.push(subtags::parse_variant_subtag(variant)?); } - self.variants.sort(); + result.sort(); + result.dedup(); + + self.variants = result.into_boxed_slice(); Ok(()) } } @@ -169,7 +177,7 @@ impl std::fmt::Display for LanguageIdentifier { if let Some(region) = self.get_region() { subtags.push(region); } - for variant in &self.variants { + for variant in self.variants.iter() { subtags.push(variant); } @@ -177,18 +185,18 @@ impl std::fmt::Display for LanguageIdentifier { } } -fn subtag_matches( - subtag1: &Option>, - subtag2: &Option>, +fn subtag_matches( + subtag1: &Option

, + subtag2: &Option

, as_range1: bool, as_range2: bool, ) -> bool { (as_range1 && subtag1.is_none()) || (as_range2 && subtag2.is_none()) || subtag1 == subtag2 } -fn subtags_match( - subtag1: &[Cow<'static, str>], - subtag2: &[Cow<'static, str>], +fn subtags_match( + subtag1: &[P], + subtag2: &[P], as_range1: bool, as_range2: bool, ) -> bool { diff --git a/unic-langid-impl/src/parser/mod.rs b/unic-langid-impl/src/parser/mod.rs index 2686105b..4abba40b 100644 --- a/unic-langid-impl/src/parser/mod.rs +++ b/unic-langid-impl/src/parser/mod.rs @@ -45,11 +45,12 @@ pub fn parse_language_identifier(t: &str) -> Result Result>, ParserError> { +pub fn parse_language_subtag(subtag: &str) -> Result, ParserError> { let slen = subtag.len(); - if slen < 2 || slen > 8 || slen == 4 || subtag.contains(|c: char| !c.is_ascii_alphabetic()) { + let s: TinyStr8 = subtag.parse().map_err(|_| ParserError::InvalidLanguage)?; + if slen < 2 || slen > 8 || slen == 4 || !s.is_ascii_alphanumeric() { return Err(ParserError::InvalidLanguage); } - let value = subtag.to_ascii_lowercase(); + let value = s.to_ascii_lowercase(); if value == "und" { Ok(None) } else { - Ok(Some(Cow::from(value))) + Ok(Some(value)) } } -pub fn parse_script_subtag(subtag: &str) -> Result, ParserError> { +pub fn parse_script_subtag(subtag: &str) -> Result { let slen = subtag.len(); - if slen != 4 || subtag.contains(|c: char| !c.is_ascii_alphabetic()) { + let s: TinyStr4 = subtag.parse().map_err(|_| ParserError::InvalidSubtag)?; + if slen != 4 || !s.is_ascii_alphanumeric() { return Err(ParserError::InvalidSubtag); } - let mut result = subtag.to_ascii_lowercase(); - result[0..1].make_ascii_uppercase(); - Ok(result.into()) + Ok(s.to_ascii_titlecase()) } -pub fn parse_region_subtag(subtag: &str) -> Result, ParserError> { +pub fn parse_region_subtag(subtag: &str) -> Result { let slen = subtag.len(); - if slen == 2 && !subtag.contains(|c: char| !c.is_ascii_alphabetic()) - || slen == 3 && !subtag.contains(|c: char| !c.is_ascii_digit()) - { - Ok(subtag.to_ascii_uppercase().into()) - } else { - Err(ParserError::InvalidSubtag) + match slen { + 2 => { + let s: TinyStr4 = subtag.parse().map_err(|_| ParserError::InvalidSubtag)?; + if !s.is_ascii_alphanumeric() { + return Err(ParserError::InvalidSubtag); + } + Ok(s.to_ascii_uppercase()) + } + 3 => { + if subtag.contains(|c: char| !c.is_ascii_digit()) { + return Err(ParserError::InvalidSubtag); + } + Ok(subtag.parse().unwrap()) + } + _ => Err(ParserError::InvalidSubtag), } } -pub fn parse_variant_subtag(subtag: &str) -> Result, ParserError> { +pub fn parse_variant_subtag(subtag: &str) -> Result { let slen = subtag.len(); if slen < 4 || slen > 8 { @@ -58,5 +67,7 @@ pub fn parse_variant_subtag(subtag: &str) -> Result, ParserErr return Err(ParserError::InvalidSubtag); } - Ok(subtag.to_ascii_lowercase().into()) + let s: TinyStr8 = subtag.parse().unwrap(); + + Ok(s.to_ascii_lowercase()) } diff --git a/unic-langid-impl/tests/fixtures.rs b/unic-langid-impl/tests/fixtures.rs index 03735e42..7f79dbe7 100644 --- a/unic-langid-impl/tests/fixtures.rs +++ b/unic-langid-impl/tests/fixtures.rs @@ -16,7 +16,8 @@ struct LangIdTestOutputObject { language: Option, script: Option, region: Option, - variants: Option>, + #[serde(default)] + variants: Vec, } #[derive(Serialize, Deserialize, Debug)] @@ -49,10 +50,14 @@ fn test_langid_fixtures(path: &str) { match test.output { LangIdTestOutput::Object(o) => { let expected = LanguageIdentifier::from_parts( - o.language, - o.script, - o.region, - o.variants.as_ref().map(|v| v.as_slice()), + o.language.as_ref().map(String::as_str), + o.script.as_ref().map(String::as_str), + o.region.as_ref().map(String::as_str), + o.variants + .iter() + .map(|s| s.as_str()) + .collect::>() + .as_ref(), ) .expect("Parsing failed."); assert_eq!(langid, expected); diff --git a/unic-langid-impl/tests/fixtures/parsing.json b/unic-langid-impl/tests/fixtures/parsing.json index 4e3f1556..67959837 100644 --- a/unic-langid-impl/tests/fixtures/parsing.json +++ b/unic-langid-impl/tests/fixtures/parsing.json @@ -147,5 +147,14 @@ "output": { "script": "Latn" } + }, + { + "input": { + "string": "pl-macos-Windows-nedis-macos-nedis-aRabic" + }, + "output": { + "language": "pl", + "variants": ["arabic", "macos", "nedis", "windows"] + } } ] \ No newline at end of file diff --git a/unic-langid-impl/tests/language_identifier_test.rs b/unic-langid-impl/tests/language_identifier_test.rs index 68dff97c..73467792 100644 --- a/unic-langid-impl/tests/language_identifier_test.rs +++ b/unic-langid-impl/tests/language_identifier_test.rs @@ -1,3 +1,4 @@ +use tinystr::{TinyStr4, TinyStr8}; use unic_langid_impl::parser::parse_language_identifier; use unic_langid_impl::LanguageIdentifier; @@ -54,14 +55,25 @@ fn test_sorted_variants() { assert_eq!(&langid.to_string(), "en-macos-nedis"); let langid = - LanguageIdentifier::from_parts(Some("en"), None, None, Some(&["nedis", "macos"])).unwrap(); + LanguageIdentifier::from_parts(Some("en"), None, None, &["nedis", "macos"]).unwrap(); assert_eq!(&langid.to_string(), "en-macos-nedis"); } #[test] fn test_from_parts_unchecked() { - let langid = - LanguageIdentifier::from_parts_unchecked(Some("en"), None, None, Some(&["macos", "nedis"])); + let langid: LanguageIdentifier = "en-nedis-macos".parse().unwrap(); + let (lang, script, region, variants) = langid.to_raw_parts(); + let langid = unsafe { + LanguageIdentifier::from_raw_parts_unchecked( + lang.map(|l| TinyStr8::new_unchecked(l)), + script.map(|s| TinyStr4::new_unchecked(s)), + region.map(|r| TinyStr4::new_unchecked(r)), + variants + .into_iter() + .map(|v| TinyStr8::new_unchecked(*v)) + .collect(), + ) + }; assert_eq!(&langid.to_string(), "en-macos-nedis"); } diff --git a/unic-langid-macros-impl/Cargo.toml b/unic-langid-macros-impl/Cargo.toml index 103a2aa5..77199742 100644 --- a/unic-langid-macros-impl/Cargo.toml +++ b/unic-langid-macros-impl/Cargo.toml @@ -13,7 +13,7 @@ categories = ["internationalization"] proc_macro = true [dependencies] -unic-langid-impl = "0.4" +unic-langid-impl = { path = "../unic-langid-impl" } syn = "0.15" quote = "0.6" proc-macro-hack = "0.5" diff --git a/unic-langid-macros-impl/src/lib.rs b/unic-langid-macros-impl/src/lib.rs index 178537b4..e12e2fa8 100644 --- a/unic-langid-macros-impl/src/lib.rs +++ b/unic-langid-macros-impl/src/lib.rs @@ -13,32 +13,29 @@ pub fn langid(input: TokenStream) -> TokenStream { let id = parse_macro_input!(input as LitStr); let parsed: LanguageIdentifier = id.value().parse().expect("Malformed Language Identifier"); - let lang = parsed.get_language(); - let lang = if lang.is_empty() { - quote!(None) + let (lang, script, region, variants) = parsed.to_raw_parts(); + let lang = if let Some(lang) = lang { + quote!(Some($crate::TinyStr8::new_unchecked(#lang))) } else { - quote!(Some(#lang)) + quote!(None) }; - let script = parsed.get_script(); let script = if let Some(script) = script { - quote!(Some(#script)) + quote!(Some($crate::TinyStr4::new_unchecked(#script))) } else { quote!(None) }; - let region = parsed.get_region(); let region = if let Some(region) = region { - quote!(Some(#region)) + quote!(Some($crate::TinyStr4::new_unchecked(#region))) } else { quote!(None) }; - let variants = parsed.get_variants(); - let variants = if variants.is_empty() { - quote!(None) - } else { - quote!(Some(&[#(#variants,)*])) - }; + let variants: Vec<_> = variants + .into_iter() + .map(|v| quote!($crate::TinyStr8::new_unchecked(#v))) + .collect(); + let variants = quote!(Box::new([#(#variants,)*])); TokenStream::from(quote! { - $crate::LanguageIdentifier::from_parts_unchecked(#lang, #script, #region, #variants) + unsafe { $crate::LanguageIdentifier::from_raw_parts_unchecked(#lang, #script, #region, #variants) } }) } diff --git a/unic-langid-macros/Cargo.toml b/unic-langid-macros/Cargo.toml index 3241ac17..69f40693 100644 --- a/unic-langid-macros/Cargo.toml +++ b/unic-langid-macros/Cargo.toml @@ -11,5 +11,6 @@ categories = ["internationalization"] [dependencies] proc-macro-hack = "0.5" -unic-langid-macros-impl = "0.3" -unic-langid-impl = "0.4" +unic-langid-macros-impl = { path = "../unic-langid-macros-impl" } +unic-langid-impl = { path = "../unic-langid-impl" } +tinystr = "0.1" diff --git a/unic-langid-macros/src/lib.rs b/unic-langid-macros/src/lib.rs index d8507363..61f3348e 100644 --- a/unic-langid-macros/src/lib.rs +++ b/unic-langid-macros/src/lib.rs @@ -1,4 +1,5 @@ use proc_macro_hack::proc_macro_hack; +pub use tinystr::{TinyStr4, TinyStr8}; pub use unic_langid_impl::LanguageIdentifier; /// Add one to an expression. diff --git a/unic-langid/Cargo.toml b/unic-langid/Cargo.toml index 87eec2e6..702f35f5 100644 --- a/unic-langid/Cargo.toml +++ b/unic-langid/Cargo.toml @@ -10,11 +10,10 @@ license = "MIT/Apache-2.0" categories = ["internationalization"] [dependencies] -unic-langid-impl = "0.4" -unic-langid-macros = { version = "0.3", optional = true } - +unic-langid-impl = { path = "../unic-langid-impl" } +unic-langid-macros = { path = "../unic-langid-macros", optional = true } [dev-dependencies] -unic-langid-macros = "0.3" +unic-langid-macros = { path = "../unic-langid-macros" } [features] default = [] diff --git a/unic-langid/examples/simple-langid.rs b/unic-langid/examples/simple-langid.rs index 3a184f7d..a3ef653c 100644 --- a/unic-langid/examples/simple-langid.rs +++ b/unic-langid/examples/simple-langid.rs @@ -2,6 +2,9 @@ use unic_langid::langid; use unic_langid::LanguageIdentifier; +// This will become possible when Box can be produced in a const fn +// static LANGID: LanguageIdentifier = langid!("en-US"); + fn main() { let langid: LanguageIdentifier = "en-US".parse().unwrap(); println!("{:#?}", langid);