Skip to content

Commit

Permalink
Migrate unic-langid to use TinyStr
Browse files Browse the repository at this point in the history
  • Loading branch information
zbraniecki committed Aug 10, 2019
1 parent 57d38ce commit 378718b
Show file tree
Hide file tree
Showing 14 changed files with 219 additions and 169 deletions.
3 changes: 3 additions & 0 deletions unic-langid-impl/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ repository = "https://github.com/zbraniecki/unic-locale"
license = "MIT/Apache-2.0"
categories = ["internationalization"]

[dependencies]
tinystr = "0.1"

[dev-dependencies]
criterion = "0.2"
serde = { version = "1.0", features = ["derive"] }
Expand Down
166 changes: 83 additions & 83 deletions unic-langid-impl/benches/langid.rs
Original file line number Diff line number Diff line change
@@ -1,95 +1,95 @@
use criterion::criterion_group;
use criterion::criterion_main;
use criterion::Criterion;
use criterion::Fun;

use tinystr::{TinyStr4, TinyStr8};
use unic_langid_impl::LanguageIdentifier;

/// Benchmarks parsing language-tag strings into `LanguageIdentifier`.
///
/// Registers a single benchmark named "language_identifier_from_str" on `c`.
/// Each measured iteration parses all fourteen sample tags below, in order.
fn language_identifier_from_str_bench(c: &mut Criterion) {
// Sample inputs; "de-AT" and "fr-FR" each appear twice.
let strings = &[
"en-US",
"en-GB",
"es-AR",
"it",
"zh-Hans-CN",
"de-AT",
"pl",
"fr-FR",
"de-AT",
"sr-Cyrl-SR",
"nb-NO",
"fr-FR",
"mk",
"uk",
];
c.bench_function("language_identifier_from_str", move |b| {
b.iter(|| {
for s in strings {
// The parse result (Ok or Err) is discarded; nothing is asserted about it.
let _: Result<LanguageIdentifier, _> = s.parse();
}
})
});
}
/// Language-tag strings used as input by the benchmarks in this file.
///
/// Fourteen entries; "de-AT" and "fr-FR" each appear twice.
static STRINGS: &[&str] = &[
"en-US",
"en-GB",
"es-AR",
"it",
"zh-Hans-CN",
"de-AT",
"pl",
"fr-FR",
"de-AT",
"sr-Cyrl-SR",
"nb-NO",
"fr-FR",
"mk",
"uk",
];

fn language_identifier_from_parts_bench(c: &mut Criterion) {
let entries: Vec<(Option<&str>, Option<&str>, Option<&str>, Option<&[&&str]>)> = vec![
(Some("en"), None, Some("US"), None),
(Some("en"), None, Some("GB"), None),
(Some("es"), None, Some("AR"), None),
(Some("it"), None, None, None),
(Some("zh"), Some("Hans"), Some("CN"), None),
(Some("de"), None, Some("AT"), None),
(Some("pl"), None, None, None),
(Some("fr"), None, Some("FR"), None),
(Some("de"), None, Some("AT"), None),
(Some("sr"), Some("Cyrl"), Some("SR"), None),
(Some("nb"), None, Some("NO"), None),
(Some("fr"), None, Some("FR"), None),
(Some("mk"), None, None, None),
(Some("uk"), None, None, None),
];
c.bench_function("language_identifier_from_parts", move |b| {
b.iter(|| {
for (language, region, script, variants) in &entries {
let _ = LanguageIdentifier::from_parts(
language.as_ref(),
region.as_ref(),
script.as_ref(),
*variants,
);
}
})
});
fn language_identifier_construct_bench(c: &mut Criterion) {
let langids: Vec<LanguageIdentifier> = STRINGS
.iter()
.map(|s| -> LanguageIdentifier { s.parse().unwrap() })
.collect();

let entries2: Vec<(Option<&str>, Option<&str>, Option<&str>, Option<&[&str]>)> = vec![
(Some("en"), None, Some("US"), None),
(Some("en"), None, Some("GB"), None),
(Some("es"), None, Some("AR"), None),
(Some("it"), None, None, None),
(Some("zh"), Some("Hans"), Some("CN"), None),
(Some("de"), None, Some("AT"), None),
(Some("pl"), None, None, None),
(Some("fr"), None, Some("FR"), None),
(Some("de"), None, Some("AT"), None),
(Some("sr"), Some("Cyrl"), Some("SR"), None),
(Some("nb"), None, Some("NO"), None),
(Some("fr"), None, Some("FR"), None),
(Some("mk"), None, None, None),
(Some("uk"), None, None, None),
let funcs = vec![
Fun::new("from_str", |b, _| {
b.iter(|| {
for s in STRINGS {
let _: Result<LanguageIdentifier, _> = s.parse();
}
})
}),
Fun::new("from_parts", |b, langids: &Vec<LanguageIdentifier>| {
let entries: Vec<(Option<&str>, Option<&str>, Option<&str>, Vec<&str>)> = langids
.iter()
.map(|langid| {
let lang = Some(langid.get_language()).and_then(|s| {
if s == "und" {
None
} else {
Some(s)
}
});
(
lang,
langid.get_script(),
langid.get_region(),
langid.get_variants(),
)
})
.collect();
b.iter(|| {
for (language, script, region, variants) in &entries {
let _ = LanguageIdentifier::from_parts(*language, *script, *region, variants);
}
})
}),
Fun::new(
"from_parts_unchecked",
|b, langids: &Vec<LanguageIdentifier>| {
let entries = langids
.iter()
.map(|langid| langid.clone().to_raw_parts())
.collect::<Vec<_>>();
b.iter(|| {
for (language, script, region, variants) in &entries {
let _ = unsafe {
LanguageIdentifier::from_raw_parts_unchecked(
language.map(|l| TinyStr8::new_unchecked(l)),
script.map(|s| TinyStr4::new_unchecked(s)),
region.map(|r| TinyStr4::new_unchecked(r)),
variants
.into_iter()
.map(|v| TinyStr8::new_unchecked(*v))
.collect(),
)
};
}
})
},
),
];
c.bench_function("language_identifier_from_parts_unchecked", move |b| {
b.iter(|| {
for (language, region, script, variants) in &entries2 {
let _ = LanguageIdentifier::from_parts_unchecked(
*language, *region, *script, *variants,
);
}
})
});

c.bench_functions("language_identifier_construct", funcs, langids);
}

criterion_group!(
benches,
language_identifier_from_str_bench,
language_identifier_from_parts_bench,,
);
// Put the construct benchmark into the `benches` group, then hand that group to
// `criterion_main!`.
criterion_group!(benches, language_identifier_construct_bench,);
criterion_main!(benches);
80 changes: 44 additions & 36 deletions unic-langid-impl/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,24 @@ pub mod parser;
pub mod subtags;

use crate::errors::LanguageIdentifierError;
use std::borrow::Cow;
use std::str::FromStr;

use tinystr::{TinyStr4, TinyStr8};

#[derive(Default, Debug, PartialEq, Eq, Clone, Hash)]
pub struct LanguageIdentifier {
language: Option<Cow<'static, str>>,
script: Option<Cow<'static, str>>,
region: Option<Cow<'static, str>>,
variants: Vec<Cow<'static, str>>,
language: Option<TinyStr8>,
script: Option<TinyStr4>,
region: Option<TinyStr4>,
variants: Box<[TinyStr8]>,
}

impl LanguageIdentifier {
pub fn from_parts<S: AsRef<str>>(
language: Option<S>,
script: Option<S>,
region: Option<S>,
variants: Option<&[S]>,
variants: &[S],
) -> Result<Self, LanguageIdentifierError> {
let language = if let Some(subtag) = language {
subtags::parse_language_subtag(subtag.as_ref())?
Expand All @@ -36,38 +37,42 @@ impl LanguageIdentifier {
} else {
None
};
let mut variants_field = vec![];

if let Some(variants) = variants {
for variant in variants {
variants_field.push(subtags::parse_variant_subtag(variant.as_ref())?);
}
variants_field.sort();
let mut vars = Vec::with_capacity(variants.len());
for variant in variants {
vars.push(subtags::parse_variant_subtag(variant.as_ref())?);
}
vars.sort();
vars.dedup();

Ok(Self {
language,
script,
region,
variants: variants_field,
variants: vars.into_boxed_slice(),
})
}

pub fn from_parts_unchecked(
language: Option<&'static str>,
script: Option<&'static str>,
region: Option<&'static str>,
variants: Option<&[&'static str]>,
/// Consumes the identifier and returns its subtags as raw integers.
///
/// The tuple is `(language, script, region, variants)`. Each stored subtag is
/// converted with `Into` to its integer form: `u64` for the language and for
/// every variant, `u32` for script and region. Subtags that are `None` stay
/// `None`, and variants are emitted in their stored order.
pub fn to_raw_parts(self) -> (Option<u64>, Option<u32>, Option<u32>, Box<[u64]>) {
    (
        self.language.map(|l| l.into()),
        self.script.map(|s| s.into()),
        self.region.map(|r| r.into()),
        // Use `iter()` rather than `into_iter()`: method-call `into_iter()` on a
        // `Box<[T]>` yields references in editions before 2024 but owned values from
        // 2024 on, which would make `*v` fail to compile. Borrowing explicitly is
        // unambiguous in every edition.
        self.variants.iter().map(|v| (*v).into()).collect(),
    )
}

pub const unsafe fn from_raw_parts_unchecked(
language: Option<TinyStr8>,
script: Option<TinyStr4>,
region: Option<TinyStr4>,
variants: Box<[TinyStr8]>,
) -> Self {
Self {
language: language.map(|l| l.into()),
script: script.map(|s| s.into()),
region: region.map(|r| r.into()),
variants: variants.map_or(vec![], |v| {
v.iter()
.map(|v| -> Cow<'static, str> { Cow::Borrowed(v) })
.collect()
}),
language,
script,
region,
variants,
}
}

Expand Down Expand Up @@ -137,11 +142,14 @@ impl LanguageIdentifier {
}

pub fn set_variants(&mut self, variants: &[&str]) -> Result<(), LanguageIdentifierError> {
self.variants.clear();
let mut result = Vec::with_capacity(variants.len());
for variant in variants {
self.variants.push(subtags::parse_variant_subtag(variant)?);
result.push(subtags::parse_variant_subtag(variant)?);
}
self.variants.sort();
result.sort();
result.dedup();

self.variants = result.into_boxed_slice();
Ok(())
}
}
Expand Down Expand Up @@ -169,26 +177,26 @@ impl std::fmt::Display for LanguageIdentifier {
if let Some(region) = self.get_region() {
subtags.push(region);
}
for variant in &self.variants {
for variant in self.variants.iter() {
subtags.push(variant);
}

f.write_str(&subtags.join("-"))
}
}

fn subtag_matches(
subtag1: &Option<Cow<'static, str>>,
subtag2: &Option<Cow<'static, str>>,
/// Decides whether two optional subtags match.
///
/// A side whose `as_range*` flag is set matches anything when its subtag is
/// `None`. Otherwise the two options must compare equal (two `None`s are equal).
fn subtag_matches<P: PartialEq>(
    subtag1: &Option<P>,
    subtag2: &Option<P>,
    as_range1: bool,
    as_range2: bool,
) -> bool {
    // An absent subtag on a range-enabled side acts as a wildcard.
    if as_range1 && subtag1.is_none() {
        return true;
    }
    if as_range2 && subtag2.is_none() {
        return true;
    }
    // No wildcard applies: fall back to plain equality.
    subtag1 == subtag2
}

fn subtags_match(
subtag1: &[Cow<'static, str>],
subtag2: &[Cow<'static, str>],
fn subtags_match<P: PartialEq>(
subtag1: &[P],
subtag2: &[P],
as_range1: bool,
as_range2: bool,
) -> bool {
Expand Down
3 changes: 2 additions & 1 deletion unic-langid-impl/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,12 @@ pub fn parse_language_identifier(t: &str) -> Result<LanguageIdentifier, ParserEr
}

variants.sort();
variants.dedup();

Ok(LanguageIdentifier {
language,
script,
region,
variants,
variants: variants.into_boxed_slice(),
})
}
Loading

0 comments on commit 378718b

Please sign in to comment.