Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian committed Aug 20, 2024
1 parent a3f41f6 commit c13d18a
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 20 deletions.
3 changes: 1 addition & 2 deletions components/collator/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,7 @@ fn data_ce_to_primary(data_ce: u64, c: char) -> u32 {
#[icu_provider::data_struct(marker(
CollationDataV1Marker,
"collator/data@1",
// TODO(#3867): Use script fallback
fallback_by = "language",
fallback_by = "script",
attributes_domain = "collator",
))]
#[derive(Debug, PartialEq, Clone)]
Expand Down
32 changes: 22 additions & 10 deletions components/experimental/src/transliterate/compile/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use alloc::vec::Vec;
use core::cell::RefCell;
use icu_locale_core::Locale;
use icu_normalizer::provider::*;
use icu_locale::provider::*;
use icu_properties::{provider::*, sets};
use icu_provider::prelude::*;

Expand Down Expand Up @@ -126,24 +127,26 @@ impl RuleCollection {
// Builds a `RuleCollectionProvider` over this collection using compiled data:
// the `Baked` providers of `icu_properties`, `icu_normalizer` and `icu_locale`
// are stored by reference next to `self`.
// NOTE(review): this listing is a diff view without +/- markers; the first
// return-type line (two `Baked` parameters) appears to be the pre-commit
// version of the line that follows it.
#[cfg(feature = "compiled_data")]
pub fn as_provider(
&self,
) -> RuleCollectionProvider<'_, icu_properties::provider::Baked, icu_normalizer::provider::Baked>
) -> RuleCollectionProvider<'_, icu_properties::provider::Baked, icu_normalizer::provider::Baked, icu_locale::provider::Baked>
{
RuleCollectionProvider {
collection: self,
properties_provider: &icu_properties::provider::Baked,
normalizer_provider: &icu_normalizer::provider::Baked,
// Target of the impls generated by `redirect!(locale_provider, …)`.
locale_provider: &icu_locale::provider::Baked,
// The three code point sets are taken from the compiled-data constructors in
// `sets` and converted with `static_to_owned()` so the struct can hold them.
xid_start: sets::xid_start().static_to_owned(),
xid_continue: sets::xid_continue().static_to_owned(),
pat_ws: sets::pattern_white_space().static_to_owned(),
}
}

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::as_provider)]
pub fn as_provider_unstable<'a, PP, NP>(
pub fn as_provider_unstable<'a, PP, NP, LP>(
&'a self,
properties_provider: &'a PP,
normalizer_provider: &'a NP,
) -> Result<RuleCollectionProvider<'a, PP, NP>, DataError>
locale_provider: &'a LP,
) -> Result<RuleCollectionProvider<'a, PP, NP, LP>, DataError>
where
PP: ?Sized
+ DataProvider<AsciiHexDigitV1Marker>
Expand Down Expand Up @@ -218,6 +221,7 @@ impl RuleCollection {
collection: self,
properties_provider,
normalizer_provider,
locale_provider,
xid_start: sets::load_xid_start(properties_provider)?,
xid_continue: sets::load_xid_continue(properties_provider)?,
pat_ws: sets::load_pattern_white_space(properties_provider)?,
Expand All @@ -227,16 +231,17 @@ impl RuleCollection {

/// A provider that is usable by [`Transliterator::try_new_unstable`](crate::Transliterator::try_new_unstable).
///
/// It implements `DataProvider<TransliteratorRulesV1Marker>` itself and forwards
/// other markers to the wrapped providers through the `redirect!` macro.
// NOTE(review): this listing is a diff view without +/- markers; the struct header
// with three generic parameters appears to be the pre-commit version of the
// four-parameter header that follows it.
#[derive(Debug)]
pub struct RuleCollectionProvider<'a, PP: ?Sized, NP: ?Sized> {
pub struct RuleCollectionProvider<'a, PP: ?Sized, NP: ?Sized, LP: ?Sized> {
// The rule collection this provider was created from.
collection: &'a RuleCollection,
// Provider the property sets below are loaded from in `as_provider_unstable`.
properties_provider: &'a PP,
// Provider that `redirect!(normalizer_provider, …)` impls forward to.
normalizer_provider: &'a NP,
// Provider that `redirect!(locale_provider, …)` impls forward to.
locale_provider: &'a LP,
// Code point sets obtained once when the provider is constructed.
xid_start: sets::CodePointSetData,
xid_continue: sets::CodePointSetData,
pat_ws: sets::CodePointSetData,
}

impl<PP, NP> DataProvider<TransliteratorRulesV1Marker> for RuleCollectionProvider<'_, PP, NP>
impl<PP, NP, LP> DataProvider<TransliteratorRulesV1Marker> for RuleCollectionProvider<'_, PP, NP, LP>
where
PP: ?Sized
+ DataProvider<AsciiHexDigitV1Marker>
Expand Down Expand Up @@ -376,28 +381,35 @@ where
}

// Generates one `DataProvider<$marker>` impl per listed marker for
// `RuleCollectionProvider`; each generated `load` simply forwards the request
// to the wrapped provider stored in the field named by `$field`.
// NOTE(review): this listing is a diff view without +/- markers; the matcher
// without `$field`, the two-parameter impl header and the hard-coded
// `self.normalizer_provider.load(req)` appear to be the pre-commit versions of
// the lines that follow them.
macro_rules! redirect {
($($marker:ty),*) => {
($field:ident, $($marker:ty),*) => {
$(
impl<PP: ?Sized, NP: ?Sized + DataProvider<$marker>> DataProvider<$marker> for RuleCollectionProvider<'_, PP, NP> {
// NOTE(review): both `NP` and `LP` are bounded by `DataProvider<$marker>` here,
// although only `self.$field` is called — confirm that requiring the
// non-selected provider to support the marker is intended.
impl<PP: ?Sized, NP: ?Sized + DataProvider<$marker>, LP: ?Sized + DataProvider<$marker>> DataProvider<$marker> for RuleCollectionProvider<'_, PP, NP, LP> {
fn load(&self, req: DataRequest) -> Result<DataResponse<$marker>, DataError> {
self.normalizer_provider.load(req)
self.$field.load(req)
}
}
)*
}
}

// Normalizer data: requests for these markers are forwarded to `normalizer_provider`.
redirect!(
normalizer_provider,
CanonicalDecompositionDataV1Marker,
CompatibilityDecompositionSupplementV1Marker,
CanonicalDecompositionTablesV1Marker,
CompatibilityDecompositionTablesV1Marker,
CanonicalCompositionsV1Marker
);

// Locale data: requests for these markers are forwarded to `locale_provider`.
// The list mirrors the locale bounds on `Transliterator::try_new_unstable`
// (`ParentsV1Marker` + `LikelySubtagsForLanguageV1Marker`), which are what the
// `LocaleFallbacker` built there is loaded from. Forwarding
// `LikelySubtagsExtendedV1Marker` instead would leave `RuleCollectionProvider`
// unable to satisfy those bounds.
redirect!(
    locale_provider,
    ParentsV1Marker,
    LikelySubtagsForLanguageV1Marker
);

#[cfg(feature = "datagen")]
impl<PP, NP> IterableDataProvider<TransliteratorRulesV1Marker>
for RuleCollectionProvider<'_, PP, NP>
impl<PP, NP, LP> IterableDataProvider<TransliteratorRulesV1Marker>
for RuleCollectionProvider<'_, PP, NP, LP>
where
PP: ?Sized
+ DataProvider<AsciiHexDigitV1Marker>
Expand Down
96 changes: 88 additions & 8 deletions components/experimental/src/transliterate/transliterator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ use core::fmt::Debug;
use core::ops::Range;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_locale::fallback::{LocaleFallbackConfig, LocaleFallbackPriority, LocaleFallbacker};
use icu_locale::provider::*;
use icu_locale_core::Locale;
use icu_normalizer::provider::*;
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
Expand Down Expand Up @@ -180,13 +182,16 @@ impl Transliterator {
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
+ DataProvider<CanonicalCompositionsV1Marker>
+ DataProvider<ParentsV1Marker>
+ DataProvider<LikelySubtagsForLanguageV1Marker>
+ ?Sized,
{
Self::internal_try_new_with_override_unstable(
locale,
None::<&fn(&Locale) -> Option<Box<dyn CustomTransliterator>>>,
provider,
provider,
|| LocaleFallbacker::try_new_unstable(provider),
)
}

Expand Down Expand Up @@ -236,17 +241,26 @@ impl Transliterator {
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
+ DataProvider<CanonicalCompositionsV1Marker>
+ DataProvider<ParentsV1Marker>
+ DataProvider<LikelySubtagsForLanguageV1Marker>
+ ?Sized,
F: Fn(&Locale) -> Option<Box<dyn CustomTransliterator>>,
{
Self::internal_try_new_with_override_unstable(locale, Some(&lookup), provider, provider)
Self::internal_try_new_with_override_unstable(
locale,
Some(&lookup),
provider,
provider,
|| LocaleFallbacker::try_new_unstable(provider),
)
}

fn internal_try_new_with_override_unstable<PN, PT, F>(
locale: Locale,
lookup: Option<&F>,
transliterator_provider: &PT,
normalizer_provider: &PN,
fallbacker: impl Fn() -> Result<LocaleFallbacker, DataError>,
) -> Result<Transliterator, DataError>
where
PT: DataProvider<TransliteratorRulesV1Marker> + ?Sized,
Expand All @@ -258,19 +272,71 @@ impl Transliterator {
+ ?Sized,
F: Fn(&Locale) -> Option<Box<dyn CustomTransliterator>>,
{
let payload = Transliterator::load_rbt(
// TODO(#3950): How is fallback handled with special parts?

// first try loading of locale
let transliterator = if let Ok(transliterator) = Self::load_rbt(
#[allow(clippy::unwrap_used)] // infallible
DataMarkerAttributes::try_from_str(&locale.to_string().to_ascii_lowercase()).unwrap(),
transliterator_provider,
)?;
let rbt = payload.get();
) {
transliterator
} else {
let fallbacker = fallbacker()?;
let mut fallback_config = LocaleFallbackConfig::default();
fallback_config.priority = LocaleFallbackPriority::Script;
let fallbacker = fallbacker.for_config(fallback_config);

let mut transform_extensions = locale.extensions.transform;
let source_id = transform_extensions.lang.take().unwrap_or_default();
let target_id = locale.id;

let mut source_iterator = fallbacker.fallback_for(source_id.into());
let mut target_iterator = fallbacker.fallback_for(target_id.into());

'target: loop {
if target_iterator.get().is_default() {
Err(DataErrorKind::IdentifierNotFound
.with_marker(TransliteratorRulesV1Marker::INFO))?;
}
'source: loop {
if source_iterator.get().is_default() {
break 'source;
}
let mut candidate = target_iterator.get().clone().into_locale();
candidate.extensions.transform = transform_extensions.clone();
candidate.extensions.transform.lang = Some(icu_locale_core::LanguageIdentifier {
language: source_iterator.get().language,
script: source_iterator.get().script,
region: source_iterator.get().region,
variants: source_iterator
.get()
.variant
.map(icu_locale_core::subtags::Variants::from_variant)
.unwrap_or_default(),
});
if let Ok(t) = Self::load_rbt(
#[allow(clippy::unwrap_used)] // infallible
DataMarkerAttributes::try_from_str(
&candidate.to_string().to_ascii_lowercase(),
)
.unwrap(),
transliterator_provider,
) {
break 'target t;
}
source_iterator.step();
}
target_iterator.step();
}
};
let rbt = transliterator.get();

if !rbt.visibility {
// transliterator is internal
return Err(DataError::custom("internal only transliterator"));
}
let mut env = LiteMap::new();
// Avoid recursive load
env.insert(locale.to_string(), InternalTransliterator::Null);
Transliterator::load_dependencies_recursive(
rbt,
&mut env,
Expand All @@ -279,7 +345,7 @@ impl Transliterator {
normalizer_provider,
)?;
Ok(Transliterator {
transliterator: payload,
transliterator,
env,
})
}
Expand Down Expand Up @@ -406,9 +472,11 @@ impl Transliterator {
where
P: DataProvider<TransliteratorRulesV1Marker> + ?Sized,
{
let mut metadata = DataRequestMetadata::default();
metadata.silent = true;
let req = DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes(marker_attributes),
..Default::default()
metadata,
};
let payload = provider.load(req)?.payload;
let rbt = payload.get();
Expand Down Expand Up @@ -1334,6 +1402,18 @@ mod tests {
assert_eq!(t.transliterate(input.to_string()), output);
}

#[test]
fn test_de_ascii_fallback() {
    // No transliterator is registered for source `fr-CH`; the one that exists has
    // source `und-Latn`. Requesting `fr-CH` must therefore walk the source fallback
    // chain until it arrives at `und-Latn` and pick up that transliterator.
    let locale = "de-t-fr-ch-d0-ascii".parse().unwrap();
    let transliterator = Transliterator::try_new_unstable(locale, &TestingProvider).unwrap();

    let source =
        "Über ältere Lügner lästern ist sehr a\u{0308}rgerlich. Ja, SEHR ÄRGERLICH! - ꜵ";
    let expected =
        "Ueber aeltere Luegner laestern ist sehr aergerlich. Ja, SEHR AERGERLICH! - ao";

    let actual = transliterator.transliterate(source.to_string());
    assert_eq!(actual, expected);
}

#[test]
fn test_override() {
#[derive(Debug)]
Expand Down
4 changes: 4 additions & 0 deletions components/experimental/tests/transliterate/data/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ struct TestingProvider;

const _: () = {
use icu_normalizer_data::*;
use icu_locale_data::*;
mod icu {
pub(super) use super::icu_experimental as experimental;
pub(super) use icu_normalizer as normalizer;
pub(super) use icu_collections as collections;
pub(super) use icu_locale as locale;
}
self::make_provider!(TestingProvider);
impl_canonical_compositions_v1_marker!(TestingProvider);
Expand All @@ -17,5 +19,7 @@ const _: () = {
impl_compatibility_decomposition_supplement_v1_marker!(TestingProvider);
impl_compatibility_decomposition_tables_v1_marker!(TestingProvider);
impl_uts46_decomposition_supplement_v1_marker!(TestingProvider);
impl_parents_v1_marker!(TestingProvider);
impl_likely_subtags_for_language_v1_marker!(TestingProvider);
impl_transliterator_rules_v1!(TestingProvider);
};

0 comments on commit c13d18a

Please sign in to comment.