-
Notifications
You must be signed in to change notification settings - Fork 174
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add TimeZoneIdMapper to icu_timezone #4774
Merged
Merged
Changes from 8 commits
Commits
Show all changes
35 commits
Select commit
Hold shift + click to select a range
1532c70
Add baked data for IanaToBcp47MapV2Marker
sffc f4845a9
Add new TimeZoneIdMapper type
sffc 4f3bbb4
Add test in datagen
sffc 1dfdc26
fmt
sffc 4ea5be8
features
sffc 60316b3
Merge branch 'main' into iana-canon-2
sffc 90824c9
Update ids.rs
sffc 107173c
Update ids.rs
sffc 4b2516a
Reduce allocations and DRY
sffc bf11327
Update utils/zerotrie/src/cursor.rs
sffc 8886642
Update components/timezone/src/ids.rs
sffc b9bc044
Improve examples; rename function
sffc 87f339c
impl Deref for TimeZoneBcp47Id
sffc 66355bb
Safety
sffc 3da0592
Apply bf1132716ebf349f8bfad3c75d0cd2619caa7961 to call site
sffc 8fab067
Return NormalizedIana
sffc 28ee49b
Add TimeZoneIdMapperWithFastCanonicalization
sffc 2d879c3
Deprecate and add FFI
sffc 8629048
FFI fixes
sffc 4310f68
Update public docs and things
sffc 7a59526
features
sffc 8985f9f
clippy, fmt
sffc 69f337d
Merge branch 'main' into iana-canon-2
sffc 92f4e7e
Add constructor Diplomat attr
sffc fe60770
impl Default
sffc c817256
Document Normalization vs Canonicalization
sffc 4692100
Line length
sffc 2647dcd
rm NormalizedIana
sffc 4e9d5ee
fmt
sffc 0a5c746
Merge branch 'main' into iana-canon-2
sffc 33015b8
Clippy
sffc 50f5b1e
Review feedback
sffc 63d5b8b
fmt
sffc ef3f8b0
Merge branch 'main' into iana-canon-2
sffc c74ce44
datagen
sffc File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,371 @@ | ||
// This file is part of ICU4X. For terms of use, please see the file | ||
// called LICENSE at the top level of the ICU4X source tree | ||
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
|
||
use alloc::borrow::Cow; | ||
use alloc::string::String; | ||
use alloc::vec::Vec; | ||
use icu_provider::prelude::*; | ||
use zerotrie::cursor::ZeroAsciiIgnoreCaseTrieCursor; | ||
|
||
use crate::{ | ||
provider::names::{IanaToBcp47MapV2, IanaToBcp47MapV2Marker}, | ||
TimeZoneBcp47Id, | ||
}; | ||
|
||
/// A mapper between IANA time zone identifiers and BCP-47 time zone identifiers. | ||
/// | ||
/// This mapper supports two-way mapping, but it is optimized for the case of IANA to BCP-47. | ||
/// It also supports normalizing and canonicalizing the IANA strings. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use icu::timezone::TimeZoneIdMapper; | ||
/// | ||
/// let mapper = TimeZoneIdMapper::new(); | ||
/// let mapper = mapper.as_borrowed(); | ||
/// | ||
/// // The IANA zone "Australia/Melbourne" is the BCP-47 zone "aumel": | ||
/// assert_eq!( | ||
/// mapper.iana_to_bcp47("Australia/Melbourne"), | ||
/// Some("aumel".parse().unwrap()) | ||
/// ); | ||
/// | ||
/// // Lookup is ASCII-case insensitive: | ||
sffc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// assert_eq!( | ||
/// mapper.iana_to_bcp47("australia/melbourne"), | ||
/// Some("aumel".parse().unwrap()) | ||
/// ); | ||
/// | ||
/// // The IANA zone "Australia/Victoria" is an alias: | ||
/// assert_eq!( | ||
/// mapper.iana_to_bcp47("Australia/Victoria"), | ||
/// Some("aumel".parse().unwrap()) | ||
/// ); | ||
/// | ||
/// // We can recover the canonical name from the mapper: | ||
/// assert_eq!( | ||
/// mapper | ||
/// .canonicalize_iana("Australia/Victoria") | ||
/// .unwrap() | ||
/// .string, | ||
/// "Australia/Melbourne" | ||
/// ); | ||
/// ``` | ||
#[derive(Debug, Clone)] | |
pub struct TimeZoneIdMapper { | |
// Data payload for `IanaToBcp47MapV2Marker`; `as_borrowed()` hands out a | |
// reference to its contents for querying. | |
data: DataPayload<IanaToBcp47MapV2Marker>, | |
} | |
|
||
impl TimeZoneIdMapper { | ||
/// Builds a [`TimeZoneIdMapper`] backed by the data compiled into this crate. | |
/// | |
/// The type-level documentation of [`TimeZoneIdMapper`] shows an example. | |
/// | |
/// ✨ *Enabled with the `compiled_data` Cargo feature.* | |
/// | |
/// [📚 Help choosing a constructor](icu_provider::constructors) | |
#[cfg(feature = "compiled_data")] | |
pub fn new() -> Self { | |
    // Wrap the baked static singleton in a payload. | |
    let data = DataPayload::from_static_ref( | |
        crate::provider::Baked::SINGLETON_TIME_ZONE_IANA_TO_BCP47_V2, | |
    ); | |
    Self { data } | |
} | |
|
||
// Invokes the `icu_provider` constructor-generation macro for the names listed | |
// in `functions: [...]`. | |
// NOTE(review): presumably this emits `try_new_with_any_provider` and | |
// `try_new_with_buffer_provider` as wrappers around `try_new_unstable`, and the | |
// `#[cfg(skip)]` attribute keeps the macro from emitting its own `new` (which is | |
// hand-written in this impl) — confirm against the macro's documentation. | |
icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: DataError, | |
#[cfg(skip)] | |
functions: [ | |
new, | |
try_new_with_any_provider, | |
try_new_with_buffer_provider, | |
try_new_unstable, | |
Self, | |
] | |
); | |
|
||
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] | |
pub fn try_new_unstable<P>(provider: &P) -> Result<Self, DataError> | |
where | |
    P: DataProvider<IanaToBcp47MapV2Marker> + ?Sized, | |
{ | |
    // Load with default request parameters, then move the payload out of the | |
    // response; an error from either step is handed back to the caller. | |
    let response = provider.load(Default::default())?; | |
    response.take_payload().map(|data| Self { data }) | |
} | |
|
||
/// Returns a borrowed version of the mapper that can be queried. | ||
/// | ||
/// This avoids a small potential cost of reading the data pointer. | ||
pub fn as_borrowed(&self) -> TimeZoneIdMapperBorrowed { | ||
TimeZoneIdMapperBorrowed { | ||
data: self.data.get(), | ||
} | ||
} | ||
} | ||
|
||
/// A borrowed wrapper around the time zone ID mapper, returned by | |
/// [`TimeZoneIdMapper::as_borrowed()`]. More efficient to query. | |
/// | |
/// The type is `Copy`, and the mapping queries are implemented on it. | |
#[derive(Debug, Copy, Clone)] | |
pub struct TimeZoneIdMapperBorrowed<'a> { | |
// Reference to the mapping data (the `map` trie and the `bcp47_ids` list) | |
// obtained from the parent `TimeZoneIdMapper`'s payload. | |
data: &'a IanaToBcp47MapV2<'a>, | |
} | |
|
||
impl<'a> TimeZoneIdMapperBorrowed<'a> { | ||
/// Gets the BCP-47 time zone ID from an IANA time zone ID | ||
/// with a case-insensitive lookup. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use icu_timezone::TimeZoneBcp47Id; | ||
/// use icu_timezone::TimeZoneIdMapper; | ||
/// use tinystr::tinystr; | ||
/// | ||
/// let mapper = TimeZoneIdMapper::new(); | ||
/// let mapper = mapper.as_borrowed(); | ||
/// | ||
/// assert_eq!( | ||
/// mapper.iana_to_bcp47("AMERICA/chicago"), | ||
/// Some(TimeZoneBcp47Id(tinystr!(8, "uschi"))) | ||
/// ); | ||
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// ``` | ||
pub fn iana_to_bcp47(&self, iana_id: &str) -> Option<TimeZoneBcp47Id> { | ||
self.iana_lookup_quick(iana_id) | ||
.and_then(|trie_value| self.data.bcp47_ids.get(trie_value.index())) | ||
} | ||
|
||
/// Normalizes the syntax of an IANA time zone ID. | ||
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// | ||
/// Also returns the BCP-47 time zone ID. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use icu_timezone::TimeZoneBcp47Id; | ||
/// use icu_timezone::TimeZoneIdMapper; | ||
/// use tinystr::tinystr; | ||
/// | ||
/// let mapper = TimeZoneIdMapper::new(); | ||
/// let mapper = mapper.as_borrowed(); | ||
/// | ||
/// let normalized = mapper.normalize_iana("AMERICA/chicago").unwrap(); | ||
/// | ||
/// assert_eq!(normalized.string, "America/Chicago"); | ||
/// assert_eq!(normalized.bcp47_id, TimeZoneBcp47Id(tinystr!(8, "uschi"))); | ||
/// ``` | ||
pub fn normalize_iana<'s>(&self, iana_id: &'s str) -> Option<NormalizedIana<'s>> { | ||
let Some((trie_value, string)) = self.iana_lookup_with_normalization(iana_id, |_| {}) | ||
else { | ||
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return None; | ||
}; | ||
let Some(bcp47_id) = self.data.bcp47_ids.get(trie_value.index()) else { | ||
debug_assert!(false, "index should be in range"); | ||
return None; | ||
}; | ||
Some(NormalizedIana { string, bcp47_id }) | ||
} | ||
|
||
/// Returns the canonical, normalized name of the given IANA time zone. | ||
/// | ||
/// Also returns the BCP-47 time zone ID. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use icu_timezone::TimeZoneBcp47Id; | ||
/// use icu_timezone::TimeZoneIdMapper; | ||
/// use tinystr::tinystr; | ||
/// | ||
/// let mapper = TimeZoneIdMapper::new(); | ||
/// let mapper = mapper.as_borrowed(); | ||
/// | ||
/// let canonicalized = mapper.canonicalize_iana("Asia/Calcutta").unwrap(); | ||
/// | ||
/// assert_eq!(canonicalized.string, "Asia/Kolkata"); | ||
/// assert_eq!( | ||
/// canonicalized.bcp47_id, | ||
/// TimeZoneBcp47Id(tinystr!(8, "inccu")) | ||
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// ); | ||
/// ``` | ||
pub fn canonicalize_iana<'s>(&self, iana_id: &'s str) -> Option<NormalizedIana<'s>> { | |
// Note: We collect the cursors into a stack so that we start probing | |
// nearby the input IANA name. This should improve lookup time since | |
// most renames share the same prefix like "Asia" or "Europe". | |
// | |
// Each stack entry is `(cursor, index, suffix_len)`: | |
// - `cursor`: a trie cursor positioned at some node, | |
// - `index`: the next child of that node to probe, | |
// - `suffix_len`: how many trailing bytes to pop from `string` once that | |
//   node has no child left at `index`. | |
let mut stack = Vec::with_capacity(iana_id.len()); | |
let Some((trie_value, mut string)) = | |
self.iana_lookup_with_normalization(iana_id, |cursor| { | |
// Record every cursor the lookup passes through, starting at child 0. | |
stack.push((cursor.clone(), 0, 1)); | |
}) | |
else { | |
return None; | |
}; | |
let Some(bcp47_id) = self.data.bcp47_ids.get(trie_value.index()) else { | |
debug_assert!(false, "index should be in range"); | |
return None; | |
}; | |
// Fast path: the input already names the canonical entry, so the normalized | |
// string is the answer. | |
if trie_value.is_canonical() { | |
return Some(NormalizedIana { string, bcp47_id }); | |
} | |
// If we get here, we need to walk the trie to find the canonical IANA ID. | |
// `string` is kept in sync with the walk: a byte is appended when descending | |
// to a child and bytes are popped when a node is exhausted. | |
loop { | |
let Some((mut cursor, index, suffix_len)) = stack.pop() else { | |
// Nothing left in the trie. | |
debug_assert!(false, "every time zone should have a canonical IANA ID"); | |
return None; | |
}; | |
// Check to see if there is a value at the current node. | |
if let Some(candidate) = cursor.take_value().map(IanaTrieValue) { | |
// A match is the entry with the same index as the input's entry that | |
// also carries the canonical flag. | |
if candidate.index() == trie_value.index() && candidate.is_canonical() { | |
// Success! Found what we were looking for. | |
break; | |
} | |
} | |
// Now check for children of the current node. | |
// Probe on a clone so that `cursor` still refers to the current node if it | |
// has to be pushed back for its remaining siblings. | |
let mut sub_cursor = cursor.clone(); | |
if let Some(probe_result) = sub_cursor.probe(index) { | |
// Found a child. Add the current byte edge to the string. | |
if !probe_result.byte.is_ascii() { | |
debug_assert!(false, "non-ASCII probe byte: {}", probe_result.byte); | |
return None; | |
} | |
// SAFETY: the byte being added is ASCII as guarded above, so appending | |
// it keeps the buffer valid UTF-8. | |
unsafe { string.to_mut().as_mut_vec().push(probe_result.byte) }; | |
// Add the child to the stack, and also add back the current | |
// node if there are more siblings to visit. | |
if index + 1 < probe_result.total_siblings as usize { | |
stack.push((cursor, index + 1, suffix_len)); | |
stack.push((sub_cursor, 0, 1)); | |
} else { | |
// This was the last child, so the current node is not pushed back; | |
// the child takes over the current node's bytes in addition to its own. | |
stack.push((sub_cursor, 0, suffix_len + 1)); | |
} | |
} else { | |
// No more children. Pop this node's bytes from the string. | |
for _ in 0..suffix_len { | |
// SAFETY: each removed byte is checked to be ASCII below, so the | |
// buffer is only ever truncated by whole characters. | |
let removed_byte = unsafe { string.to_mut().as_mut_vec().pop() }; | |
if let Some(removed_byte) = removed_byte { | |
if !removed_byte.is_ascii() { | |
debug_assert!(false, "non-ASCII removed byte: {removed_byte}"); | |
return None; | |
} | |
} else { | |
debug_assert!(false, "could not remove another byte"); | |
return None; | |
} | |
} | |
} | |
} | |
// The loop only exits via `break`, at which point `string` spells the path | |
// to the canonical entry. | |
Some(NormalizedIana { string, bcp47_id }) | |
} | |
|
||
/// Returns the canonical, normalized IANA ID of the given BCP-47 ID. | ||
/// | ||
/// Only use this function if you don't have the IANA ID. [`Self::canonicalize_iana()`] | ||
/// is much faster in the common case. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use icu_timezone::TimeZoneBcp47Id; | ||
/// use icu_timezone::TimeZoneIdMapper; | ||
/// use tinystr::tinystr; | ||
/// | ||
/// let mapper = TimeZoneIdMapper::new(); | ||
/// let mapper = mapper.as_borrowed(); | ||
/// | ||
/// let canonicalized = mapper.canonicalize_iana("Asia/Calcutta").unwrap(); | ||
/// | ||
/// assert_eq!(canonicalized.string, "Asia/Kolkata"); | ||
/// assert_eq!( | ||
/// canonicalized.bcp47_id, | ||
/// TimeZoneBcp47Id(tinystr!(8, "inccu")) | ||
/// ); | ||
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// ``` | ||
pub fn bcp47_to_iana_search(&self, bcp47_id: TimeZoneBcp47Id) -> Option<String> { | ||
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// TODO: This is not as efficient as .probe() since it allocates a string each time. | ||
for (string, raw_value) in self.data.map.iter() { | ||
let trie_value = IanaTrieValue(raw_value); | ||
if !trie_value.is_canonical() { | ||
continue; | ||
} | ||
let Some(candidate_bcp47_id) = self.data.bcp47_ids.get(trie_value.index()) else { | ||
debug_assert!(false, "index should be in range"); | ||
return None; | ||
}; | ||
if candidate_bcp47_id == bcp47_id { | ||
return Some(string); | ||
} | ||
} | ||
None | ||
} | ||
|
||
fn iana_lookup_quick(&self, iana_id: &str) -> Option<IanaTrieValue> { | ||
self.data.map.get(iana_id).map(IanaTrieValue) | ||
} | ||
|
||
/// Walks the trie one input byte at a time, returning the trie value for | |
/// `iana_id` together with the string as spelled by the trie. | |
/// | |
/// `cursor_fn` is invoked with the current cursor at the top of every loop | |
/// iteration: once before each byte is consumed, and once more after the last | |
/// byte. The returned string borrows `iana_id` unless a byte had to be | |
/// rewritten, in which case it is an owned copy. | |
/// | |
/// Returns `None` if the trie has no step for some byte or no value at the end | |
/// of the input. | |
fn iana_lookup_with_normalization<'l, 's>( | |
&'l self, | |
iana_id: &'s str, | |
mut cursor_fn: impl FnMut(&ZeroAsciiIgnoreCaseTrieCursor<'l>), | |
) -> Option<(IanaTrieValue, Cow<'s, str>)> { | |
let mut cursor = self.data.map.cursor(); | |
// Start out borrowing the input; `to_mut()` below allocates only if needed. | |
let mut string = Cow::Borrowed(iana_id); | |
let mut i = 0; | |
let trie_value = loop { | |
cursor_fn(&cursor); | |
let Some(input_byte) = string.as_bytes().get(i).copied() else { | |
// End of input: the result is whatever value sits at the current node. | |
break cursor.take_value().map(IanaTrieValue); | |
}; | |
let Some(matched_byte) = cursor.step(input_byte) else { | |
break None; | |
}; | |
// The trie reports the byte it matched; if it differs from the input byte | |
// (presumably only in ASCII case — confirm against the cursor's docs), | |
// rewrite the input byte to the trie's spelling. | |
if matched_byte != input_byte { | |
// SAFETY: we write to input_byte farther down after performing safety checks. | |
let Some(input_byte) = unsafe { string.to_mut().as_bytes_mut() }.get_mut(i) else { | |
debug_assert!(false, "the same index was just accessed earlier"); | |
break None; | |
}; | |
if !input_byte.is_ascii() { | |
debug_assert!(false, "non-ASCII input byte: {input_byte}"); | |
break None; | |
} | |
if !matched_byte.is_ascii() { | |
debug_assert!(false, "non-ASCII matched byte: {matched_byte}"); | |
break None; | |
} | |
// SAFETY: we just checked that both input_byte and matched_byte are ASCII, | |
// so the buffer remains UTF-8 when we replace one with the other. | |
*input_byte = matched_byte; | |
} | |
i += 1; | |
// The trailing `?` turns a `None` from any `break` into an early return. | |
}?; | |
Some((trie_value, string)) | |
} | |
} | ||
|
||
/// A wrapper around a syntax-normalized IANA time zone identifier string | |
/// and its corresponding BCP-47 time zone identifier. | |
/// | |
/// Returned by [`TimeZoneIdMapperBorrowed::normalize_iana()`] and | |
/// [`TimeZoneIdMapperBorrowed::canonicalize_iana()`]. | |
#[derive(Debug)] | |
pub struct NormalizedIana<'s> { | |
/// The syntax-normalized IANA time zone identifier string. | |
/// | |
/// Borrows the caller's input as long as no byte had to be changed; becomes | |
/// owned once the lookup has to modify the string. | |
pub string: Cow<'s, str>, | |
/// The corresponding BCP-47 time zone identifier. | |
pub bcp47_id: TimeZoneBcp47Id, | |
robertbastian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | |
|
||
/// Raw `usize` value stored in the IANA trie, decoded by the accessors below. | |
#[derive(Copy, Clone)] | |
#[repr(transparent)] | |
struct IanaTrieValue(usize); | |
|
||
impl IanaTrieValue { | |
    /// Index portion of the value: every bit above the lowest one. | |
    #[inline] | |
    pub fn index(self) -> usize { | |
        let Self(raw) = self; | |
        raw >> 1 | |
    } | |
    /// Canonical flag: `true` when the lowest bit is set. | |
    #[inline] | |
    pub fn is_canonical(self) -> bool { | |
        let Self(raw) = self; | |
        (raw & 0x1) == 1 | |
    } | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For developers unfamiliar with the details of the IANA Time Zone Database (which is most of them!), it may be helpful to understand the scale of the timezone ID data, in order to help them understand the perf and RAM consequences of various mapping options. Here's one possible way to do it. Feel free to ignore if this kind of info is not appropriate here.