Skip to content

Commit

Permalink
Implement MeasureUnit (#4360)
Browse files Browse the repository at this point in the history
  • Loading branch information
younies authored Dec 7, 2023
1 parent 011c630 commit cb242cb
Show file tree
Hide file tree
Showing 11 changed files with 2,494 additions and 186 deletions.
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions experimental/unitsconversion/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ displaydoc = { version = "0.2.3", default-features = false }
icu_locid = { workspace = true }
icu_provider = { workspace = true, features = ["macros"] }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
smallvec = "1.11.2"
zerotrie = { workspace = true, features = ["yoke", "zerofrom"] }
zerovec = { workspace = true, features = ["yoke"] }

Expand Down
16 changes: 16 additions & 0 deletions experimental/unitsconversion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,20 @@

extern crate alloc;

pub mod measureunit;
pub mod power;
pub mod provider;
pub mod si_prefix;

/// Represents the possible errors that can occur during the measurement unit operations.
///
/// The derives make the error usable in practice: `Debug` is required by
/// `Result::unwrap`/`expect` and by `assert_eq!`, and `PartialEq`/`Eq` let callers and
/// tests compare a returned error against an expected variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConversionError {
    /// The unit is not valid.
    /// This can happen if the unit id is not following the CLDR specification.
    /// For example, `meter` is a valid unit id, but `metre` is not.
    InvalidUnit,

    /// The conversion is not valid.
    /// This can happen if the units are not compatible.
    /// For example, `meter` and `foot` are compatible, but `meter` and `second` are not.
    InvalidConversion,
}
129 changes: 129 additions & 0 deletions experimental/unitsconversion/src/measureunit.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use smallvec::SmallVec;
use zerotrie::ZeroTrie;
use zerovec::ZeroVec;

use crate::{
power::get_power,
provider::{Base, MeasureUnitItem, SiPrefix},
si_prefix::{get_si_prefix_base_ten, get_si_prefix_base_two},
ConversionError,
};

// TODO(#4369): split this struct to two structs: MeasureUnitParser for parsing the identifier and MeasureUnit to represent the unit.
// TODO NOTE: the MeasureUnitParser takes the trie and the ConverterFactory takes the full payload and an instance of MeasureUnitParser.
/// A measurement unit represented as a list of [`MeasureUnitItem`]s.
///
/// NOTE(review): the parsing functions in this file's `impl MeasureUnit` return a plain
/// `Vec<MeasureUnitItem>` and do not construct this struct; presumably that is part of
/// the split tracked by the TODO above — confirm before relying on `contained_units`.
pub struct MeasureUnit {
/// Contains the processed units.
/// The `SmallVec` is declared with an inline array of 8 items.
pub contained_units: SmallVec<[MeasureUnitItem; 8]>,
}

impl MeasureUnit {
// TODO: complete all the cases for the prefixes.
// TODO: consider using a trie for the prefixes.
/// Extracts the SI prefix.
/// NOTE:
/// if the prefix is found, the function will return (SiPrefix, part without the prefix string).
/// if the prefix is not found, the function will return (SiPrefix { power: 0, base: Base::Decimal }, part).
fn get_si_prefix(part: &str) -> (SiPrefix, &str) {
let (si_prefix_base_10, part) = get_si_prefix_base_ten(part);
if si_prefix_base_10 != 0 {
return (
SiPrefix {
power: si_prefix_base_10,
base: Base::Decimal,
},
part,
);
}

let (si_prefix_base_2, part) = get_si_prefix_base_two(part);
if si_prefix_base_2 != 0 {
return (
SiPrefix {
power: si_prefix_base_2,
base: Base::Binary,
},
part,
);
}

(
SiPrefix {
power: 0,
base: Base::Decimal,
},
part,
)
}

/// Get the unit id.
/// NOTE:
/// if the unit id is found, the function will return (unit id, part without the unit id and without `-` at the beginning of the remaining part if it exists).
/// if the unit id is not found, the function will return None.
fn get_unit_id<'data>(part: &'data str, trie: &ZeroTrie<ZeroVec<'data, u8>>) -> Option<usize> {
trie.get(part.as_bytes())
}

/// Process a part of an identifier.
/// For example, if the whole identifier is: "square-kilometer-per-second",
/// this function will be called for "square-kilometer" with sign (1) and "second" with sign (-1).
fn analyze_identifier_part(
identifier_part: &str,
sign: i8,
result: &mut Vec<MeasureUnitItem>,
trie: &ZeroTrie<ZeroVec<'_, u8>>,
) -> Result<(), ConversionError> {
if identifier_part.is_empty() {
return Ok(());
}
let mut identifier_split = identifier_part.split('-');
while let Some(mut part) = identifier_split.next() {
let power = match get_power(part) {
Some(power) => {
part = identifier_split
.next()
.ok_or(ConversionError::InvalidUnit)?;
power
}
None => 1,
};

let (si_prefix, identifier_after_si) = Self::get_si_prefix(part);
let unit_id =
Self::get_unit_id(identifier_after_si, trie).ok_or(ConversionError::InvalidUnit)?;

result.push(MeasureUnitItem {
power: sign * power,
si_prefix,
unit_id: unit_id as u16,
});
}

Ok(())
}

// TODO: add test cases for this function.
/// Process an identifier.
pub fn try_from_identifier<'data>(
identifier: &'data str,
trie: &ZeroTrie<ZeroVec<'data, u8>>,
) -> Result<Vec<MeasureUnitItem>, ConversionError> {
if identifier.starts_with('-') {
return Err(ConversionError::InvalidUnit);
}

let (num_part, den_part) = identifier
.split_once("per-")
.map(|(num_part, den_part)| (num_part.strip_suffix('-').unwrap_or(num_part), den_part))
.unwrap_or((identifier, ""));

let mut measure_unit_items = Vec::<MeasureUnitItem>::new();

Self::analyze_identifier_part(num_part, 1, &mut measure_unit_items, trie)?;
Self::analyze_identifier_part(den_part, -1, &mut measure_unit_items, trie)?;
Ok(measure_unit_items)
}
}
28 changes: 28 additions & 0 deletions experimental/unitsconversion/src/power.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// TODO: complete all the cases for the powers.
// TODO: consider using a trie for the powers.
/// Maps a power keyword of a unit identifier to its exponent.
///
/// Recognised keywords are `pow1` through `pow15`, plus `square` (2) and `cubic` (3).
/// The match is exact; any other input, including the empty string, yields `None`.
pub fn get_power(part: &str) -> Option<i8> {
    // Every accepted spelling paired with the exponent it stands for.
    const POWER_KEYWORDS: &[(&str, i8)] = &[
        ("pow1", 1),
        ("square", 2),
        ("pow2", 2),
        ("cubic", 3),
        ("pow3", 3),
        ("pow4", 4),
        ("pow5", 5),
        ("pow6", 6),
        ("pow7", 7),
        ("pow8", 8),
        ("pow9", 9),
        ("pow10", 10),
        ("pow11", 11),
        ("pow12", 12),
        ("pow13", 13),
        ("pow14", 14),
        ("pow15", 15),
    ];

    POWER_KEYWORDS
        .iter()
        .find(|&&(keyword, _)| keyword == part)
        .map(|&(_, exponent)| exponent)
}
66 changes: 62 additions & 4 deletions experimental/unitsconversion/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
//!
//! Read more about data providers: [`icu_provider`]

use alloc::borrow::Cow;
use icu_provider::prelude::*;
use zerotrie::ZeroTrie;
use zerovec::{VarZeroVec, ZeroVec};
Expand Down Expand Up @@ -37,7 +36,7 @@ pub const KEYS: &[DataKey] = &[UnitsInfoV1Marker::KEY];
pub struct UnitsInfoV1<'data> {
/// Maps from unit name (e.g. foot) to its conversion information.
#[cfg_attr(feature = "serde", serde(borrow))]
pub units_conversion_map: ZeroTrie<ZeroVec<'data, u8>>,
pub units_conversion_trie: ZeroTrie<ZeroVec<'data, u8>>,

/// Contains the conversion information, such as the conversion rate and the base unit.
/// For example, the conversion information for the unit `foot` is `1 foot = 0.3048 meter`.
Expand Down Expand Up @@ -66,9 +65,9 @@ pub struct UnitsInfoV1<'data> {
)]
#[zerovec::derive(Debug)]
pub struct ConversionInfo<'data> {
/// Contains the base unit which the unit is converted to.
/// Contains the base unit (after parsing) that the unit is converted to.
#[cfg_attr(feature = "serde", serde(borrow))]
pub base_unit: Cow<'data, str>,
pub basic_units: ZeroVec<'data, MeasureUnitItem>,

/// Represents the numerator of the conversion factor.
#[cfg_attr(feature = "serde", serde(borrow))]
Expand Down Expand Up @@ -129,3 +128,62 @@ pub enum Exactness {
Exact = 0,
Approximate = 1,
}

/// The numeric base that the power of an [`SiPrefix`] applies to.
///
/// The discriminants are fixed explicitly and the enum is `repr(u8)`;
/// `Decimal` is the `Default` value.
#[zerovec::make_ule(BaseULE)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_unitsconversion::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Default)]
#[repr(u8)]
pub enum Base {
/// The base of the si prefix is 10.
#[default]
Decimal = 0,

/// The base of the si prefix is 2.
Binary = 1,
}

/// Represents an Item of a MeasureUnit.
/// For example, the MeasureUnit `kilometer-per-square-second` contains two items:
/// 1. `kilometer` with power 1 and prefix 3 with base 10.
/// 2. `second` with power -2 and no prefix
///    (the parser represents "no prefix" as an `SiPrefix` with power 0 and base `Decimal`).
#[zerovec::make_ule(MeasureUnitItemULE)]
#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Default)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_unitsconversion::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct MeasureUnitItem {
/// The power of the unit.
/// Negative for units that appear after `per` in the identifier.
pub power: i8,

/// The SI prefix of the unit.
pub si_prefix: SiPrefix,

/// The id of the unit.
/// The parser fills this with the value the units trie stores for the unit name.
pub unit_id: u16,
}

// TODO: Consider reducing the size of this struct while implementing the ULE.
/// Represents the SI prefix.
///
/// The prefix stands for `base` raised to `power`; the parser uses a `power` of 0
/// (with base `Decimal`) when a unit has no prefix.
#[zerovec::make_ule(SiPrefixULE)]
#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Default)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_unitsconversion::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct SiPrefix {
/// The signed power of the si prefix (not an absolute value):
/// for example 3 for `kilo`, -3 for `milli` and 10 for `kibi`.
pub power: i8,

/// The base of the si prefix.
pub base: Base,
}
93 changes: 93 additions & 0 deletions experimental/unitsconversion/src/si_prefix.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

// TODO: consider returning Option<(i8, &str)> instead of (0, part) for the case when the prefix is not found.
// TODO: consider using a trie for the prefixes.
/// Extracts the SI prefix of base 10.
///
/// Prefixes from `quetta` (10^30) down to `quecto` (10^-30) are recognised. The 10^1
/// prefix is accepted both as `deka` (the spelling used by CLDR unit identifiers) and
/// as `deca`.
///
/// NOTE:
///    if the prefix is found, the function will return (power, part without the prefix).
///    if the prefix is not found, the function will return (0, part).
///
/// Only the leading text is inspected; whether the remainder is a real unit is the
/// caller's concern (e.g. `decade` is split into `deca` and `de`).
pub fn get_si_prefix_base_ten(part: &str) -> (i8, &str) {
    // Prefix spelling and its power of ten, from largest to smallest.
    // No spelling is a prefix of another, so at most one entry can match.
    const DECIMAL_PREFIXES: &[(&str, i8)] = &[
        ("quetta", 30),
        ("ronna", 27),
        ("yotta", 24),
        ("zetta", 21),
        ("exa", 18),
        ("peta", 15),
        ("tera", 12),
        ("giga", 9),
        ("mega", 6),
        ("kilo", 3),
        ("hecto", 2),
        ("deka", 1),
        // Kept for backward compatibility with callers that pass this spelling.
        ("deca", 1),
        ("deci", -1),
        ("centi", -2),
        ("milli", -3),
        ("micro", -6),
        ("nano", -9),
        ("pico", -12),
        ("femto", -15),
        ("atto", -18),
        ("zepto", -21),
        ("yocto", -24),
        ("ronto", -27),
        ("quecto", -30),
    ];

    DECIMAL_PREFIXES
        .iter()
        .find_map(|&(prefix, power)| part.strip_prefix(prefix).map(|rest| (power, rest)))
        .unwrap_or((0, part))
}

// TODO: consider returning Option<(i8, &str)> instead of (0, part) for the case when the prefix is not found.
// TODO: consider using a trie for the prefixes.
// TODO: complete all the cases for the prefixes.
/// Splits a leading binary (base 2) prefix off `part`.
///
/// Recognised prefixes run from `kibi` (2^10) to `yobi` (2^80).
///
/// Returns `(power, rest)`:
///    on a match, the power of two and `part` with the prefix removed;
///    otherwise `(0, part)` with `part` untouched.
pub fn get_si_prefix_base_two(part: &str) -> (i8, &str) {
    // Prefix spelling and the power of two it denotes, in ascending order.
    const BINARY_PREFIXES: &[(&str, i8)] = &[
        ("kibi", 10),
        ("mebi", 20),
        ("gibi", 30),
        ("tebi", 40),
        ("pebi", 50),
        ("exbi", 60),
        ("zebi", 70),
        ("yobi", 80),
    ];

    for &(prefix, power) in BINARY_PREFIXES {
        if let Some(rest) = part.strip_prefix(prefix) {
            return (power, rest);
        }
    }
    (0, part)
}
Loading

0 comments on commit cb242cb

Please sign in to comment.