Skip to content

Commit

Permalink
Support three Unicode properties required by UAX #29
Browse files Browse the repository at this point in the history
The TOML files were obtained from the Azure artifact archive built from commit
2921a81ee4c67459ff455e31c599e7d7a09086ab, titled "ICU-21811 TZ update
2021a (2021e)", on the maint/maint-70 branch. Here we import the "small"
flavor of the TOML files.
  • Loading branch information
aethanyc committed Oct 27, 2021
1 parent 131b845 commit a6638c7
Show file tree
Hide file tree
Showing 9 changed files with 8,566 additions and 4 deletions.
101 changes: 101 additions & 0 deletions components/properties/src/props.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ pub enum EnumeratedProperty {
GeneralCategory = 0x1005,
/// The Script property. See [`Script`].
Script = 0x100A,
/// The Grapheme_Cluster_Break enumerated property. See [`GraphemeClusterBreak`].
GraphemeClusterBreak = 0x1012,
/// The Sentence_Break enumerated property. See [`SentenceBreak`].
SentenceBreak = 0x1013,
/// The Word_Break enumerated property. See [`WordBreak`].
WordBreak = 0x1014,
/// The Script_Extensions property. See [`Script`].
ScriptExtensions = 0x7000, // TODO(#1160) - this is a Miscellaneous property, not Enumerated
/// Represents an invalid or unknown Unicode property.
Expand Down Expand Up @@ -416,3 +422,98 @@ impl Script {
pub const Yi: Script = Script(41);
pub const ZanabazarSquare: Script = Script(177);
}

/// The Grapheme_Cluster_Break enumerated property.
///
/// Each property value is summarized in the "Default Grapheme Cluster
/// Boundary Specification" section of UAX #29:
/// <https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table>
///
/// Numeric values are compatible with `UGraphemeClusterBreak` in ICU4C.
#[repr(transparent)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct GraphemeClusterBreak(pub u8);

// Named values of the property. Note that the numbering is not contiguous:
// nothing is declared here for 13 through 16, and `ZWJ` is 17.
#[allow(non_upper_case_globals)]
#[allow(missing_docs)] // These constants don't need individual documentation.
impl GraphemeClusterBreak {
    pub const Other: Self = Self(0); // name="XX"
    pub const Control: Self = Self(1); // name="CN"
    pub const CR: Self = Self(2); // name="CR"
    pub const Extend: Self = Self(3); // name="EX"
    pub const L: Self = Self(4); // name="L"
    pub const LF: Self = Self(5); // name="LF"
    pub const LV: Self = Self(6); // name="LV"
    pub const LVT: Self = Self(7); // name="LVT"
    pub const T: Self = Self(8); // name="T"
    pub const V: Self = Self(9); // name="V"
    pub const SpacingMark: Self = Self(10); // name="SM"
    pub const Prepend: Self = Self(11); // name="PP"
    pub const RegionalIndicator: Self = Self(12); // name="RI"
    pub const ZWJ: Self = Self(17); // name="ZWJ"
}

/// The Word_Break enumerated property.
///
/// Each property value is summarized in the "Default Word Boundary
/// Specification" section of UAX #29:
/// <https://www.unicode.org/reports/tr29/#Default_Word_Boundaries>.
///
/// Numeric values are compatible with `UWordBreakValues` in ICU4C.
#[repr(transparent)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct WordBreak(pub u8);

// Named values of the property. Note that the numbering is not contiguous:
// nothing is declared here for 17 through 20, and `ZWJ` is 21.
#[allow(non_upper_case_globals)]
#[allow(missing_docs)] // These constants don't need individual documentation.
impl WordBreak {
    pub const Other: Self = Self(0); // name="XX"
    pub const ALetter: Self = Self(1); // name="LE"
    pub const Format: Self = Self(2); // name="FO"
    pub const Katakana: Self = Self(3); // name="KA"
    pub const MidLetter: Self = Self(4); // name="ML"
    pub const MidNum: Self = Self(5); // name="MN"
    pub const Numeric: Self = Self(6); // name="NU"
    pub const ExtendNumLet: Self = Self(7); // name="EX"
    pub const CR: Self = Self(8); // name="CR"
    pub const Extend: Self = Self(9); // name="Extend"
    pub const LF: Self = Self(10); // name="LF"
    pub const MidNumLet: Self = Self(11); // name="MB"
    pub const Newline: Self = Self(12); // name="NL"
    pub const RegionalIndicator: Self = Self(13); // name="RI"
    pub const HebrewLetter: Self = Self(14); // name="HL"
    pub const SingleQuote: Self = Self(15); // name="SQ"
    pub const DoubleQuote: Self = Self(16); // name="DQ"
    pub const ZWJ: Self = Self(21); // name="ZWJ"
    pub const WSegSpace: Self = Self(22); // name="WSegSpace"
}

/// Enumerated property Sentence_Break.
///
/// See "Default Sentence Boundary Specification" in UAX #29 for the summary of
/// each property value:
/// <https://www.unicode.org/reports/tr29/#Default_Sentence_Boundaries>.
///
/// The numeric value is compatible with `USentenceBreak` in ICU4C.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[repr(transparent)]
pub struct SentenceBreak(pub u8);

// Named values of the property, numbered contiguously from 0 through 14.
#[allow(missing_docs)] // These constants don't need individual documentation.
#[allow(non_upper_case_globals)]
impl SentenceBreak {
    pub const Other: Self = Self(0); // name="XX"
    pub const ATerm: Self = Self(1); // name="AT"
    pub const Close: Self = Self(2); // name="CL"
    pub const Format: Self = Self(3); // name="FO"
    pub const Lower: Self = Self(4); // name="LO"
    pub const Numeric: Self = Self(5); // name="NU"
    pub const OLetter: Self = Self(6); // name="LE"
    pub const Sep: Self = Self(7); // name="SE"
    pub const Sp: Self = Self(8); // name="SP"
    pub const STerm: Self = Self(9); // name="ST"
    pub const Upper: Self = Self(10); // name="UP"
    pub const CR: Self = Self(11); // name="CR"
    pub const Extend: Self = Self(12); // name="EX"
    pub const LF: Self = Self(13); // name="LF"
    pub const SContinue: Self = Self(14); // name="SC"
}
5 changes: 4 additions & 1 deletion components/properties/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub mod key {
};
}

define_resource_keys!(267;
define_resource_keys!(270;

//
// Binary property UnicodeSets
Expand Down Expand Up @@ -322,6 +322,9 @@ pub mod key {

(GENERAL_CATEGORY_V1, "gc"),
(SCRIPT_V1, "sc"),
(GRAPHEME_CLUSTER_BREAK_V1, "GCB"),
(SENTENCE_BREAK_V1, "SB"),
(WORD_BREAK_V1, "WB"),

);
}
Expand Down
29 changes: 28 additions & 1 deletion components/properties/src/trievalue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::{GeneralSubcategory, Script};
use crate::{GeneralSubcategory, GraphemeClusterBreak, Script, SentenceBreak, WordBreak};
use core::convert::TryInto;
use core::num::TryFromIntError;
use icu_codepointtrie::codepointtrie::TrieValue;
Expand All @@ -27,3 +27,30 @@ impl TrieValue for Script {
u16::try_from(i).map(Script)
}
}

// Lets `GraphemeClusterBreak` be used as the value type of a code point trie.
impl TrieValue for GraphemeClusterBreak {
    // `Other` is the value designated as the trie's data-get error value.
    const DATA_GET_ERROR_VALUE: Self = Self::Other;
    type TryFromU32Error = TryFromIntError;

    // Narrows the raw `u32` to `u8`; a value that does not fit in a `u8`
    // yields the integer-conversion error instead of being truncated.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }
}

// Lets `WordBreak` be used as the value type of a code point trie.
impl TrieValue for WordBreak {
    // `Other` is the value designated as the trie's data-get error value.
    const DATA_GET_ERROR_VALUE: Self = Self::Other;
    type TryFromU32Error = TryFromIntError;

    // Narrows the raw `u32` to `u8`; a value that does not fit in a `u8`
    // yields the integer-conversion error instead of being truncated.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }
}

// Lets `SentenceBreak` be used as the value type of a code point trie.
impl TrieValue for SentenceBreak {
    // `Other` is the value designated as the trie's data-get error value.
    const DATA_GET_ERROR_VALUE: Self = Self::Other;
    type TryFromU32Error = TryFromIntError;

    // Narrows the raw `u32` to `u8`; a value that does not fit in a `u8`
    // yields the integer-conversion error instead of being truncated.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }
}
44 changes: 43 additions & 1 deletion components/properties/src/ule.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::{GeneralSubcategory, Script};
use crate::{GeneralSubcategory, GraphemeClusterBreak, Script, SentenceBreak, WordBreak};

use core::convert::TryFrom;
use num_enum::TryFromPrimitiveError;
Expand Down Expand Up @@ -59,3 +59,45 @@ impl AsULE for Script {
Script(u16::from_le_bytes(unaligned.0))
}
}

impl AsULE for GraphemeClusterBreak {
type ULE = u8;

#[inline]
fn as_unaligned(self) -> Self::ULE {
self.0
}

#[inline]
fn from_unaligned(unaligned: Self::ULE) -> Self {
Self(unaligned)
}
}

impl AsULE for WordBreak {
type ULE = u8;

#[inline]
fn as_unaligned(self) -> Self::ULE {
self.0
}

#[inline]
fn from_unaligned(unaligned: Self::ULE) -> Self {
Self(unaligned)
}
}

impl AsULE for SentenceBreak {
type ULE = u8;

#[inline]
fn as_unaligned(self) -> Self::ULE {
self.0
}

#[inline]
fn from_unaligned(unaligned: Self::ULE) -> Self {
Self(unaligned)
}
}
Loading

0 comments on commit a6638c7

Please sign in to comment.