diff --git a/components/casemap/src/greek_to_me/data.rs b/components/casemap/src/greek_to_me/data.rs index 61ef0f5bee1..05a1e06414c 100644 --- a/components/casemap/src/greek_to_me/data.rs +++ b/components/casemap/src/greek_to_me/data.rs @@ -9,9 +9,9 @@ // All u8s in this file are PackedGreekPrecomposedLetterDatas, see parent module /// Data for characters in U+370-U+3FF -pub(crate) const DATA_370: [u8; 0x90] = [128, 128, 128, 128, 0, 0, 128, 128, 0, 0, 128, 128, 128, 128, 0, 128, 0, 0, 0, 0, 0, 0, 65, 0, 66, 67, 68, 0, 69, 0, 70, 71, 100, 1, 128, 128, 128, 2, 128, 3, 128, 4, 128, 128, 128, 128, 128, 5, 128, 128, 0, 128, 128, 6, 128, 128, 128, 7, 36, 38, 65, 66, 67, 68, 102, 1, 128, 128, 128, 2, 128, 3, 128, 4, 128, 128, 128, 128, 128, 5, 128, 128, 128, 128, 128, 6, 128, 128, 128, 7, 36, 38, 69, 70, 71, 128, 128, 128, 8, 72, 40, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 128, 128, 128, 2, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128]; +pub(crate) const DATA_370: [u8; 0x90] = [128, 128, 128, 128, 0, 0, 128, 128, 0, 0, 128, 128, 128, 128, 0, 128, 0, 0, 0, 0, 0, 0, 65, 0, 66, 67, 68, 0, 69, 0, 70, 71, 100, 1, 128, 128, 128, 2, 128, 3, 128, 4, 128, 128, 128, 128, 128, 5, 128, 129, 0, 128, 128, 6, 128, 128, 128, 7, 36, 38, 65, 66, 67, 68, 102, 1, 128, 128, 128, 2, 128, 3, 128, 4, 128, 128, 128, 128, 128, 5, 128, 129, 128, 128, 128, 6, 128, 128, 128, 7, 36, 38, 69, 70, 71, 128, 128, 128, 8, 72, 40, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 129, 128, 128, 128, 2, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128]; /// Data for characters in U+1F00-U+1FFF -pub(crate) const DATA_1F00: [u8; 0xfd] = [1, 1, 65, 65, 65, 65, 65, 65, 1, 1, 65, 65, 65, 65, 65, 65, 2, 2, 66, 66, 66, 66, 0, 0, 2, 2, 66, 66, 66, 66, 0, 0, 3, 3, 67, 67, 67, 67, 67, 67, 3, 3, 67, 67, 67, 67, 67, 67, 4, 4, 68, 68, 68, 68, 68, 68, 4, 4, 68, 68, 68, 68, 68, 68, 5, 
5, 69, 69, 69, 69, 0, 0, 5, 5, 69, 69, 69, 69, 0, 0, 6, 6, 70, 70, 70, 70, 70, 70, 0, 6, 0, 70, 0, 70, 0, 70, 7, 7, 71, 71, 71, 71, 71, 71, 7, 7, 71, 71, 71, 71, 71, 71, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 0, 0, 17, 17, 81, 81, 81, 81, 81, 81, 17, 17, 81, 81, 81, 81, 81, 81, 19, 19, 83, 83, 83, 83, 83, 83, 19, 19, 83, 83, 83, 83, 83, 83, 23, 23, 87, 87, 87, 87, 87, 87, 23, 23, 87, 87, 87, 87, 87, 87, 1, 1, 81, 17, 81, 0, 65, 81, 1, 1, 65, 65, 17, 0, 4, 0, 0, 0, 83, 19, 83, 0, 67, 83, 66, 66, 67, 67, 19, 0, 0, 0, 4, 4, 100, 100, 0, 0, 68, 100, 4, 4, 68, 68, 0, 0, 0, 0, 6, 6, 102, 102, 128, 128, 70, 102, 6, 6, 70, 70, 128, 0, 0, 0, 0, 0, 87, 23, 87, 0, 71, 87, 69, 69, 71, 71, 23]; +pub(crate) const DATA_1F00: [u8; 0xfd] = [1, 1, 65, 65, 65, 65, 65, 65, 1, 1, 65, 65, 65, 65, 65, 65, 2, 2, 66, 66, 66, 66, 0, 0, 2, 2, 66, 66, 66, 66, 0, 0, 3, 3, 67, 67, 67, 67, 67, 67, 3, 3, 67, 67, 67, 67, 67, 67, 4, 4, 68, 68, 68, 68, 68, 68, 4, 4, 68, 68, 68, 68, 68, 68, 5, 5, 69, 69, 69, 69, 0, 0, 5, 5, 69, 69, 69, 69, 0, 0, 6, 6, 70, 70, 70, 70, 70, 70, 0, 6, 0, 70, 0, 70, 0, 70, 7, 7, 71, 71, 71, 71, 71, 71, 7, 7, 71, 71, 71, 71, 71, 71, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 0, 0, 17, 17, 81, 81, 81, 81, 81, 81, 17, 17, 81, 81, 81, 81, 81, 81, 19, 19, 83, 83, 83, 83, 83, 83, 19, 19, 83, 83, 83, 83, 83, 83, 23, 23, 87, 87, 87, 87, 87, 87, 23, 23, 87, 87, 87, 87, 87, 87, 1, 1, 81, 17, 81, 0, 65, 81, 1, 1, 65, 65, 17, 0, 4, 0, 0, 0, 83, 19, 83, 0, 67, 83, 66, 66, 67, 67, 19, 0, 0, 0, 4, 4, 100, 100, 0, 0, 68, 100, 4, 4, 68, 68, 0, 0, 0, 0, 6, 6, 102, 102, 129, 129, 70, 102, 6, 6, 70, 70, 129, 0, 0, 0, 0, 0, 87, 23, 87, 0, 71, 87, 69, 69, 71, 71, 23]; /// Characters like the ohm sign that do not belong in the two blocks above pub(crate) fn match_extras(ch: char) -> Option { @@ -21,3 +21,4 @@ pub(crate) fn match_extras(ch: char) -> Option { _ => return None }) } + diff --git a/components/casemap/src/greek_to_me/mod.rs 
b/components/casemap/src/greek_to_me/mod.rs index cb2153f0f48..c73d5bc576f 100644 --- a/components/casemap/src/greek_to_me/mod.rs +++ b/components/casemap/src/greek_to_me/mod.rs @@ -35,9 +35,9 @@ pub(crate) fn get_data(ch: char) -> Option { /// Bit layout: /// /// ```text -/// 7 6 5 4 3 2 1 0 -/// discr=0 | [diacritics] | [vowel ] -/// discr=1 | [ unused = 0 ] +/// 7 6 5 4 3 2 1 0 +/// discr=0 | [diacritics] | [vowel ] +/// discr=1 | [ unused = 0 ] | [is_rho] /// ``` /// /// Bit 7 is the discriminant. if 0, it is a vowel, else, it is a consonant. @@ -70,8 +70,8 @@ impl TryFrom for GreekPrecomposedLetterData { Ok(GreekPrecomposedLetterData::Vowel(vowel, diacritics)) } else { // consonant - - Ok(GreekPrecomposedLetterData::Consonant) + // 0x80 is is_rho = false, 0x81 is is_rho = true + Ok(GreekPrecomposedLetterData::Consonant(other.0 == 0x81)) } } } @@ -93,7 +93,9 @@ impl From for PackedGreekPrecomposedLetterData { bits |= vowel as u8; PackedGreekPrecomposedLetterData(bits) } - GreekPrecomposedLetterData::Consonant => PackedGreekPrecomposedLetterData(0x80), + GreekPrecomposedLetterData::Consonant(is_rho) => { + PackedGreekPrecomposedLetterData(0x80 + is_rho as u8) + } } } } @@ -104,7 +106,10 @@ pub enum GreekPrecomposedLetterData { /// A vowel, with a capitalized base letter, and the diacritics found Vowel(GreekVowel, GreekDiacritics), /// A consonant or vowel that does not take diacritics - Consonant, + /// + /// The boolean is true when the consonant is a rho, which is handled specially since + /// it can take breathing marks (but is *not* a vowel) + Consonant(bool), } /// n.b. 
these are Greek capital letters, not Latin @@ -120,6 +125,7 @@ pub enum GreekVowel { Ω = 7, ϒ = 8, } +pub const CAPITAL_RHO: char = 'Ρ'; impl From for char { fn from(other: GreekVowel) -> Self { diff --git a/components/casemap/src/internals.rs b/components/casemap/src/internals.rs index 1a8336d6137..e789764e18b 100644 --- a/components/casemap/src/internals.rs +++ b/components/casemap/src/internals.rs @@ -238,81 +238,88 @@ impl<'data> CaseMapV1<'data> { } let data = greek_to_me::get_data(c); // Check if the character is a Greek vowel - if let Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) = data - { - // Get the diacritics on the character itself, and add any further combining diacritics - // from the context. - let mut diacritics = context.add_greek_diacritics(precomposed_diacritics); - // If the previous vowel had an accent (which would be removed) but no dialytika, - // and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate - // the now-unaccented adjacent vowels from a digraph/diphthong. - // Use a precomposed dialytika if the accent was precomposed, and a combining dialytika - // if the accent was combining, so as to map NFD to NFD and NFC to NFC. - if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ) { - if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() { - if !preceding_vowel.combining.dialytika - && !preceding_vowel.precomposed.dialytika - { - if preceding_vowel.combining.accented { - diacritics.dialytika = true; - } else { - precomposed_diacritics.dialytika = - preceding_vowel.precomposed.accented; + match data { + Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => { + // Get the diacritics on the character itself, and add any further combining diacritics + // from the context. 
+ let mut diacritics = context.add_greek_diacritics(precomposed_diacritics); + // If the previous vowel had an accent (which would be removed) but no dialytika, + // and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate + // the now-unaccented adjacent vowels from a digraph/diphthong. + // Use a precomposed dialytika if the accent was precomposed, and a combining dialytika + // if the accent was combining, so as to map NFD to NFD and NFC to NFC. + if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ) + { + if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() { + if !preceding_vowel.combining.dialytika + && !preceding_vowel.precomposed.dialytika + { + if preceding_vowel.combining.accented { + diacritics.dialytika = true; + } else { + precomposed_diacritics.dialytika = + preceding_vowel.precomposed.accented; + } } } } - } - // Write the base of the uppercased combining character sequence. - // In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed. - // In some branches the base has a precomposed diacritic. - // In the case of the Greek disjunctive "or", a combining tonos may also be written. - match vowel { - GreekVowel::Η => { - // The letter η (eta) is allowed to retain a tonos when it is form a single-letter word to distinguish - // the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή). - // - // A lone η with an accent other than the oxia/tonos is not expected, - // so there is no need to special-case the oxia/tonos. - // The ancient ᾖ (exist.PRS.SUBJ.3s) has a iota subscript as well as the circumflex, - // so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle - // (e.g. να είναι) since Byzantine times anyway. 
- if diacritics.accented - && !context.followed_by_cased_letter(self) - && !context.preceded_by_cased_letter(self) - && !diacritics.ypogegrammeni - { - if precomposed_diacritics.accented { - sink.write_char('Ή')?; + // Write the base of the uppercased combining character sequence. + // In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed. + // In some branches the base has a precomposed diacritic. + // In the case of the Greek disjunctive "or", a combining tonos may also be written. + match vowel { + GreekVowel::Η => { + // The letter η (eta) is allowed to retain a tonos when it forms a single-letter word to distinguish + // the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή). + // + // A lone η with an accent other than the oxia/tonos is not expected, + // so there is no need to special-case the oxia/tonos. + // The ancient ᾖ (exist.PRS.SUBJ.3s) has an iota subscript as well as the circumflex, + // so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle + // (e.g. να είναι) since Byzantine times anyway. 
+ if diacritics.accented + && !context.followed_by_cased_letter(self) + && !context.preceded_by_cased_letter(self) + && !diacritics.ypogegrammeni + { + if precomposed_diacritics.accented { + sink.write_char('Ή')?; + } else { + sink.write_char('Η')?; + sink.write_char(greek_to_me::TONOS)?; + } } else { sink.write_char('Η')?; - sink.write_char(greek_to_me::TONOS)?; } - } else { - sink.write_char('Η')?; } + GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika { + diacritics.dialytika = false; + 'Ϊ' + } else { + vowel.into() + })?, + GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika { + diacritics.dialytika = false; + 'Ϋ' + } else { + vowel.into() + })?, + _ => sink.write_char(vowel.into())?, + }; + if diacritics.dialytika { + sink.write_char(greek_to_me::DIALYTIKA)?; + } + if precomposed_diacritics.ypogegrammeni { + sink.write_char('Ι')?; } - GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika { - diacritics.dialytika = false; - 'Ϊ' - } else { - vowel.into() - })?, - GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika { - diacritics.dialytika = false; - 'Ϋ' - } else { - vowel.into() - })?, - _ => sink.write_char(vowel.into())?, - }; - if diacritics.dialytika { - sink.write_char(greek_to_me::DIALYTIKA)?; + + return Ok(()); } - if precomposed_diacritics.ypogegrammeni { - sink.write_char('Ι')?; + Some(GreekPrecomposedLetterData::Consonant(true)) => { + sink.write_char(greek_to_me::CAPITAL_RHO)?; + return Ok(()); } - - return Ok(()); + _ => (), } } diff --git a/components/casemap/tests/gen_greek_to_me.rs b/components/casemap/tests/gen_greek_to_me.rs index 53eb5736f0e..51713509611 100644 --- a/components/casemap/tests/gen_greek_to_me.rs +++ b/components/casemap/tests/gen_greek_to_me.rs @@ -66,6 +66,12 @@ fn main() { panic!("Found character {ch} that has diacritics but is not a Greek vowel"); } } + greek_to_me::diacritics!(BREATHING_AND_LENGTH) + | greek_to_me::diacritics!(ACCENTS) => { + if let 
Some(GreekPrecomposedLetterData::Consonant(false)) = data { + panic!("Found character {ch} that has diacritics but is not a Greek vowel"); + } + } // Ignore all small letters '\u{1D00}'..='\u{1DBF}' | '\u{AB65}' => (), // caps: [[:Grek:]&[:L:]-[\u1D00-\u1DBF\uAB65]] . NFD, remove non-letters, uppercase @@ -87,7 +93,8 @@ fn main() { GreekDiacritics::default(), )) } else { - data = Some(GreekPrecomposedLetterData::Consonant) + let is_rho = uppercased == greek_to_me::CAPITAL_RHO; + data = Some(GreekPrecomposedLetterData::Consonant(is_rho)) }; } _ => (),