Skip to content

Commit

Permalink
Handle rho specially
Browse files Browse the repository at this point in the history
  • Loading branch information
Manishearth committed Aug 25, 2023
1 parent b236769 commit c686348
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 75 deletions.
5 changes: 3 additions & 2 deletions components/casemap/src/greek_to_me/data.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 13 additions & 7 deletions components/casemap/src/greek_to_me/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ pub(crate) fn get_data(ch: char) -> Option<GreekPrecomposedLetterData> {
/// Bit layout:
///
/// ```text
/// 7 6 5 4 3 2 1 0
/// discr=0 | [diacritics] | [vowel ]
/// discr=1 | [ unused = 0 ]
/// 7 6 5 4 3 2 1 0
/// discr=0 | [diacritics] | [vowel ]
/// discr=1 | [ unused = 0 ] | [is_rho]
/// ```
///
/// Bit 7 is the discriminant: if 0, it is a vowel; otherwise, it is a consonant.
Expand Down Expand Up @@ -70,8 +70,8 @@ impl TryFrom<PackedGreekPrecomposedLetterData> for GreekPrecomposedLetterData {
Ok(GreekPrecomposedLetterData::Vowel(vowel, diacritics))
} else {
// consonant

Ok(GreekPrecomposedLetterData::Consonant)
// 0x80 is is_rho = false, 0x81 is is_rho = true
Ok(GreekPrecomposedLetterData::Consonant(other.0 == 0x81))
}
}
}
Expand All @@ -93,7 +93,9 @@ impl From<GreekPrecomposedLetterData> for PackedGreekPrecomposedLetterData {
bits |= vowel as u8;
PackedGreekPrecomposedLetterData(bits)
}
GreekPrecomposedLetterData::Consonant => PackedGreekPrecomposedLetterData(0x80),
GreekPrecomposedLetterData::Consonant(is_rho) => {
PackedGreekPrecomposedLetterData(0x80 + is_rho as u8)
}
}
}
}
Expand All @@ -104,7 +106,10 @@ pub enum GreekPrecomposedLetterData {
/// A vowel, with a capitalized base letter, and the diacritics found
Vowel(GreekVowel, GreekDiacritics),
/// A consonant or vowel that does not take diacritics
Consonant,
///
/// The boolean is true when the consonant is a rho, which is handled specially since
/// it can take breathing marks (but is *not* a vowel)
Consonant(bool),
}

/// n.b. these are Greek capital letters, not Latin
Expand All @@ -120,6 +125,7 @@ pub enum GreekVowel {
Ω = 7,
ϒ = 8,
}
/// The Greek capital letter rho. It is the character compared against to flag a consonant
/// as rho, and the character written out when a rho is uppercased.
pub const CAPITAL_RHO: char = 'Ρ';

impl From<GreekVowel> for char {
fn from(other: GreekVowel) -> Self {
Expand Down
137 changes: 72 additions & 65 deletions components/casemap/src/internals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,81 +238,88 @@ impl<'data> CaseMapV1<'data> {
}
let data = greek_to_me::get_data(c);
// Check if the character is a Greek vowel
if let Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) = data
{
// Get the diacritics on the character itself, and add any further combining diacritics
// from the context.
let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
// If the previous vowel had an accent (which would be removed) but no dialytika,
// and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
// the now-unaccented adjacent vowels from a digraph/diphthong.
// Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
// if the accent was combining, so as to map NFD to NFD and NFC to NFC.
if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ) {
if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
if !preceding_vowel.combining.dialytika
&& !preceding_vowel.precomposed.dialytika
{
if preceding_vowel.combining.accented {
diacritics.dialytika = true;
} else {
precomposed_diacritics.dialytika =
preceding_vowel.precomposed.accented;
match data {
Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => {
// Get the diacritics on the character itself, and add any further combining diacritics
// from the context.
let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
// If the previous vowel had an accent (which would be removed) but no dialytika,
// and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
// the now-unaccented adjacent vowels from a digraph/diphthong.
// Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
// if the accent was combining, so as to map NFD to NFD and NFC to NFC.
if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ)
{
if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
if !preceding_vowel.combining.dialytika
&& !preceding_vowel.precomposed.dialytika
{
if preceding_vowel.combining.accented {
diacritics.dialytika = true;
} else {
precomposed_diacritics.dialytika =
preceding_vowel.precomposed.accented;
}
}
}
}
}
// Write the base of the uppercased combining character sequence.
// In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed.
// In some branches the base has a precomposed diacritic.
// In the case of the Greek disjunctive "or", a combining tonos may also be written.
match vowel {
GreekVowel::Η => {
// The letter η (eta) is allowed to retain a tonos when it forms a single-letter word to distinguish
// the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή).
//
// A lone η with an accent other than the oxia/tonos is not expected,
// so there is no need to special-case the oxia/tonos.
// The ancient ᾖ (exist.PRS.SUBJ.3s) has an iota subscript as well as the circumflex,
// so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
// (e.g. να είναι) since Byzantine times anyway.
if diacritics.accented
&& !context.followed_by_cased_letter(self)
&& !context.preceded_by_cased_letter(self)
&& !diacritics.ypogegrammeni
{
if precomposed_diacritics.accented {
sink.write_char('Ή')?;
// Write the base of the uppercased combining character sequence.
// In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed.
// In some branches the base has a precomposed diacritic.
// In the case of the Greek disjunctive "or", a combining tonos may also be written.
match vowel {
GreekVowel::Η => {
// The letter η (eta) is allowed to retain a tonos when it forms a single-letter word to distinguish
// the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή).
//
// A lone η with an accent other than the oxia/tonos is not expected,
// so there is no need to special-case the oxia/tonos.
// The ancient ᾖ (exist.PRS.SUBJ.3s) has an iota subscript as well as the circumflex,
// so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
// (e.g. να είναι) since Byzantine times anyway.
if diacritics.accented
&& !context.followed_by_cased_letter(self)
&& !context.preceded_by_cased_letter(self)
&& !diacritics.ypogegrammeni
{
if precomposed_diacritics.accented {
sink.write_char('Ή')?;
} else {
sink.write_char('Η')?;
sink.write_char(greek_to_me::TONOS)?;
}
} else {
sink.write_char('Η')?;
sink.write_char(greek_to_me::TONOS)?;
}
} else {
sink.write_char('Η')?;
}
GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϊ'
} else {
vowel.into()
})?,
GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϋ'
} else {
vowel.into()
})?,
_ => sink.write_char(vowel.into())?,
};
if diacritics.dialytika {
sink.write_char(greek_to_me::DIALYTIKA)?;
}
if precomposed_diacritics.ypogegrammeni {
sink.write_char('Ι')?;
}
GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϊ'
} else {
vowel.into()
})?,
GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϋ'
} else {
vowel.into()
})?,
_ => sink.write_char(vowel.into())?,
};
if diacritics.dialytika {
sink.write_char(greek_to_me::DIALYTIKA)?;

return Ok(());
}
if precomposed_diacritics.ypogegrammeni {
sink.write_char('Ι')?;
Some(GreekPrecomposedLetterData::Consonant(true)) => {
sink.write_char(greek_to_me::CAPITAL_RHO)?;
return Ok(());
}

return Ok(());
_ => (),
}
}

Expand Down
9 changes: 8 additions & 1 deletion components/casemap/tests/gen_greek_to_me.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ fn main() {
panic!("Found character {ch} that has diacritics but is not a Greek vowel");
}
}
greek_to_me::diacritics!(BREATHING_AND_LENGTH)
| greek_to_me::diacritics!(ACCENTS) => {
if let Some(GreekPrecomposedLetterData::Consonant(false)) = data {
panic!("Found character {ch} that has diacritics but is not a Greek vowel");
}
}
// Ignore all small letters
'\u{1D00}'..='\u{1DBF}' | '\u{AB65}' => (),
// caps: [[:Grek:]&[:L:]-[\u1D00-\u1DBF\uAB65]] . NFD, remove non-letters, uppercase
Expand All @@ -87,7 +93,8 @@ fn main() {
GreekDiacritics::default(),
))
} else {
data = Some(GreekPrecomposedLetterData::Consonant)
let is_rho = uppercased == greek_to_me::CAPITAL_RHO;
data = Some(GreekPrecomposedLetterData::Consonant(is_rho))
};
}
_ => (),
Expand Down

0 comments on commit c686348

Please sign in to comment.