Skip to content

Commit

Permalink
Add all fixups for retaining explicits
Browse files Browse the repository at this point in the history
  • Loading branch information
Manishearth committed Dec 20, 2022
1 parent 8c9781b commit 98c05a8
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 14 deletions.
22 changes: 17 additions & 5 deletions src/explicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,17 @@ pub fn compute(
RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
let last_level = stack.last().level;

// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
levels[i] = last_level;

// X5a-X5c: Isolate initiators get the level of the last entry on the stack.
let is_isolate = match original_classes[i] {
RLI | LRI | FSI => true,
_ => false,
};
if is_isolate {
levels[i] = last_level;
// Redundant due to "Retaining explicit formatting characters" step
// levels[i] = last_level;
match stack.last().status {
OverrideStatus::RTL => processing_classes[i] = R,
OverrideStatus::LTR => processing_classes[i] = L,
Expand Down Expand Up @@ -90,10 +94,17 @@ pub fn compute(
} else if overflow_isolate_count == 0 {
overflow_embedding_count += 1;
}

if !is_isolate {
// X9 +
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
processing_classes[i] = BN;
}
}

// <http://www.unicode.org/reports/tr9/#X6a>
PDI => {
// The BN is from <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
PDI | BN => {
if overflow_isolate_count > 0 {
overflow_isolate_count -= 1;
} else if valid_isolate_count > 0 {
Expand Down Expand Up @@ -132,13 +143,14 @@ pub fn compute(
if stack.last().status != OverrideStatus::Isolate && stack.vec.len() >= 2 {
stack.vec.pop();
}
// The spec doesn't explicitly mention this step, but it is necessary.
// See the reference implementations for comparison.
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
levels[i] = stack.last().level;
processing_classes[i] = BN;
}

// Nothing
B | BN => {}
// BN case moved up with PDI, see <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
B => {}

// <http://www.unicode.org/reports/tr9/#X6>
_ => {
Expand Down
64 changes: 56 additions & 8 deletions src/implicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ pub fn resolve_weak(
let mut prev_class = sequence.sos;
let mut last_strong_is_al = false;
let mut et_run_indices = Vec::new(); // for W5
let mut bn_run_indices = Vec::new(); // for W5 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>

// Like `sequence.runs.iter().flat_map(Clone::clone)`, but makes `indices` itself cloneable.
fn id(x: LevelRun) -> LevelRun {
x
Expand Down Expand Up @@ -88,6 +90,7 @@ pub fn resolve_weak(
.clone()
.skip(ch.len_utf8() - 1)
.map(|j| processing_classes[j])
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
.find(not_removed_by_x9)
.unwrap_or(sequence.eos);
if next_class == EN && last_strong_is_al {
Expand All @@ -99,6 +102,25 @@ pub fn resolve_weak(
(EN, ES, EN) | (EN, CS, EN) => EN,
(AN, CS, AN) => AN,
(_, _, _) => ON,
};

// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
// We have to do this before W5 processes these characters; otherwise W5 would
// mistake them for part of an ET run.
if processing_classes[i] == ON {
for class in processing_classes[..i]
.iter_mut()
.rev()
.take_while(|c| **c == BN)
{
*class = ON;
}
for class in processing_classes[(i + ch.len_utf8() - 1)..]
.iter_mut()
.take_while(|c| **c == BN)
{
*class = ON;
}
}
} else {
// we're in the middle of a character, copy over work done for previous bytes
Expand All @@ -110,16 +132,30 @@ pub fn resolve_weak(
ET => {
match prev_class {
EN => processing_classes[i] = EN,
_ => et_run_indices.push(i), // In case this is followed by an EN.
_ => {
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
// if there was a BN run before this, that's now a part of this ET run
et_run_indices.extend(&bn_run_indices);

// In case this is followed by an EN.
et_run_indices.push(i);
}
}
}
class => {
if removed_by_x9(class) {
continue;
}
BN => {
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
// keeps track of bn runs for W5 in case we see an ET
bn_run_indices.push(i);
// skips over BNs for W1
continue;
}
_ => {}
}

// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
// A BN never gets this far (the BN arm of the match above `continue`s), so reaching this point ends any pending BN run.
bn_run_indices.clear();

prev_class = processing_classes[i];
match w2_processing_class {
L | R => {
Expand Down Expand Up @@ -153,6 +189,8 @@ pub fn resolve_weak(
R | AL => {
last_strong_is_l = false;
}
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
// already scanning past BN here
_ => {}
}
}
Expand Down Expand Up @@ -286,22 +324,30 @@ pub fn resolve_neutral<D: BidiDataSource>(
for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] {
*class = class_to_set;
}
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
for class in processing_classes[..pair.start]
.iter_mut()
.rev()
.take_while(|c| **c == BN)
{
*class = class_to_set;
}
// > Any number of characters that had original bidirectional character type NSM prior to the application of
// > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.

// This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
// about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
let nsm_start = pair.start + start_len_utf8;
for (idx, class) in original_classes[nsm_start..].iter().enumerate() {
if *class == BidiClass::NSM {
if *class == BidiClass::NSM || processing_classes[nsm_start + idx] == BN {
processing_classes[nsm_start + idx] = class_to_set;
} else {
break;
}
}
let nsm_end = pair.end + end_len_utf8;
for (idx, class) in original_classes[nsm_end..].iter().enumerate() {
if *class == BidiClass::NSM {
if *class == BidiClass::NSM || processing_classes[nsm_end + idx] == BN {
processing_classes[nsm_end + idx] = class_to_set;
} else {
break;
Expand All @@ -321,7 +367,8 @@ pub fn resolve_neutral<D: BidiDataSource>(
while let Some(mut i) = indices.next() {
// Process sequences of NI characters.
let mut ni_run = Vec::new();
if is_NI(processing_classes[i]) {
// The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
if is_NI(processing_classes[i]) || processing_classes[i] == BN {
// Consume a run of consecutive NI characters.
ni_run.push(i);
let mut next_class;
Expand Down Expand Up @@ -469,6 +516,7 @@ pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> L
(false, R) | (true, L) | (true, EN) | (true, AN) => {
levels[i].raise(1).expect("Level number error")
}
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> handled here
(_, _) => {}
}
max_level = max(max_level, levels[i]);
Expand Down
2 changes: 1 addition & 1 deletion tests/conformance_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ fn gen_base_levels_for_base_tests(bitset: u8) -> Vec<Option<Level>> {
}

#[test]
#[should_panic(expected = "29 test cases failed! (91678 passed)")]
#[should_panic(expected = "19 test cases failed! (91688 passed)")]
fn test_character_conformance() {
let test_data = include_str!("data/BidiCharacterTest.txt");

Expand Down

0 comments on commit 98c05a8

Please sign in to comment.