Skip to content

Commit

Permalink
[lib] Implement L1 rule
Browse files Browse the repository at this point in the history
To be able to implement L1, we need access to more information from
`BidiInfo`, namely `original_classes` of the `text`, in `visual_runs()`,
which would mean it should pass through `reorder_line()`.

The fact that information from `BidiInfo` is needed for both steps of
the public API (generating `BidiInfo` and consuming it
per-paragraph/per-level) made me change the API design and move these
methods into `impl BidiInfo`.

Then, since we needed access to `text` for every `BidiInfo` consumption,
I added a reference to `text` to `BidiInfo`, which also enables more
compile-time checks for `BidiInfo` isntance not outliving the text in
the user code.

NOTE: We are already breaking API in version 0.3.0 and paving for full
spec support is a good reason to do so, IMHO.

The L1 rule works by one pass on the text of the line.

Conformance Test: this implementation reduces the number of failures
from 60494 to 23770 (out of total 256747 cases).

Fix #2
  • Loading branch information
behnam committed May 12, 2017
1 parent 19d9858 commit 6427532
Show file tree
Hide file tree
Showing 8 changed files with 462 additions and 357 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
This software was written by the following people:

Matt Brubeck <mbrubeck@limpet.net>
Behnam Esfahbod <behnam@zwnj.org>
9 changes: 8 additions & 1 deletion src/char_data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ pub fn bidi_class(c: char) -> BidiClass {
bsearch_range_value_table(c, bidi_class_table)
}

pub fn is_rtl(bidi_class: BidiClass) -> bool {
match bidi_class {
RLE | RLO | RLI => true,
_ => false,
}
}

fn bsearch_range_value_table(c: char, r: &'static [(char, char, BidiClass)]) -> BidiClass {
match r.binary_search_by(
|&(lo, hi, _)| if lo <= c && c <= hi {
Expand All @@ -45,7 +52,7 @@ fn bsearch_range_value_table(c: char, r: &'static [(char, char, BidiClass)]) ->
}

#[cfg(test)]
mod test {
mod tests {
use super::*;

#[test]
Expand Down
27 changes: 11 additions & 16 deletions src/explicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@
//!
//! http://www.unicode.org/reports/tr9/#Explicit_Levels_and_Directions

use super::char_data::BidiClass;
use super::char_data::{BidiClass, is_rtl};
use super::level::Level;

use BidiClass::*;

/// Compute explicit embedding levels for one paragraph of text (X1-X8).
///
/// `classes[i]` must contain the BidiClass of the char at byte index `i`,
/// `processing_classes[i]` must contain the BidiClass of the char at byte index `i`,
/// for each char in `text`.
pub fn compute(
text: &str,
para_level: Level,
initial_classes: &[BidiClass],
levels: &mut [Level],
classes: &mut [BidiClass],
processing_classes: &mut [BidiClass],
) {
assert!(text.len() == initial_classes.len());

Expand All @@ -41,13 +41,8 @@ pub fn compute(
match initial_classes[i] {
// Rules X2-X5c
RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
let char_is_rtl = match initial_classes[i] {
RLE | RLO | RLI => true,
_ => false,
};

let last_level = stack.last().level;
let new_level = if char_is_rtl {
let new_level = if is_rtl(initial_classes[i]) {
last_level.new_explicit_next_rtl()
} else {
last_level.new_explicit_next_ltr()
Expand All @@ -58,8 +53,8 @@ pub fn compute(
if is_isolate {
levels[i] = last_level;
match stack.last().status {
OverrideStatus::RTL => classes[i] = R,
OverrideStatus::LTR => classes[i] = L,
OverrideStatus::RTL => processing_classes[i] = R,
OverrideStatus::LTR => processing_classes[i] = L,
_ => {}
}
}
Expand Down Expand Up @@ -108,8 +103,8 @@ pub fn compute(
let last = stack.last();
levels[i] = last.level;
match last.status {
OverrideStatus::RTL => classes[i] = R,
OverrideStatus::LTR => classes[i] = L,
OverrideStatus::RTL => processing_classes[i] = R,
OverrideStatus::LTR => processing_classes[i] = L,
_ => {}
}
}
Expand All @@ -135,16 +130,16 @@ pub fn compute(
let last = stack.last();
levels[i] = last.level;
match last.status {
OverrideStatus::RTL => classes[i] = R,
OverrideStatus::LTR => classes[i] = L,
OverrideStatus::RTL => processing_classes[i] = R,
OverrideStatus::LTR => processing_classes[i] = L,
_ => {}
}
}
}
// Handle multi-byte characters.
for j in 1..c.len_utf8() {
levels[i + j] = levels[i];
classes[i + j] = classes[i];
processing_classes[i + j] = processing_classes[i];
}
}
}
Expand Down
44 changes: 22 additions & 22 deletions src/implicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use BidiClass::*;
/// 3.3.4 Resolving Weak Types
///
/// http://www.unicode.org/reports/tr9/#Resolving_Weak_Types
pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass]) {
pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [BidiClass]) {
// FIXME (#8): This function applies steps W1-W6 in a single pass. This can produce
// incorrect results in cases where a "later" rule changes the value of `prev_class` seen
// by an "earlier" rule. We should either split this into separate passes, or preserve
Expand All @@ -41,38 +41,38 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
.flat_map(id as fn(LevelRun) -> LevelRun);

while let Some(i) = indices.next() {
match classes[i] {
match processing_classes[i] {
// http://www.unicode.org/reports/tr9/#W1
NSM => {
classes[i] = match prev_class {
processing_classes[i] = match prev_class {
RLI | LRI | FSI | PDI => ON,
_ => prev_class,
};
}
EN => {
if last_strong_is_al {
// W2. If previous strong char was AL, change EN to AN.
classes[i] = AN;
processing_classes[i] = AN;
} else {
// W5. If a run of ETs is adjacent to an EN, change the ETs to EN.
for j in &et_run_indices {
classes[*j] = EN;
processing_classes[*j] = EN;
}
et_run_indices.clear();
}
}
// http://www.unicode.org/reports/tr9/#W3
AL => classes[i] = R,
AL => processing_classes[i] = R,

// http://www.unicode.org/reports/tr9/#W4
ES | CS => {
let next_class = indices
.clone()
.map(|j| classes[j])
.map(|j| processing_classes[j])
.filter(not_removed_by_x9)
.next()
.unwrap_or(sequence.eos);
classes[i] = match (prev_class, classes[i], next_class) {
processing_classes[i] = match (prev_class, processing_classes[i], next_class) {
(EN, ES, EN) | (EN, CS, EN) => EN,
(AN, CS, AN) => AN,
(_, _, _) => ON,
Expand All @@ -81,7 +81,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
// http://www.unicode.org/reports/tr9/#W5
ET => {
match prev_class {
EN => classes[i] = EN,
EN => processing_classes[i] = EN,
_ => et_run_indices.push(i), // In case this is followed by an EN.
}
}
Expand All @@ -92,7 +92,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
}
}

prev_class = classes[i];
prev_class = processing_classes[i];
match prev_class {
L | R => {
last_strong_is_al = false;
Expand All @@ -105,7 +105,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
if prev_class != ET {
// W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
for j in &et_run_indices {
classes[*j] = ON;
processing_classes[*j] = ON;
}
et_run_indices.clear();
}
Expand All @@ -115,9 +115,9 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
let mut last_strong_is_l = sequence.sos == L;
for run in &sequence.runs {
for i in run.clone() {
match classes[i] {
match processing_classes[i] {
EN if last_strong_is_l => {
classes[i] = L;
processing_classes[i] = L;
}
L => {
last_strong_is_l = true;
Expand All @@ -137,7 +137,7 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, classes: &mut [BidiClass])
pub fn resolve_neutral(
sequence: &IsolatingRunSequence,
levels: &[Level],
classes: &mut [BidiClass],
processing_classes: &mut [BidiClass],
) {
let mut indices = sequence.runs.iter().flat_map(Clone::clone);
let mut prev_class = sequence.sos;
Expand All @@ -154,18 +154,18 @@ pub fn resolve_neutral(

// Process sequences of NI characters.
let mut ni_run = Vec::new();
if ni(classes[i]) {
if ni(processing_classes[i]) {
// Consume a run of consecutive NI characters.
ni_run.push(i);
let mut next_class;
loop {
match indices.next() {
Some(j) => {
i = j;
if removed_by_x9(classes[i]) {
if removed_by_x9(processing_classes[i]) {
continue;
}
next_class = classes[j];
next_class = processing_classes[j];
if ni(next_class) {
ni_run.push(i);
} else {
Expand All @@ -187,11 +187,11 @@ pub fn resolve_neutral(
(_, _) => levels[i].bidi_class(),
};
for j in &ni_run {
classes[*j] = new_class;
processing_classes[*j] = new_class;
}
ni_run.clear();
}
prev_class = classes[i];
prev_class = processing_classes[i];
}
}

Expand All @@ -200,12 +200,12 @@ pub fn resolve_neutral(
/// Returns the maximum embedding level in the paragraph.
///
/// http://www.unicode.org/reports/tr9/#Resolving_Implicit_Levels
pub fn resolve_levels(classes: &[BidiClass], levels: &mut [Level]) -> Level {
pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> Level {
let mut max_level = Level::ltr();

assert!(classes.len() == levels.len());
assert!(original_classes.len() == levels.len());
for i in 0..levels.len() {
match (levels[i].is_rtl(), classes[i]) {
match (levels[i].is_rtl(), original_classes[i]) {
// http://www.unicode.org/reports/tr9/#I1
(false, R) => levels[i].raise(1).expect("Level number error"),
(false, AN) | (false, EN) => levels[i].raise(2).expect("Level number error"),
Expand Down
2 changes: 1 addition & 1 deletion src/level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ impl<'a> PartialEq<&'a str> for Level {
}

#[cfg(test)]
mod test {
mod tests {
use super::*;

#[test]
Expand Down
Loading

0 comments on commit 6427532

Please sign in to comment.