Skip to content

Commit

Permalink
whitespace normalization and processing instructions (#75)
Browse files Browse the repository at this point in the history
Normalize whitespace of text nodes, ignores considerations for #55
and treats all nodes as if 'whitespace collapse' were in effect.

Improve processing instructions #12
  • Loading branch information
webern authored Nov 23, 2020
1 parent 867bbcd commit 55a8e29
Show file tree
Hide file tree
Showing 47 changed files with 802 additions and 247 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ Currently we are using v0.0.x where every version can and will contain breaking

## [Unreleased]
### Added
- Nothing yet.
- Whitespace normalization of text nodes [#75]

### Changed
- Make `Element` struct members private [#74]
- Improve processing instructions [#75]

[#74]: https://github.com/webern/exile/pull/74
[#75]: https://github.com/webern/exile/pull/75

## [v0.0.2] - 2020-11-15
### Added
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ Currently supported:
- Text Nodes
- Processing Instructions
- UTF-8
- Whitespace Normalization

Not Supported:
- Entities
- Entity References
- Doctypes
- Comment Parsing
- Other Encodings
- Whitespace Preservation
- Whitespace Preservation: All text nodes are treated as if whitespace `collapse` were in effect.

## Example

Expand Down
3 changes: 2 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ Currently supported:
- Text Nodes
- Processing Instructions
- UTF-8
- Whitespace Normalization
Not Supported:
- Entities
- Entity References
- Doctypes
- Comment Parsing
- Other Encodings
- Whitespace Preservation
- Whitespace Preservation: All text nodes are treated as if whitespace `collapse` were in effect.
# Example
Expand Down
4 changes: 3 additions & 1 deletion src/parser/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ fn parse_children(iter: &mut Iter<'_>, parent: &mut Element) -> Result<()> {
}
} else {
let text = parse_text(iter)?;
parent.add_node(Node::Text(text));
if !text.is_empty() {
parent.add_node(Node::Text(text));
}
}
// some parsing functions may return with the iter pointing to the last thing that was part
// of their construct, while others might advance the iter to the next char *after* the
Expand Down
16 changes: 10 additions & 6 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use std::collections::HashMap;
use std::iter::Peekable;
use std::path::Path;
use std::str::Chars;

use crate::error::{display_char, parse_err, Error, ParseError, Result, ThrowSite, XmlSite};
use crate::parser::chars::{is_name_char, is_name_start_char};
use crate::parser::element::parse_element;
use crate::parser::pi::{parse_pi, parse_pi_logic};
use crate::{Declaration, Document, Encoding, Misc, Version};
use std::path::Path;

mod chars;
mod element;
Expand Down Expand Up @@ -320,20 +320,24 @@ fn parse_document(iter: &mut Iter<'_>, document: &mut Document) -> Result<()> {
// the values found into the mutable document parameter
fn parse_declaration_pi(iter: &mut Iter<'_>, document: &mut Document) -> Result<()> {
state_must_be_before_declaration(iter)?;
let (target, instructions) = parse_pi_logic(iter)?;
document.set_declaration(parse_declaration(&target, &instructions)?);
let (target, data) = parse_pi_logic(iter)?;
document.set_declaration(parse_declaration(&target, &data)?);
Ok(())
}

fn parse_declaration(target: &str, instructions: &[String]) -> Result<Declaration> {
fn parse_declaration(target: &str, data: &str) -> Result<Declaration> {
let mut declaration = Declaration::default();
if target != "xml" {
return raise!("pi_data.target != xml");
}
let instructions: Vec<&str> = data.split_whitespace().collect();
if instructions.len() > 2 {
return raise!("");
return raise!(
"only able to parse xml declarations that include version and encoding. \
a string split of the xml processing instruction data yielded more than two items."
);
}
let map = parse_as_map(instructions)?;
let map = parse_as_map(&instructions)?;
if let Some(&val) = map.get("version") {
match val {
"1.0" => {
Expand Down
256 changes: 110 additions & 146 deletions src/parser/pi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,173 +2,137 @@ use crate::error::Result;
use crate::parser::Iter;
use crate::PI;

use super::chars::{is_name_char, is_name_start_char};

#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
enum PIStatus {
BeforeTarget,
InsideTarget,
AfterTarget,
AfterInstruction,
QuestionMark,
Close,
}
/// The iter should be pointing to the opening `<` of a processing instruction.
pub(crate) fn parse_pi_logic(iter: &mut Iter<'_>) -> Result<(String, String)> {
expect!(iter, '<')?;
iter.advance_or_die()?;
expect!(iter, '?')?;
iter.advance_or_die()?;

impl Default for PIStatus {
fn default() -> Self {
PIStatus::BeforeTarget
// handle the special case <??>
if iter.is('?') {
iter.advance_or_die()?;
expect!(iter, '>')?;
iter.advance();
return Ok(("".into(), "".into()));
}
}

#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Default)]
struct PIProcessor {
status: PIStatus,
target: String,
instructions: Vec<String>,
let target = parse_pi_target(iter)?;
let mut data = String::new();
loop {
if iter.is('?') && iter.peek_is('>') {
iter.advance_or_die()?;
iter.advance();
break;
}
data.push(iter.st.c);
iter.advance_or_die()?;
}
Ok((target, data))
}

/// The iter should be pointing to the opening `<` of a processing instruction.
pub(crate) fn parse_pi_logic(iter: &mut Iter<'_>) -> Result<(String, Vec<String>)> {
expect!(iter, '<')?;
iter.advance_or_die()?;
expect!(iter, '?')?;
/// Must be a valid name terminated by whitespace.
fn parse_pi_target(iter: &mut Iter<'_>) -> Result<String> {
if !iter.is_name_start_char() {
return parse_err!(iter, "expected name start char, found '{}'", iter.st.c);
}
let mut name = String::new();
name.push(iter.st.c);
iter.advance_or_die()?;
let mut processor = PIProcessor::default();
loop {
take_processing_instruction_char(iter, &mut processor)?;
if processor.status == PIStatus::Close {
if iter.is_whitespace() {
iter.advance_or_die()?;
break;
} else if iter.is('?') {
// e.g. <?target??
break;
} else if !iter.is_name_char() {
return parse_err!(iter, "expected name char, found '{}'", iter.st.c);
} else {
name.push(iter.st.c);
}
iter.advance_or_die()?;
}
Ok((processor.target, processor.instructions))
Ok(name)
}

/// The iter should be pointing to the opening `<` of a processing instruction.
pub(crate) fn parse_pi(iter: &mut Iter<'_>) -> Result<PI> {
let (target, instructions) = parse_pi_logic(iter)?;
Ok(PI {
target,
instructions,
})
let (target, data) = parse_pi_logic(iter)?;
Ok(PI { target, data })
}

fn take_processing_instruction_char(
iter: &mut Iter<'_>,
processor: &mut PIProcessor,
) -> Result<()> {
match processor.status {
PIStatus::BeforeTarget => {
if !is_name_start_char(iter.st.c) {
return parse_err!(iter);
} else {
processor.target.push(iter.st.c);
processor.status = PIStatus::InsideTarget;
}
}
PIStatus::InsideTarget => {
if iter.st.c.is_ascii_whitespace() {
processor.status = PIStatus::AfterTarget;
} else if !is_name_char(iter.st.c) {
return parse_err!(iter);
} else {
processor.target.push(iter.st.c);
}
}
PIStatus::AfterTarget | PIStatus::AfterInstruction => {
if iter.st.c == '?' {
processor.status = PIStatus::QuestionMark;
} else if !iter.is_whitespace() {
let instruction = parse_pi_string(iter)?;
processor.instructions.push(instruction);
if iter.is('?') {
processor.status = PIStatus::QuestionMark;
} else if !iter.is_whitespace() {
return parse_err!(iter);
} else {
processor.status = PIStatus::AfterInstruction;
}
}
}
PIStatus::QuestionMark => {
if iter.st.c == '>' {
processor.status = PIStatus::Close;
} else {
return parse_err!(iter);
}
}
PIStatus::Close => { /* done */ }
}
Ok(())
////////////////////////////////////////////////////////////////////////////////////////////////////

/// Happy path: a processing instruction with a target and one word of data that ends exactly
/// where the input ends.
#[test]
fn parse_pi_easy() {
    let mut cursor = Iter::new("<?target data?>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("target", parsed.target);
    assert_eq!("data", parsed.data);
    // nothing follows the closing `?>`, so the iterator must report that it cannot advance
    let advanced = cursor.advance();
    assert!(!advanced);
}

fn is_pi_close(iter: &mut Iter<'_>) -> Result<bool> {
Ok(iter.is('?') && iter.peek_or_die()? == '>')
/// Same processing instruction as the easy case, but followed by one more character: after
/// parsing, the iterator must be left pointing at that trailing character.
#[test]
fn parse_pi_peasy() {
    let mut cursor = Iter::new("<?target data?>X").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("target", parsed.target);
    assert_eq!("data", parsed.data);
    // the parser consumed through `?>` and stopped on the first char after it
    let on_trailing_char = cursor.is('X');
    assert!(on_trailing_char);
}

fn parse_pi_string(iter: &mut Iter<'_>) -> Result<String> {
let mut buf = String::new();
loop {
if iter.is_whitespace() || is_pi_close(iter)? {
return Ok(buf);
} else {
buf.push(iter.st.c);
}
if !iter.advance() {
break;
}
}
Ok(buf)
/// The input contains `? >` (with a space) but never a literal `?>`, so the processing
/// instruction is never closed and parsing is expected to fail.
#[test]
fn parse_pi_funky_1() {
    let mut cursor = Iter::new("<?pi some data ? >").unwrap();
    let outcome = parse_pi(&mut cursor);
    assert!(outcome.is_err());
}

#[test]
fn parse_pi_string_test() {
struct TestCase {
input: &'static str,
want: &'static str,
iter: char,
}
let test_cases = vec![
TestCase {
input: "bloop bleep",
want: "bloop",
iter: ' ',
},
TestCase {
input: "bloop?bleep",
want: "bloop?bleep",
iter: 'p',
},
TestCase {
input: "bloop?>bleep",
want: "bloop",
iter: '?',
},
TestCase {
input: "beer🍺🍺🍺 🍺🍺?>",
want: "beer🍺🍺🍺",
iter: ' ',
},
TestCase {
input: "beer🍺🍺🍺🍺🍺",
want: "beer🍺🍺🍺🍺🍺",
iter: '🍺',
},
];
for test_case in &test_cases {
let mut iter = Iter::new(test_case.input).unwrap();
let got = parse_pi_string(&mut iter).unwrap();
assert_eq!(
got.as_str(),
test_case.want,
"parse_pi_string(\"{}\") returned '{}', expected '{}'",
test_case.input,
got.as_str(),
test_case.want
);
assert_eq!(
iter.st.c, test_case.iter,
"expected iter to be pointing at '{}', got '{}'",
test_case.iter, iter.st.c
);
}
// `<??>` is accepted as a special case: it parses successfully and yields an empty target and
// empty data.
fn parse_pi_funky_2() {
    let mut cursor = Iter::new("<??>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("", parsed.target);
    assert!(parsed.data.is_empty());
}

/// A processing instruction whose target position holds only whitespace must be rejected.
#[test]
fn parse_pi_funky_3() {
    // established as not-well-formed by jclark_not_wf_sa_003.xml
    let mut cursor = Iter::new("<? ?>").unwrap();
    let outcome = parse_pi(&mut cursor);
    assert!(outcome.is_err());
}

/// Whitespace between the characters of the `<?` and `?>` delimiters is expected to make the
/// input fail to parse as a processing instruction.
#[test]
fn parse_pi_funky_4() {
    let mut cursor = Iter::new("< ? ? >").unwrap();
    let outcome = parse_pi(&mut cursor);
    assert!(outcome.is_err());
}

/// A target followed immediately by `?>` (no whitespace, no data) parses successfully and
/// produces empty data.
#[test]
fn parse_pi_funky_5() {
    let mut cursor = Iter::new("<?bones?>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("bones", parsed.target);
    assert!(parsed.data.is_empty());
}

/// The data section runs up to the first literal `?>`; a lone `?`, a lone `>` and even a `<?`
/// that appear before it are all kept as part of the data.
#[test]
fn parse_pi_funky_6() {
    // this is from jclark_valid_sa_017.xml
    let mut cursor = Iter::new("<?pi some data ? > <??>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("pi", parsed.target);
    assert_eq!("some data ? > <?", parsed.data);
}
Loading

0 comments on commit 55a8e29

Please sign in to comment.