Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

whitespace normalization and processing instructions #75

Merged
merged 9 commits into from
Nov 23, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/parser/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ fn parse_children(iter: &mut Iter<'_>, parent: &mut Element) -> Result<()> {
}
} else {
let text = parse_text(iter)?;
parent.add_node(Node::Text(text));
if !text.is_empty() {
parent.add_node(Node::Text(text));
}
}
// some parsing functions may return with the iter pointing to the last thing that was part
// of their construct, while others might advance the iter to the next char *after* the
Expand Down
11 changes: 6 additions & 5 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use std::collections::HashMap;
use std::iter::Peekable;
use std::path::Path;
use std::str::Chars;

use crate::error::{display_char, parse_err, Error, ParseError, Result, ThrowSite, XmlSite};
use crate::parser::chars::{is_name_char, is_name_start_char};
use crate::parser::element::parse_element;
use crate::parser::pi::{parse_pi, parse_pi_logic};
use crate::{Declaration, Document, Encoding, Misc, Version};
use std::path::Path;

mod chars;
mod element;
Expand Down Expand Up @@ -320,20 +320,21 @@ fn parse_document(iter: &mut Iter<'_>, document: &mut Document) -> Result<()> {
// the values found into the mutable document parameter
fn parse_declaration_pi(iter: &mut Iter<'_>, document: &mut Document) -> Result<()> {
state_must_be_before_declaration(iter)?;
let (target, instructions) = parse_pi_logic(iter)?;
document.set_declaration(parse_declaration(&target, &instructions)?);
let (target, data) = parse_pi_logic(iter)?;
document.set_declaration(parse_declaration(&target, &data)?);
Ok(())
}

fn parse_declaration(target: &str, instructions: &[String]) -> Result<Declaration> {
fn parse_declaration(target: &str, data: &str) -> Result<Declaration> {
let mut declaration = Declaration::default();
if target != "xml" {
return raise!("pi_data.target != xml");
}
let instructions: Vec<&str> = data.split_whitespace().collect();
if instructions.len() > 2 {
return raise!("");
webern marked this conversation as resolved.
Show resolved Hide resolved
}
let map = parse_as_map(instructions)?;
let map = parse_as_map(&instructions)?;
if let Some(&val) = map.get("version") {
match val {
"1.0" => {
Expand Down
256 changes: 110 additions & 146 deletions src/parser/pi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,173 +2,137 @@ use crate::error::Result;
use crate::parser::Iter;
use crate::PI;

use super::chars::{is_name_char, is_name_start_char};

#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
enum PIStatus {
BeforeTarget,
InsideTarget,
AfterTarget,
AfterInstruction,
QuestionMark,
Close,
}
/// The iter should be pointing to the opening `<` of a processing instruction.
pub(crate) fn parse_pi_logic(iter: &mut Iter<'_>) -> Result<(String, String)> {
expect!(iter, '<')?;
iter.advance_or_die()?;
expect!(iter, '?')?;
iter.advance_or_die()?;

impl Default for PIStatus {
fn default() -> Self {
PIStatus::BeforeTarget
// handle the special case <??>
if iter.is('?') {
iter.advance_or_die()?;
expect!(iter, '>')?;
iter.advance();
return Ok(("".into(), "".into()));
}
}

#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Default)]
struct PIProcessor {
status: PIStatus,
target: String,
instructions: Vec<String>,
let target = parse_pi_target(iter)?;
let mut data = String::new();
loop {
if iter.is('?') && iter.peek_is('>') {
iter.advance_or_die()?;
iter.advance();
break;
}
data.push(iter.st.c);
iter.advance_or_die()?;
}
Ok((target, data))
}

/// The iter should be pointing to the opening `<` of a processing instruction.
pub(crate) fn parse_pi_logic(iter: &mut Iter<'_>) -> Result<(String, Vec<String>)> {
expect!(iter, '<')?;
iter.advance_or_die()?;
expect!(iter, '?')?;
/// Must be a valid name terminated by whitespace.
fn parse_pi_target(iter: &mut Iter<'_>) -> Result<String> {
if !iter.is_name_start_char() {
return parse_err!(iter, "expected name start char, found '{}'", iter.st.c);
}
let mut name = String::new();
name.push(iter.st.c);
iter.advance_or_die()?;
let mut processor = PIProcessor::default();
loop {
take_processing_instruction_char(iter, &mut processor)?;
if processor.status == PIStatus::Close {
if iter.is_whitespace() {
iter.advance_or_die()?;
break;
} else if iter.is('?') {
// e.g. <?target??
break;
} else if !iter.is_name_char() {
return parse_err!(iter, "expected name char, found '{}'", iter.st.c);
} else {
name.push(iter.st.c);
}
iter.advance_or_die()?;
}
Ok((processor.target, processor.instructions))
Ok(name)
}

/// The iter should be pointing to the opening `<` of a processing instruction.
pub(crate) fn parse_pi(iter: &mut Iter<'_>) -> Result<PI> {
let (target, instructions) = parse_pi_logic(iter)?;
Ok(PI {
target,
instructions,
})
let (target, data) = parse_pi_logic(iter)?;
Ok(PI { target, data })
}

fn take_processing_instruction_char(
iter: &mut Iter<'_>,
processor: &mut PIProcessor,
) -> Result<()> {
match processor.status {
PIStatus::BeforeTarget => {
if !is_name_start_char(iter.st.c) {
return parse_err!(iter);
} else {
processor.target.push(iter.st.c);
processor.status = PIStatus::InsideTarget;
}
}
PIStatus::InsideTarget => {
if iter.st.c.is_ascii_whitespace() {
processor.status = PIStatus::AfterTarget;
} else if !is_name_char(iter.st.c) {
return parse_err!(iter);
} else {
processor.target.push(iter.st.c);
}
}
PIStatus::AfterTarget | PIStatus::AfterInstruction => {
if iter.st.c == '?' {
processor.status = PIStatus::QuestionMark;
} else if !iter.is_whitespace() {
let instruction = parse_pi_string(iter)?;
processor.instructions.push(instruction);
if iter.is('?') {
processor.status = PIStatus::QuestionMark;
} else if !iter.is_whitespace() {
return parse_err!(iter);
} else {
processor.status = PIStatus::AfterInstruction;
}
}
}
PIStatus::QuestionMark => {
if iter.st.c == '>' {
processor.status = PIStatus::Close;
} else {
return parse_err!(iter);
}
}
PIStatus::Close => { /* done */ }
}
Ok(())
////////////////////////////////////////////////////////////////////////////////////////////////////

#[test]
fn parse_pi_easy() {
    // plain case: a target, a single space, then data, with nothing after the closing `?>`
    let mut cursor = Iter::new("<?target data?>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("target", parsed.target);
    assert_eq!("data", parsed.data);
    // the processing instruction is the entire input, so a further advance is expected to fail
    assert!(!cursor.advance());
}

fn is_pi_close(iter: &mut Iter<'_>) -> Result<bool> {
Ok(iter.is('?') && iter.peek_or_die()? == '>')
#[test]
fn parse_pi_peasy() {
    // same as the plain case, but with one trailing character after the closing `?>`
    let mut cursor = Iter::new("<?target data?>X").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("target", parsed.target);
    assert_eq!("data", parsed.data);
    // the parser must leave the iter on the first character after the `?>`
    assert!(cursor.is('X'));
}

fn parse_pi_string(iter: &mut Iter<'_>) -> Result<String> {
let mut buf = String::new();
loop {
if iter.is_whitespace() || is_pi_close(iter)? {
return Ok(buf);
} else {
buf.push(iter.st.c);
}
if !iter.advance() {
break;
}
}
Ok(buf)
#[test]
fn parse_pi_funky_1() {
    // `? >` has a space between the two characters, so it is not a closing delimiter and the
    // processing instruction is never terminated
    let mut cursor = Iter::new("<?pi some data ? >").unwrap();
    assert!(parse_pi(&mut cursor).is_err());
}

#[test]
fn parse_pi_string_test() {
struct TestCase {
input: &'static str,
want: &'static str,
iter: char,
}
let test_cases = vec![
TestCase {
input: "bloop bleep",
want: "bloop",
iter: ' ',
},
TestCase {
input: "bloop?bleep",
want: "bloop?bleep",
iter: 'p',
},
TestCase {
input: "bloop?>bleep",
want: "bloop",
iter: '?',
},
TestCase {
input: "beer🍺🍺🍺 🍺🍺?>",
want: "beer🍺🍺🍺",
iter: ' ',
},
TestCase {
input: "beer🍺🍺🍺🍺🍺",
want: "beer🍺🍺🍺🍺🍺",
iter: '🍺',
},
];
for test_case in &test_cases {
let mut iter = Iter::new(test_case.input).unwrap();
let got = parse_pi_string(&mut iter).unwrap();
assert_eq!(
got.as_str(),
test_case.want,
"parse_pi_string(\"{}\") returned '{}', expected '{}'",
test_case.input,
got.as_str(),
test_case.want
);
assert_eq!(
iter.st.c, test_case.iter,
"expected iter to be pointing at '{}', got '{}'",
test_case.iter, iter.st.c
);
}
fn parse_pi_funky_2() {
    // the special case `<??>` parses successfully with an empty target and no data
    let mut cursor = Iter::new("<??>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("", parsed.target);
    assert!(parsed.data.is_empty());
}

#[test]
fn parse_pi_funky_3() {
    // established as not-well-formed by jclark_not_wf_sa_003.xml
    let mut cursor = Iter::new("<? ?>").unwrap();
    assert!(parse_pi(&mut cursor).is_err());
}

#[test]
fn parse_pi_funky_4() {
    // the opening `<` must be followed immediately by `?`; here a space separates them
    let mut cursor = Iter::new("< ? ? >").unwrap();
    assert!(parse_pi(&mut cursor).is_err());
}

#[test]
fn parse_pi_funky_5() {
    // a target may be followed directly by the closing `?>`, leaving the data empty
    let mut cursor = Iter::new("<?bones?>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("bones", parsed.target);
    assert!(parsed.data.is_empty());
}

#[test]
fn parse_pi_funky_6() {
    // this is from jclark_valid_sa_017.xml
    // only an adjacent `?>` closes the instruction, so `? >` and `<?` end up inside the data
    let mut cursor = Iter::new("<?pi some data ? > <??>").unwrap();
    let parsed = parse_pi(&mut cursor).unwrap();
    assert_eq!("pi", parsed.target);
    assert_eq!("some data ? > <?", parsed.data);
}
22 changes: 21 additions & 1 deletion src/parser/string.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::error::Result;
use crate::parser::Iter;
use crate::xdoc::is_whitespace;

#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub(crate) enum StringType {
Expand All @@ -16,16 +17,35 @@ pub(crate) enum StringType {

/// Parse a string that terminates based on some character(s) determined by `string_type`.
pub(crate) fn parse_string(iter: &mut Iter<'_>, string_type: StringType) -> Result<String> {
let mut space_buffer = None;
let mut is_non_white_reached = false;
let mut result = String::new();
while !is_end_char(iter, string_type) {
if iter.st.c == '&' {
let c = parse_escape(iter)?;
result.push(c);
if is_whitespace(c) {
if is_non_white_reached {
space_buffer = Some(' ');
}
} else {
is_non_white_reached = true;
result.push(c);
}
} else if is_forbidden(iter, string_type) {
return parse_err!(iter, "forbidden character in {:?} string", string_type);
} else if iter.is_whitespace() {
if is_non_white_reached {
space_buffer = Some(' ');
}
} else {
is_non_white_reached = true;
if let Some(space) = space_buffer {
result.push(space);
space_buffer = None;
}
result.push(iter.st.c);
}

if !iter.advance() {
return parse_err!(
iter,
Expand Down
10 changes: 0 additions & 10 deletions src/xdoc/chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,3 @@
pub fn is_whitespace(c: char) -> bool {
    // exactly four characters count: space, tab, carriage return, and line feed
    matches!(c, ' ' | '\t' | '\r' | '\n')
}

/// contains `(#x20 | #x9 | #xD | #xA)`, i.e. space, tab, carriage return, or line feed.
pub fn contains_whitespace<S: AsRef<str>>(s: S) -> bool {
for c in s.as_ref().chars() {
if is_whitespace(c) {
return true;
}
}
false
}
Loading