diff --git a/src/tree/c14n.rs b/src/tree/c14n.rs new file mode 100644 index 000000000..3f69ff395 --- /dev/null +++ b/src/tree/c14n.rs @@ -0,0 +1,44 @@ +//! Shared canonicalization logic and types. +//! +use std::ffi::c_int; + +use crate::bindings::{ + xmlC14NMode_XML_C14N_1_0, xmlC14NMode_XML_C14N_1_1, xmlC14NMode_XML_C14N_EXCLUSIVE_1_0, +}; + +/// Options for configuring how to canonicalize XML +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)] +pub struct CanonicalizationOptions { + /// Canonicalization specification to use + pub mode: CanonicalizationMode, + /// If true, keep `` comments, otherwise remove + pub with_comments: bool, + /// Namespaces to keep even if they are unused. By default, in [CanonicalizationMode::ExclusiveCanonical1_0], unused namespaces are removed. + /// + /// Doesn't apply to other canonicalization modes. + pub inclusive_ns_prefixes: Vec, +} + +/// Canonicalization specification to use +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)] +pub enum CanonicalizationMode { + /// Original C14N 1.0 spec + Canonical1_0, + /// Exclusive C14N 1.0 spec + #[default] + ExclusiveCanonical1_0, + /// C14N 1.1 spec + Canonical1_1, +} + +impl From for c_int { + fn from(mode: CanonicalizationMode) -> Self { + let c14n_mode = match mode { + CanonicalizationMode::Canonical1_0 => xmlC14NMode_XML_C14N_1_0, + CanonicalizationMode::ExclusiveCanonical1_0 => xmlC14NMode_XML_C14N_EXCLUSIVE_1_0, + CanonicalizationMode::Canonical1_1 => xmlC14NMode_XML_C14N_1_1, + }; + + c_int::from(c14n_mode as i32) + } +} diff --git a/src/tree/document.rs b/src/tree/document.rs index 69ffe9aaf..2931c2edf 100644 --- a/src/tree/document.rs +++ b/src/tree/document.rs @@ -343,3 +343,5 @@ impl Document { Ok(()) } } + +mod c14n; diff --git a/src/tree/document/c14n.rs b/src/tree/document/c14n.rs new file mode 100644 index 000000000..305825117 --- /dev/null +++ b/src/tree/document/c14n.rs @@ -0,0 +1,111 @@ +//! Document canonicalization logic +//! +use std::ffi::{c_int, c_void, CString}; +use std::os::raw; +use std::ptr::null_mut; + +use crate::tree::c14n::*; + +use super::{ + xmlAllocOutputBuffer, xmlC14NExecute, xmlC14NIsVisibleCallback, xmlChar, xmlNodePtr, + xmlOutputBufferClose, xmlOutputBufferPtr, Document, +}; + +impl Document { + /// Canonicalize a document and return the results. + pub fn canonicalize( + &self, + options: CanonicalizationOptions, + callback: Option<(xmlNodePtr, xmlC14NIsVisibleCallback)>, + ) -> Result { + let document = (*self.0).borrow().doc_ptr; + + let mut ns_list_c = to_xml_string_vec(options.inclusive_ns_prefixes); + let inclusive_ns_prefixes = ns_list_c.as_mut_ptr(); + let with_comments = c_int::from(options.with_comments); + + let (is_visible_callback, user_data) = if let Some((node_ptr, visibility_callback)) = callback { + (visibility_callback, node_ptr as *mut _) + } else { + (None, null_mut()) + }; + + let mode = options.mode.into(); + unsafe { + let c_obuf = create_output_buffer(); + + let status = xmlC14NExecute( + document, + is_visible_callback, + user_data, + mode, + inclusive_ns_prefixes, + with_comments, + c_obuf, + ); + + let res = c_obuf_into_output(c_obuf); + + if status < 0 { + Err(()) + } else { + Ok(res) + } + } + } +} + +unsafe fn c_obuf_into_output(c_obuf: xmlOutputBufferPtr) -> String { + let ctx_ptr = (*c_obuf).context; + let output = Box::from_raw(ctx_ptr as *mut String); + + (*c_obuf).context = std::ptr::null_mut::(); + + xmlOutputBufferClose(c_obuf); + + *output +} + +unsafe fn create_output_buffer() -> xmlOutputBufferPtr { + let output = String::new(); + let ctx_ptr = Box::into_raw(Box::new(output)); + let encoder = std::ptr::null_mut(); + + let buf = xmlAllocOutputBuffer(encoder); + + (*buf).writecallback = Some(xml_write_io); + (*buf).closecallback = Some(xml_close_io); + (*buf).context = ctx_ptr as _; + + buf +} + +unsafe extern "C" fn xml_close_io(_context: *mut raw::c_void) -> raw::c_int { + 0 +} + +unsafe extern "C" fn xml_write_io( + io_ptr: *mut raw::c_void, + buffer: *const raw::c_char, + len: raw::c_int, +) -> raw::c_int { + if io_ptr.is_null() { + 0 + } else { + let buf = std::slice::from_raw_parts_mut(buffer as *mut u8, len as usize); + let buf = String::from_utf8_lossy(buf); + let s2_ptr = io_ptr as *mut String; + String::push_str(&mut *s2_ptr, &buf); + + len + } +} + +/// Create a [Vec] of null-terminated [*mut xmlChar] strings +fn to_xml_string_vec(vec: Vec) -> Vec<*mut xmlChar> { + vec + .into_iter() + .map(|s| CString::new(s).unwrap().into_raw() as *mut xmlChar) + .chain(std::iter::once(std::ptr::null_mut())) + .collect() +} diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 228d08db4..11ffb2e37 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -1,6 +1,7 @@ //! The tree functionality //! +pub mod c14n; pub mod document; pub mod namespace; pub mod node; diff --git a/src/tree/node.rs b/src/tree/node.rs index 09a7b9f2a..48c711333 100644 --- a/src/tree/node.rs +++ b/src/tree/node.rs @@ -1054,6 +1054,34 @@ impl Node { context.findnodes(xpath, Some(self)) } + /// Search this node for XPath `path`, and return only the first match. + pub fn at_xpath(&self, path: &str, ns_binlings: &[(&str, &str)]) -> Result, ()> { + let mut context = Context::from_node(self)?; + for (prefix, href) in ns_binlings { + context.register_namespace(prefix, href)?; + } + let nodes = context.findnodes(path, Some(self))?; + + Ok(nodes.first().cloned()) + } + + /// Get a list of ancestor Node for this Node. + pub fn ancestors(&self) -> Vec { + let node_ptr = self.node_ptr(); + + let mut res = Vec::new(); + + let ancestor_ptrs = node_ancestors(node_ptr); + + for ptr in ancestor_ptrs { + if let Some(node) = self.ptr_as_option(ptr) { + res.push(node) + } + } + + res + } + /// find String values via xpath, at a specified node or the document root pub fn findvalues(&self, xpath: &str) -> Result, ()> { let mut context = Context::from_node(self)?; @@ -1100,3 +1128,26 @@ impl Node { } } } + +fn node_ancestors(node_ptr: xmlNodePtr) -> Vec { + if node_ptr.is_null() { + return Vec::new(); + } + + let mut parent_ptr = xmlGetParent(node_ptr); + + if parent_ptr.is_null() { + Vec::new() + } else { + let mut parents = vec![parent_ptr]; + + while !xmlGetParent(parent_ptr).is_null() { + parent_ptr = xmlGetParent(parent_ptr); + parents.push(parent_ptr); + } + + parents + } +} + +mod c14n; diff --git a/src/tree/node/c14n.rs b/src/tree/node/c14n.rs new file mode 100644 index 000000000..afd15c8dd --- /dev/null +++ b/src/tree/node/c14n.rs @@ -0,0 +1,58 @@ +//! Node canonicalization logic +//! +use std::ffi::c_void; + +use crate::{ + bindings::{xmlC14NIsVisibleCallback, xmlNodePtr}, + c_helpers::xmlGetNodeType, + tree::{c14n::*, Node}, +}; + +use super::node_ancestors; + +impl Node { + /// Canonicalize a document and return the results. + pub fn canonicalize(&mut self, options: CanonicalizationOptions) -> Result { + let doc_ref = self.get_docref().upgrade().unwrap(); + let document = crate::tree::Document(doc_ref); + + let user_data = self.node_ptr_mut().unwrap(); + let callback: xmlC14NIsVisibleCallback = Some(callback_wrapper); + + document.canonicalize(options, Some((user_data, callback))) + } +} + +unsafe extern "C" fn callback_wrapper( + c14n_root_ptr: *mut c_void, + node_ptr: xmlNodePtr, + parent_ptr: xmlNodePtr, +) -> ::std::os::raw::c_int { + let c14n_root_ptr = c14n_root_ptr as xmlNodePtr; + let node_type = xmlGetNodeType(node_ptr); + + let tn_ptr = if NODE_TYPES.contains(&node_type) { + node_ptr + } else { + parent_ptr + }; + + let tn_ancestors = node_ancestors(tn_ptr); + + let ret = (tn_ptr == c14n_root_ptr) || tn_ancestors.contains(&c14n_root_ptr); + if ret { + 1 + } else { + 0 + } +} + +const NODE_TYPES: [u32; 7] = [ + super::xmlElementType_XML_ELEMENT_NODE, + super::xmlElementType_XML_ATTRIBUTE_NODE, + super::xmlElementType_XML_DOCUMENT_TYPE_NODE, + super::xmlElementType_XML_TEXT_NODE, + super::xmlElementType_XML_DTD_NODE, + super::xmlElementType_XML_PI_NODE, + super::xmlElementType_XML_COMMENT_NODE, +]; diff --git a/tests/c14n.rs b/tests/c14n.rs new file mode 100644 index 000000000..be379fafa --- /dev/null +++ b/tests/c14n.rs @@ -0,0 +1,243 @@ +use libxml::parser::Parser; +use libxml::tree::c14n::{CanonicalizationMode, CanonicalizationOptions}; + +fn canonicalize_xml(input: &str, opts: CanonicalizationOptions) -> String { + let parser = Parser::default(); + let doc = parser.parse_string(input).unwrap(); + + doc.canonicalize(opts, None).unwrap() +} + +#[test] +fn canonical_1_1_example_3_1_no_comment() { + let input = include_str!("resources/canonical_1_1/3_1_input.xml"); + let expected = include_str!("resources/canonical_1_1/3_1_output_no_comment.xml"); + + let canonicalized = canonicalize_xml( + input, + CanonicalizationOptions { + mode: CanonicalizationMode::Canonical1_1, + with_comments: false, + inclusive_ns_prefixes: vec![], + }, + ); + assert_eq!(canonicalized, expected) +} + +#[test] +fn canonical_1_1_example_3_2() { + let input = include_str!("resources/canonical_1_1/3_2_input.xml"); + let expected = include_str!("resources/canonical_1_1/3_2_output.xml"); + + let canonicalized = canonicalize_xml( + input, + CanonicalizationOptions { + mode: CanonicalizationMode::Canonical1_1, + with_comments: true, + inclusive_ns_prefixes: vec![], + }, + ); + + // for some reason, we get a stray \n at end of file :/ + assert_eq!(canonicalized, expected.trim()) +} + +#[test] +fn canonical_exclusive_example_1() { + let input = include_str!("resources/canonical_exclusive/1_input.xml"); + let expected = include_str!("resources/canonical_exclusive/1_output.xml"); + + let canonicalized = canonicalize_xml( + input, + CanonicalizationOptions { + mode: CanonicalizationMode::ExclusiveCanonical1_0, + with_comments: true, + inclusive_ns_prefixes: vec![], + }, + ); + + // for some reason, we get a stray \n at end of file :/ + assert_eq!(canonicalized, expected.trim()) +} + +#[test] +fn canonical_exclusive_example_2() { + let input = include_str!("resources/canonical_exclusive/2_input.xml"); + let expected = include_str!("resources/canonical_exclusive/2_output.xml"); + + let canonicalized = canonicalize_xml( + input, + CanonicalizationOptions { + mode: CanonicalizationMode::ExclusiveCanonical1_0, + with_comments: true, + inclusive_ns_prefixes: ["stay1".to_string(), "stay2".to_string()].to_vec(), + }, + ); + + // for some reason, we get a stray \n at end of file :/ + assert_eq!(canonicalized, expected.trim()) +} + +#[test] +fn test_c14n_node() { + let xml = ""; + let doc = Parser::default().parse_string(xml).unwrap(); + let mut node = doc.as_node().findnodes("//b").unwrap().pop().unwrap(); + + let c14n = node.canonicalize(opts()).unwrap(); + + assert_eq!("", c14n) +} + +#[test] +fn test_c14n_modes() { + // http://www.w3.org/TR/xml-exc-c14n/#sec-Enveloping + + let doc1 = Parser::default() + .parse_string( + r#" + + + + + + + "#, + ) + .unwrap(); + + let mut node1 = doc1 + .as_node() + .at_xpath("//n1:elem2", &[("n1", "http://example.net")]) + .unwrap() + .unwrap(); + + let doc2 = Parser::default() + .parse_string( + r#" + + + + + + + "#, + ) + .unwrap(); + let mut node2 = doc2 + .as_node() + .at_xpath("//n1:elem2", &[("n1", "http://example.net")]) + .unwrap() + .unwrap(); + + let expected = r#" + + + + "#.trim(); + let c14n = node1.canonicalize(opts()).unwrap(); + assert_eq!(expected, c14n); + + let expected = r#" + + + + + "#.trim(); + let c14n = node2.canonicalize(opts()).unwrap(); + assert_eq!(expected, c14n); + + let opts = CanonicalizationOptions { + mode: CanonicalizationMode::Canonical1_0, + ..Default::default() + }; + let c14n = node2.canonicalize(opts).unwrap(); + assert_eq!(expected, c14n); + + let expected = r#" + + + + "# + .trim(); + let c14n = node1 + .canonicalize(CanonicalizationOptions { + mode: CanonicalizationMode::ExclusiveCanonical1_0, + ..Default::default() + }) + .unwrap(); + + assert_eq!(expected, c14n); + + let expected = r#" + + + + + "# + .trim(); + let c14n = node2 + .canonicalize(CanonicalizationOptions { + mode: CanonicalizationMode::ExclusiveCanonical1_0, + ..Default::default() + }) + .unwrap(); + assert_eq!(expected, c14n); + + let expected = r#" + + + + + "# + .trim(); + let c14n = node2 + .canonicalize(CanonicalizationOptions { + mode: CanonicalizationMode::ExclusiveCanonical1_0, + inclusive_ns_prefixes: vec!["n2".into()], + ..Default::default() + }) + .unwrap(); + assert_eq!(expected, c14n); + + let expected = r#" + + + + + "#.trim(); + let c14n = node2 + .canonicalize(CanonicalizationOptions { + mode: CanonicalizationMode::ExclusiveCanonical1_0, + inclusive_ns_prefixes: vec!["n2".into(), "n4".into()], + ..Default::default() + }) + .unwrap(); + assert_eq!(expected, c14n); + + let expected = r#" + + + + + "#.trim(); + let c14n = node2 + .canonicalize(CanonicalizationOptions { + mode: CanonicalizationMode::Canonical1_1, + ..Default::default() + }) + .unwrap(); + assert_eq!(expected, c14n); +} + +fn opts() -> CanonicalizationOptions { + CanonicalizationOptions { + mode: CanonicalizationMode::Canonical1_1, + with_comments: false, + inclusive_ns_prefixes: vec![], + } +} diff --git a/tests/resources/canonical_1_1/3_1_input.xml b/tests/resources/canonical_1_1/3_1_input.xml new file mode 100644 index 000000000..5ac5735fe --- /dev/null +++ b/tests/resources/canonical_1_1/3_1_input.xml @@ -0,0 +1,14 @@ + + + + + + +Hello, world! + + + + + + \ No newline at end of file diff --git a/tests/resources/canonical_1_1/3_1_output.xml b/tests/resources/canonical_1_1/3_1_output.xml new file mode 100644 index 000000000..199e93890 --- /dev/null +++ b/tests/resources/canonical_1_1/3_1_output.xml @@ -0,0 +1,6 @@ + +Hello, world! + + + \ No newline at end of file diff --git a/tests/resources/canonical_1_1/3_1_output_no_comment.xml b/tests/resources/canonical_1_1/3_1_output_no_comment.xml new file mode 100644 index 000000000..bcff11569 --- /dev/null +++ b/tests/resources/canonical_1_1/3_1_output_no_comment.xml @@ -0,0 +1,4 @@ + +Hello, world! + \ No newline at end of file diff --git a/tests/resources/canonical_1_1/3_2_input.xml b/tests/resources/canonical_1_1/3_2_input.xml new file mode 100644 index 000000000..9869cc443 --- /dev/null +++ b/tests/resources/canonical_1_1/3_2_input.xml @@ -0,0 +1,12 @@ + + + A B + + A + + B + A B + C + + + diff --git a/tests/resources/canonical_1_1/3_2_output.xml b/tests/resources/canonical_1_1/3_2_output.xml new file mode 100644 index 000000000..74eeea147 --- /dev/null +++ b/tests/resources/canonical_1_1/3_2_output.xml @@ -0,0 +1,11 @@ + + + A B + + A + + B + A B + C + + diff --git a/tests/resources/canonical_1_1/Readme.md b/tests/resources/canonical_1_1/Readme.md new file mode 100644 index 000000000..99da73891 --- /dev/null +++ b/tests/resources/canonical_1_1/Readme.md @@ -0,0 +1,3 @@ +Examples taken from spec: + +https://www.w3.org/TR/xml-c14n11/#Examples \ No newline at end of file diff --git a/tests/resources/canonical_exclusive/1_input.xml b/tests/resources/canonical_exclusive/1_input.xml new file mode 100644 index 000000000..52c4b8b76 --- /dev/null +++ b/tests/resources/canonical_exclusive/1_input.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/canonical_exclusive/1_output.xml b/tests/resources/canonical_exclusive/1_output.xml new file mode 100644 index 000000000..0a0b3507f --- /dev/null +++ b/tests/resources/canonical_exclusive/1_output.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/canonical_exclusive/2_input.xml b/tests/resources/canonical_exclusive/2_input.xml new file mode 100644 index 000000000..46bb7423e --- /dev/null +++ b/tests/resources/canonical_exclusive/2_input.xml @@ -0,0 +1,3 @@ + + + diff --git a/tests/resources/canonical_exclusive/2_output.xml b/tests/resources/canonical_exclusive/2_output.xml new file mode 100644 index 000000000..31c5db0d7 --- /dev/null +++ b/tests/resources/canonical_exclusive/2_output.xml @@ -0,0 +1,3 @@ + + + diff --git a/tests/resources/canonical_exclusive/Readme.md b/tests/resources/canonical_exclusive/Readme.md new file mode 100644 index 000000000..a243a24be --- /dev/null +++ b/tests/resources/canonical_exclusive/Readme.md @@ -0,0 +1,4 @@ +Test samples taken from: + +- https://xmlstar.sourceforge.net/doc/UG/ch04s06.html +- https://www.ietf.org/rfc/rfc3741.txt (adapted) diff --git a/tests/resources/staff.xml b/tests/resources/staff.xml new file mode 100644 index 000000000..acebb31d9 --- /dev/null +++ b/tests/resources/staff.xml @@ -0,0 +1,59 @@ + + + + + Element data"> + + + + + + +]> + + + + EMP0001 + Margaret Martin + Accountant + 56,000 + Female +
1230 North Ave. Dallas, Texas 98551
+
+ + EMP0002 + Martha Raynolds + + Secretary + 35,000 + Female +
&ent2; Dallas, &ent3; + 98554
+
+ + EMP0003 + Roger + Jones + Department Manager + 100,000 + &ent4; +
PO Box 27 Irving, texas 98553
+
+ + EMP0004 + Jeny Oconnor + Personnel Director + 95,000 + Female +
27 South Road. Dallas, Texas 98556
+
+ + EMP0005 + Robert Myers + Computer Specialist + 90,000 + male +
1821 Nordic. Road, Irving Texas 98558
+
+