Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add c14n for node and document #138

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/tree/c14n.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
//! Shared canonicalization logic and types.
//!
use std::ffi::c_int;

use crate::bindings::{
xmlC14NMode_XML_C14N_1_0, xmlC14NMode_XML_C14N_1_1, xmlC14NMode_XML_C14N_EXCLUSIVE_1_0,
};

/// Options for configuring how to canonicalize XML
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)]
pub struct CanonicalizationOptions {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With inclusive_ns_prefixes as part of the CanonicalizationMode (see below), there are only two options left. For those, I'd rather have them as arguments to the functions instead of having an options struct.

/// Canonicalization specification to use
pub mode: CanonicalizationMode,
/// If true, keep `<!-- ... -->` comments, otherwise remove
pub with_comments: bool,
/// Namespaces to keep even if they are unused. By default, in [CanonicalizationMode::ExclusiveCanonical1_0], unused namespaces are removed.
///
/// Doesn't apply to other canonicalization modes.
pub inclusive_ns_prefixes: Vec<String>,
}

/// Canonicalization specification to use
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had implemented this as follows:

/// Canonicalization mode for [`Document.c14n`]
pub enum C14NMode {
  /// [XML_C14N_1_0](https://www.w3.org/TR/2001/REC-xml-c14n-20010315)
  Mode1_0,
  /// [XML_C14N_1_1](https://www.w3.org/TR/xml-c14n11/)
  Mode1_1,
  /// [XML_C14N_EXCLUSIVE_1_0](https://www.w3.org/TR/xml-exc-c14n/)
  ModeExclusive1_0(Vec<CString>),
}
impl C14NMode {
  fn as_c_int(&self) -> c_int {
    match self {
      C14NMode::Mode1_0 => xmlC14NMode_XML_C14N_1_0 as c_int,
      C14NMode::Mode1_1 => xmlC14NMode_XML_C14N_1_1 as c_int,
      C14NMode::ModeExclusive1_0(_) => xmlC14NMode_XML_C14N_EXCLUSIVE_1_0 as c_int,
    }
  }
}

It doesn't map the C API 1:1, but it makes it clear that the inclusive_ns_prefixes only apply to the exclusive c14n mode.

pub enum CanonicalizationMode {
/// Original C14N 1.0 spec
Canonical1_0,
/// Exclusive C14N 1.0 spec
#[default]
ExclusiveCanonical1_0,
/// C14N 1.1 spec
Canonical1_1,
}

/// Converts a [`CanonicalizationMode`] into the numeric value of the matching
/// libxml2 `xmlC14NMode_*` constant, as a `c_int`.
impl From<CanonicalizationMode> for c_int {
  /// Returns the `xmlC14NMode_*` binding constant that corresponds to `mode`.
  fn from(mode: CanonicalizationMode) -> Self {
    let c14n_mode = match mode {
      CanonicalizationMode::Canonical1_0 => xmlC14NMode_XML_C14N_1_0,
      CanonicalizationMode::ExclusiveCanonical1_0 => xmlC14NMode_XML_C14N_EXCLUSIVE_1_0,
      CanonicalizationMode::Canonical1_1 => xmlC14NMode_XML_C14N_1_1,
    };

    // Cast directly to `c_int`: going through `i32` first only compiles where
    // `c_int` happens to be `i32` and adds nothing on the platforms where it is.
    c14n_mode as c_int
  }
}
2 changes: 2 additions & 0 deletions src/tree/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,3 +343,5 @@ impl Document {
Ok(())
}
}

mod c14n;
111 changes: 111 additions & 0 deletions src/tree/document/c14n.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
//! Document canonicalization logic
//!
use std::ffi::{c_int, c_void, CString};
use std::os::raw;
use std::ptr::null_mut;

use crate::tree::c14n::*;

use super::{
xmlAllocOutputBuffer, xmlC14NExecute, xmlC14NIsVisibleCallback, xmlChar, xmlNodePtr,
xmlOutputBufferClose, xmlOutputBufferPtr, Document,
};

impl Document {
/// Canonicalize a document and return the results.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This docstring could use more information or a link to the libxml2 documentation (which isn't that great either). The callback parameter is not obvious and could benefit from an example.

pub fn canonicalize(
&self,
options: CanonicalizationOptions,
callback: Option<(xmlNodePtr, xmlC14NIsVisibleCallback)>,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had implemented the callback functionality as follows:

type IsVisibleCallback = Box<dyn Fn(&RoNode, &RoNode) -> bool>;

// thin pointer wrapper that calls the supplied callback instead
unsafe extern "C" fn _is_visible_wrapper(
  data: *mut c_void,
  node: xmlNodePtr,
  parent: xmlNodePtr,
) -> c_int {
  let callback = unsafe { &mut *(data as *mut IsVisibleCallback) };
 // handling of parent nodes etc.
 // …
 (callback)(&RoNode(node), &RoNode(parent))) as c_int
}

impl Document{
  /// Canonicalizes the XML document according to the W3C XML Canonicalization specification.
  ///
  /// This method produces a canonical form of the XML document, which is useful for digital signatures
  /// and document comparison. The canonicalization process ensures consistent representation of the XML content.
    pub fn c14n(
    &self,
    mode: C14NMode,
    with_comments: bool,
  ) -> Result<String, ()> {
    self.c14n_with_visibility_callback(None, mode, with_comments)
  }

  /// Canonicalizes the document with an optional visibility callback
  ///
  /// `is_visible_callback(node: &RoNode, parent: &RoNode)` is called for every
  /// node having a parent, returning true if the node should be included in the
  /// canonicalized output.
  pub fn c14n_with_visibility_callback(
    &self,
    is_visible_callback: Option<IsVisibleCallback>,
    mode: C14NMode,
    with_comments: bool,
  ) -> Result<String, ()> {
      // boxes the callback so it can be passed as a void pointer to [`_is_visible_wrapper`]
      let (is_visible_fn, mut user_data) = match is_visible_callback {
        Some(f) => (
          Some(_is_visible_wrapper as unsafe extern "C" fn(_, _, _) -> _),
          Some(Box::into_raw(f)),
        ),
        None => (None, None),
      };
      let c14n_res = xmlC14NExecute(
        self.doc_ptr(),
        is_visible_fn,
        user_data
          .as_mut()
          .map(|s| ptr::from_mut(s))
          .unwrap_or(ptr::null_mut()) as *mut c_void,
        mode.as_c_int(),
        inclusive_ns_prefixes,
        with_comments as c_int,
        xmlOutputBufferCreateBuffer(buffer, ptr::null_mut()),
      );
      // …
  }
}

Usage looks like this:

let input = r#"<ns1:root><ns2:foo x="1"  a="2"/><!--cmt--><a/><b/></ns1:root>"#;
let callback = |_node: &RoNode, _parent: &RoNode| {
    !(_parent.get_name() == "ns1:root" && _node.get_name() == "a")
  };
  let c14n_result = doc.c14n_with_visibility_callback(
    Some(Box::new(callback)),
    libxml::tree::document::C14NMode::Mode1_1,
    false,
  );

It's a lot more flexible, but (I assume) also a lot more complex once the ancestor nodes are handled properly, as in your PR.

) -> Result<String, ()> {
let document = (*self.0).borrow().doc_ptr;

let mut ns_list_c = to_xml_string_vec(options.inclusive_ns_prefixes);
let inclusive_ns_prefixes = ns_list_c.as_mut_ptr();
let with_comments = c_int::from(options.with_comments);

let (is_visible_callback, user_data) = if let Some((node_ptr, visibility_callback)) = callback {
(visibility_callback, node_ptr as *mut _)
} else {
(None, null_mut())
};

let mode = options.mode.into();
unsafe {
let c_obuf = create_output_buffer();

let status = xmlC14NExecute(
document,
is_visible_callback,
user_data,
mode,
inclusive_ns_prefixes,
with_comments,
c_obuf,
);

let res = c_obuf_into_output(c_obuf);

if status < 0 {
Err(())
} else {
Ok(res)
}
}
}
}

unsafe fn c_obuf_into_output(c_obuf: xmlOutputBufferPtr) -> String {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My first approach looked like this:

let buffer = xmlBufferCreate();
let c14n_res = xmlC14NExecute(,
        xmlOutputBufferCreateBuffer(buffer, ptr::null_mut()),
      );
let result = xmlBufferContent(buffer);
let c_string = CStr::from_ptr(result as *const c_char);
let node_string = c_string.to_string_lossy().
xmlBufferFree(
Ok(node_string)

It's shorter and needs fewer additional functions, but I'm probably missing some edge cases / needed error handling.

let ctx_ptr = (*c_obuf).context;
let output = Box::from_raw(ctx_ptr as *mut String);

(*c_obuf).context = std::ptr::null_mut::<c_void>();

xmlOutputBufferClose(c_obuf);

*output
}

/// Allocates a libxml2 output buffer whose written data is collected into a
/// heap-allocated [`String`].
///
/// The buffer is obtained from `xmlAllocOutputBuffer` with a null encoder. Its
/// write and close callbacks are set to [`xml_write_io`] and [`xml_close_io`],
/// and its `context` is set to a raw pointer produced by
/// `Box::into_raw(Box::new(String::new()))`.
///
/// # Safety
///
/// The `String` behind the returned buffer's `context` is owned by that raw
/// pointer; the caller must reclaim it (e.g. with `Box::from_raw`) and close
/// the buffer, otherwise both are leaked.
///
/// # Panics
///
/// Panics if `xmlAllocOutputBuffer` returns a null pointer.
unsafe fn create_output_buffer() -> xmlOutputBufferPtr {
  let encoder = std::ptr::null_mut();
  let buf = xmlAllocOutputBuffer(encoder);

  // Fail loudly instead of writing through a null pointer below. The check
  // happens before the `String` is boxed so nothing is leaked on this path.
  assert!(
    !buf.is_null(),
    "xmlAllocOutputBuffer returned a null pointer"
  );

  // Ownership of the collecting `String` moves into the raw context pointer.
  let ctx_ptr = Box::into_raw(Box::new(String::new()));

  (*buf).writecallback = Some(xml_write_io);
  (*buf).closecallback = Some(xml_close_io);
  (*buf).context = ctx_ptr as _;

  buf
}

/// Close callback installed on the output buffer by `create_output_buffer`.
///
/// It ignores its context argument, releases nothing, and always returns `0`.
unsafe extern "C" fn xml_close_io(_context: *mut raw::c_void) -> raw::c_int {
  const CLOSE_RESULT: raw::c_int = 0;
  CLOSE_RESULT
}

/// Write callback installed on the output buffer: appends `len` bytes read
/// from `buffer` to the [`String`] that `io_ptr` points to.
///
/// The bytes are decoded with [`String::from_utf8_lossy`], so invalid UTF-8
/// sequences are replaced with U+FFFD.
///
/// NOTE(review): every call is decoded on its own, so a multi-byte character
/// split across two calls would be replaced as well. Confirm the writer never
/// flushes mid-character, or collect raw bytes and decode once at the end.
///
/// # Returns
///
/// * `len` when the bytes were appended,
/// * `0` when `io_ptr` is null (nothing is written) or `len` is zero,
/// * `-1` when `len` is negative, or `buffer` is null while `len` is positive.
///
/// # Safety
///
/// `io_ptr` must be null or point to a valid `String` that is not accessed
/// elsewhere for the duration of the call, and `buffer` must be valid for
/// reads of `len` bytes.
unsafe extern "C" fn xml_write_io(
  io_ptr: *mut raw::c_void,
  buffer: *const raw::c_char,
  len: raw::c_int,
) -> raw::c_int {
  if io_ptr.is_null() {
    return 0;
  }
  // A negative length would wrap to a huge `usize`; a null source cannot be read.
  if len < 0 || (buffer.is_null() && len > 0) {
    return -1;
  }
  if len == 0 {
    return 0;
  }

  // The data is only read, so borrow it immutably: `buffer` is a `*const`
  // pointer and must not be turned into a mutable slice.
  // `len` is known to be positive here, so the cast to `usize` is lossless.
  let bytes = std::slice::from_raw_parts(buffer as *const u8, len as usize);
  let sink = &mut *(io_ptr as *mut String);
  sink.push_str(&String::from_utf8_lossy(bytes));

  len
}

/// Create a [Vec] of null-terminated [*mut xmlChar] strings
fn to_xml_string_vec(vec: Vec<String>) -> Vec<*mut xmlChar> {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Depending on the audience it might be feasible to require the inclusive namespaces to be a Vec<CString>. After all, the namespaces to keep will most likely be hardcoded anyways.

vec
.into_iter()
.map(|s| CString::new(s).unwrap().into_raw() as *mut xmlChar)
.chain(std::iter::once(std::ptr::null_mut()))
.collect()
}
1 change: 1 addition & 0 deletions src/tree/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! The tree functionality
//!

pub mod c14n;
pub mod document;
pub mod namespace;
pub mod node;
Expand Down
51 changes: 51 additions & 0 deletions src/tree/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1054,6 +1054,34 @@ impl Node {
context.findnodes(xpath, Some(self))
}

/// Search this node for XPath `path`, and return only the first match.
///
/// Each `(prefix, href)` pair in `ns_bindings` is registered as a namespace on
/// the XPath context before `path` is evaluated relative to this node.
///
/// Returns `Ok(None)` when the expression matches no node.
///
/// # Errors
///
/// Returns `Err(())` if the context cannot be created from this node, if
/// registering one of the namespaces fails, or if evaluating `path` fails.
pub fn at_xpath(&self, path: &str, ns_bindings: &[(&str, &str)]) -> Result<Option<Node>, ()> {
  let mut context = Context::from_node(self)?;
  for (prefix, href) in ns_bindings {
    context.register_namespace(prefix, href)?;
  }
  let nodes = context.findnodes(path, Some(self))?;

  // Take the first match by value instead of cloning it out of the vector.
  Ok(nodes.into_iter().next())
}

/// Collect the ancestors of this Node, starting with its parent and walking
/// upwards until no further parent exists.
///
/// Ancestor pointers for which `ptr_as_option` yields `None` are skipped.
pub fn ancestors(&self) -> Vec<Node> {
  node_ancestors(self.node_ptr())
    .into_iter()
    .filter_map(|ancestor_ptr| self.ptr_as_option(ancestor_ptr))
    .collect()
}

/// find String values via xpath, at a specified node or the document root
pub fn findvalues(&self, xpath: &str) -> Result<Vec<String>, ()> {
let mut context = Context::from_node(self)?;
Expand Down Expand Up @@ -1100,3 +1128,26 @@ impl Node {
}
}
}

/// Returns the chain of parent pointers of `node_ptr`, nearest parent first.
///
/// The walk follows `xmlGetParent` until it yields a null pointer. A null
/// `node_ptr`, or a node without a parent, produces an empty vector.
fn node_ancestors(node_ptr: xmlNodePtr) -> Vec<xmlNodePtr> {
  let mut parents = Vec::new();
  if node_ptr.is_null() {
    return parents;
  }

  // Look up each parent exactly once: the pointer fetched for the loop test is
  // the same one that gets recorded and then used for the next step.
  let mut current = xmlGetParent(node_ptr);
  while !current.is_null() {
    parents.push(current);
    current = xmlGetParent(current);
  }

  parents
}

mod c14n;
58 changes: 58 additions & 0 deletions src/tree/node/c14n.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//! Node canonicalization logic
//!
use std::ffi::c_void;

use crate::{
bindings::{xmlC14NIsVisibleCallback, xmlNodePtr},
c_helpers::xmlGetNodeType,
tree::{c14n::*, Node},
};

use super::node_ancestors;

impl Node {
  /// Canonicalize the subtree rooted at this node and return the result.
  ///
  /// The owning document is canonicalized with `callback_wrapper` as the
  /// visibility callback and this node's pointer as its user data, so only
  /// this node and the nodes below it are reported as visible.
  ///
  /// # Errors
  ///
  /// Returns `Err(())` if the owning document has already been dropped, if a
  /// mutable pointer to this node cannot be obtained, or if the document-level
  /// canonicalization fails.
  pub fn canonicalize(&mut self, options: CanonicalizationOptions) -> Result<String, ()> {
    // A dropped document is reported through the `Result` rather than a panic.
    let doc_ref = self.get_docref().upgrade().ok_or(())?;
    let document = crate::tree::Document(doc_ref);

    // Likewise surface a failure to obtain the node pointer as an error.
    let user_data = self.node_ptr_mut().map_err(|_| ())?;
    let callback: xmlC14NIsVisibleCallback = Some(callback_wrapper);

    document.canonicalize(options, Some((user_data, callback)))
  }
}

/// Visibility callback handed to the document canonicalization: returns `1`
/// when the inspected node belongs to the subtree rooted at `c14n_root_ptr`
/// and `0` otherwise.
///
/// The node that is tested is `node_ptr` itself when its type is listed in
/// `NODE_TYPES`, and `parent_ptr` for every other node type. It counts as part
/// of the subtree when it is the root or has the root among its ancestors.
unsafe extern "C" fn callback_wrapper(
  c14n_root_ptr: *mut c_void,
  node_ptr: xmlNodePtr,
  parent_ptr: xmlNodePtr,
) -> ::std::os::raw::c_int {
  let root = c14n_root_ptr as xmlNodePtr;

  let target = match xmlGetNodeType(node_ptr) {
    kind if NODE_TYPES.contains(&kind) => node_ptr,
    _ => parent_ptr,
  };

  let ancestors = node_ancestors(target);
  let in_subtree = target == root || ancestors.contains(&root);

  ::std::os::raw::c_int::from(in_subtree)
}

/// Node types for which `callback_wrapper` tests the node itself against the
/// canonicalization root; for a node of any other type it tests the node's
/// parent instead.
// NOTE(review): the element type is spelled `u32` here — confirm this matches
// the generated `xmlElementType` binding on every supported target.
const NODE_TYPES: [u32; 7] = [
super::xmlElementType_XML_ELEMENT_NODE,
super::xmlElementType_XML_ATTRIBUTE_NODE,
super::xmlElementType_XML_DOCUMENT_TYPE_NODE,
super::xmlElementType_XML_TEXT_NODE,
super::xmlElementType_XML_DTD_NODE,
super::xmlElementType_XML_PI_NODE,
super::xmlElementType_XML_COMMENT_NODE,
];
Loading
Loading