From 14a1d66bf81a39be83f0781ae4de831619cfe4db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Sch=C3=A4fer?=
Date: Sun, 16 Jun 2024 10:14:13 +0200
Subject: [PATCH] xso: create library for streamed XML parsing

This library provides the traits to parse structs from XML and serialise
them into XML without having to buffer the document object model in
memory.

The only implementations it provides are for minidom, basically
providing a lower-level interface to `minidom::Element::from_reader` and
`minidom::Element::to_writer`.

This is the first stepping stone into a world where `xmpp_parsers` can
parse the structs directly from XML.
---
 Cargo.toml                |   1 +
 minidom/src/element.rs    |   5 +
 xso/Cargo.toml            |  14 +++
 xso/src/error.rs          | 106 ++++++++++++++++
 xso/src/lib.rs            | 123 +++++++++++++++++++
 xso/src/minidom_compat.rs | 248 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 497 insertions(+)
 create mode 100644 xso/Cargo.toml
 create mode 100644 xso/src/error.rs
 create mode 100644 xso/src/lib.rs
 create mode 100644 xso/src/minidom_compat.rs

diff --git a/Cargo.toml b/Cargo.toml
index 80a8a02..6d20ee4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ members = [ # alphabetically sorted
   "sasl",
   "tokio-xmpp",
   "xmpp",
+  "xso",
 ]
 
 resolver = "2"
diff --git a/minidom/src/element.rs b/minidom/src/element.rs
index b161a99..0bb6c0b 100644
--- a/minidom/src/element.rs
+++ b/minidom/src/element.rs
@@ -427,6 +427,11 @@ impl Element {
         Ok(())
     }
 
+    /// Extracts all children into a collection.
+    pub fn take_nodes(&mut self) -> Vec<Node> {
+        self.children.drain(..).collect()
+    }
+
     /// Returns an iterator over references to every child node of this element.
     ///
     /// # Examples
diff --git a/xso/Cargo.toml b/xso/Cargo.toml
new file mode 100644
index 0000000..944a636
--- /dev/null
+++ b/xso/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "xso"
+version = "0.0.2"
+edition = "2021"
+description = "XML Streamed Objects: similar to serde, but XML-native."
+homepage = "https://xmpp.rs"
+repository = "https://gitlab.com/xmpp-rs/xmpp-rs"
+keywords = ["xmpp", "xml", "serialization"]
+categories = ["encoding"]
+license = "MPL-2.0"
+
+[dependencies]
+rxml = { version = "0.11.0", default-features = false }
+minidom = { version = "^0.15" }
diff --git a/xso/src/error.rs b/xso/src/error.rs
new file mode 100644
index 0000000..5da472d
--- /dev/null
+++ b/xso/src/error.rs
@@ -0,0 +1,106 @@
+/*!
+# Error types for XML parsing
+
+This module contains the error types used throughout the `xso` crate.
+*/
+// Copyright (c) 2024 Jonas Schäfer
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+use std::fmt;
+
+use rxml::error::XmlError;
+
+/// Error variants generated while parsing or serialising XML data.
+#[derive(Debug)]
+pub enum Error {
+    /// Invalid XML data encountered
+    XmlError(XmlError),
+
+    /// Attempt to parse text data failed with the provided nested error.
+    TextParseError(Box<dyn std::error::Error + Send + Sync + 'static>),
+
+    /// An element header did not match an expected element.
+    ///
+    /// This is only rarely generated: most of the time, a mismatch of element
+    /// types is reported as either an unexpected or a missing child element,
+    /// errors which are generally more specific.
+    TypeMismatch,
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::XmlError(ref e) => write!(f, "xml parse error: {}", e),
+            Self::TextParseError(ref e) => write!(f, "text parse error: {}", e),
+            Self::TypeMismatch => f.write_str("mismatch between expected and actual XML data"),
+        }
+    }
+}
+
+impl std::error::Error for Error {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Self::XmlError(ref e) => Some(e),
+            Self::TextParseError(ref e) => Some(&**e),
+            Self::TypeMismatch => None,
+        }
+    }
+}
+
+impl From<rxml::error::XmlError> for Error {
+    fn from(other: rxml::error::XmlError) -> Error {
+        Error::XmlError(other)
+    }
+}
+
+impl From<rxml::strings::Error> for Error {
+    fn from(other: rxml::strings::Error) -> Error {
+        Error::XmlError(other.into())
+    }
+}
+
+/// Error returned from
+/// [`FromXml::from_events`][`crate::FromXml::from_events`].
+#[derive(Debug)]
+pub enum FromEventsError {
+    /// The `name` and/or `attrs` passed to `FromXml::from_events` did not
+    /// match the element's type.
+    Mismatch {
+        /// The `name` passed to `from_events`.
+        name: rxml::QName,
+
+        /// The `attrs` passed to `from_events`.
+        attrs: rxml::AttrMap,
+    },
+
+    /// The `name` and `attrs` passed to `FromXml::from_events` matched the
+    /// element's type, but the data was invalid. Details are in the inner
+    /// error.
+    Invalid(Error),
+}
+
+impl From<Error> for FromEventsError {
+    fn from(other: Error) -> Self {
+        Self::Invalid(other)
+    }
+}
+
+impl fmt::Display for FromEventsError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::Mismatch { .. } => f.write_str("element header did not match"),
+            Self::Invalid(ref e) => fmt::Display::fmt(e, f),
+        }
+    }
+}
+
+impl std::error::Error for FromEventsError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Self::Mismatch { .. } => None,
+            Self::Invalid(ref e) => Some(e),
+        }
+    }
+}
diff --git a/xso/src/lib.rs b/xso/src/lib.rs
new file mode 100644
index 0000000..c38f386
--- /dev/null
+++ b/xso/src/lib.rs
@@ -0,0 +1,123 @@
+#![forbid(missing_docs, unsafe_code)]
+/*!
+# XML Streamed Objects -- serde-like parsing for XML
+
+This crate provides the traits for parsing XML data into Rust structs, and
+vice versa.
+
+While it is in 0.0.x versions, many features still need to be developed, but
+rest assured that there is a solid plan to get it fully usable for even
+advanced XML scenarios.
+
+XSO is an acronym for XML Stream(ed) Objects, referring to the main field of
+use of this library in parsing XML streams like specified in RFC 6120.
+*/
+// Copyright (c) 2024 Jonas Schäfer
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+pub mod error;
+pub mod minidom_compat;
+
+/// Trait allowing to consume a struct and iterate its contents as
+/// serialisable [`rxml::Event`] items.
+pub trait IntoXml {
+    /// The iterator type.
+    type EventIter: Iterator<Item = Result<rxml::Event, self::error::Error>>;
+
+    /// Return an iterator which emits the contents of the struct or enum as
+    /// serialisable [`rxml::Event`] items.
+    fn into_event_iter(self) -> Result<Self::EventIter, self::error::Error>;
+}
+
+/// Trait for a temporary object allowing to construct a struct from
+/// [`rxml::Event`] items.
+///
+/// Objects of this type are generally constructed through
+/// [`FromXml::from_events`] and are used to build Rust structs or enums from
+/// XML data. The XML data must be fed as `rxml::Event` to the
+/// [`feed`][`Self::feed`] method.
+pub trait FromEventsBuilder {
+    /// The type which will be constructed by this builder.
+    type Output;
+
+    /// Feed another [`rxml::Event`] into the element construction
+    /// process.
+    ///
+    /// Once the construction process completes, `Ok(Some(_))` is returned.
+    /// When valid data has been fed but more events are needed to fully
+    /// construct the resulting struct, `Ok(None)` is returned.
+    ///
+    /// If the construction fails, `Err(_)` is returned. Errors are generally
+    /// fatal and the builder should be assumed to be broken at that point.
+    /// Feeding more events after an error may result in panics, errors or
+    /// inconsistent result data, though it may never result in unsound or
+    /// unsafe behaviour.
+    fn feed(&mut self, ev: rxml::Event) -> Result<Option<Self::Output>, self::error::Error>;
+}
+
+/// Trait allowing to construct a struct from a stream of
+/// [`rxml::Event`] items.
+///
+/// To use this, first call [`FromXml::from_events`] with the qualified
+/// name and the attributes of the corresponding
+/// [`rxml::Event::StartElement`] event. If the call succeeds, the
+/// returned builder object must be fed with the events representing the
+/// contents of the element, and then with the `EndElement` event.
+///
+/// The `StartElement` passed to `from_events` must not be passed to `feed`.
+///
+/// **Important:** Changing the [`Builder`][`Self::Builder`] associated type
+/// is considered a non-breaking change for any given implementation of this
+/// trait. Always refer to a type's builder type using fully-qualified
+/// notation, for example: `<T as xso::FromXml>::Builder`.
+pub trait FromXml {
+    /// A builder type used to construct the element.
+    ///
+    /// **Important:** Changing this type is considered a non-breaking change
+    /// for any given implementation of this trait. Always refer to a type's
+    /// builder type using fully-qualified notation, for example:
+    /// `<T as xso::FromXml>::Builder`.
+    type Builder: FromEventsBuilder<Output = Self>;
+
+    /// Attempt to initiate the streamed construction of this struct from XML.
+    ///
+    /// If the passed qualified `name` and `attrs` match the element's type,
+    /// the [`Self::Builder`] is returned and should be fed with XML events
+    /// by the caller.
+    ///
+    /// Otherwise, an appropriate error is returned.
+    fn from_events(
+        name: rxml::QName,
+        attrs: rxml::AttrMap,
+    ) -> Result<Self::Builder, self::error::FromEventsError>;
+}
+
+/// Attempt to transform a type implementing [`IntoXml`] into another
+/// type which implements [`FromXml`].
+pub fn transform<T: FromXml, F: IntoXml>(from: F) -> Result<T, self::error::Error> {
+    let mut iter = from.into_event_iter()?;
+    let (qname, attrs) = match iter.next() {
+        Some(Ok(rxml::Event::StartElement(_, qname, attrs))) => (qname, attrs),
+        Some(Err(e)) => return Err(e),
+        _ => panic!("into_event_iter did not start with StartElement event!"),
+    };
+    let mut sink = match T::from_events(qname, attrs) {
+        Ok(v) => v,
+        Err(self::error::FromEventsError::Mismatch { .. }) => {
+            return Err(self::error::Error::TypeMismatch)
+        }
+        Err(self::error::FromEventsError::Invalid(e)) => return Err(e),
+    };
+    for event in iter {
+        let event = event?;
+        match sink.feed(event)? {
+            Some(v) => return Ok(v),
+            None => (),
+        }
+    }
+    Err(self::error::Error::XmlError(
+        rxml::error::XmlError::InvalidEof("during transform"),
+    ))
+}
diff --git a/xso/src/minidom_compat.rs b/xso/src/minidom_compat.rs
new file mode 100644
index 0000000..fffe2e2
--- /dev/null
+++ b/xso/src/minidom_compat.rs
@@ -0,0 +1,248 @@
+//! Implementations of traits from this crate for minidom types
+// Copyright (c) 2024 Jonas Schäfer
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+use std::vec::IntoIter;
+
+use minidom::{Element, Node};
+
+use rxml::{
+    parser::EventMetrics,
+    writer::{SimpleNamespaces, TrackNamespace},
+    AttrMap, Event, Name, Namespace, NcName,
+};
+
+use crate::{
+    error::{Error, FromEventsError},
+    FromEventsBuilder, FromXml, IntoXml,
+};
+
+/// State machine for converting a minidom Element into rxml events.
+enum IntoEventsInner {
+    /// Element header: the element is still intact and we need to generate
+    /// the [`rxml::Event::StartElement`] event from the namespace, name, and
+    /// attributes.
+    Header(Element),
+
+    /// Content: The contents of the element are streamed as events.
+    Nodes {
+        /// Remaining child nodes (text and/or children) to emit.
+        remaining: IntoIter<Node>,
+
+        /// When emitting a child element, this is a nested [`IntoEvents`]
+        /// instance for that child element.
+        nested: Option<Box<IntoEvents>>,
+    },
+
+    /// End of iteration: this state generates an end-of-iterator state.
+    ///
+    /// Note that the [`rxml::Event::EndElement`] event for the element itself
+    /// is generated by the iterator already in the `Nodes` state, when
+    /// `nested` is None and `remaining` returns `None` from its `next()`
+    /// implementation.
+    Fin,
+}
+
+/// Create the parts for a [`rxml::Event::StartElement`] from a
+/// [`minidom::Element`].
+///
+/// Note that this copies the attribute data as well as namespace and name.
+/// This is due to limitations in the [`minidom::Element`] API.
+// NOTE to developers: The limitations are not fully trivial to overcome:
+// the attributes use a BTreeMap internally, which does not offer a `drain`
+// iterator.
+fn make_start_ev_parts(el: &Element) -> Result<(rxml::QName, AttrMap), Error> {
+    let name = NcName::try_from(el.name())?;
+    let namespace = Namespace::from(el.ns());
+
+    let mut attrs = AttrMap::new();
+    for (name, value) in el.attrs() {
+        let name = Name::try_from(name)?;
+        let (prefix, name) = name.split_name()?;
+        let namespace = if let Some(prefix) = prefix {
+            if prefix == "xml" {
+                Namespace::XML
+            } else {
+                let ns = match el.prefixes.get(&Some(prefix.into())) {
+                    Some(v) => v,
+                    None => {
+                        panic!("undeclared xml namespace prefix in minidom::Element")
+                    }
+                };
+                Namespace::from(ns.to_owned())
+            }
+        } else {
+            Namespace::NONE
+        };
+
+        attrs.insert(namespace, name, value.to_owned());
+    }
+
+    Ok(((namespace, name), attrs))
+}
+
+impl IntoEventsInner {
+    fn next(&mut self) -> Result<Option<Event>, Error> {
+        match self {
+            IntoEventsInner::Header(ref mut el) => {
+                let (qname, attrs) = make_start_ev_parts(el)?;
+                let event = Event::StartElement(EventMetrics::zero(), qname, attrs);
+
+                *self = IntoEventsInner::Nodes {
+                    remaining: el.take_nodes().into_iter(),
+                    nested: None,
+                };
+                Ok(Some(event))
+            }
+            IntoEventsInner::Nodes {
+                ref mut nested,
+                ref mut remaining,
+            } => {
+                loop {
+                    if let Some(nested) = nested.as_mut() {
+                        if let Some(ev) = nested.next() {
+                            return Some(ev).transpose();
+                        }
+                    }
+                    match remaining.next() {
+                        Some(Node::Text(text)) => {
+                            return Ok(Some(Event::Text(EventMetrics::zero(), text)));
+                        }
+                        Some(Node::Element(el)) => {
+                            *nested = Some(Box::new(el.into_event_iter()?));
+                            // fallthrough to next loop iteration
+                        }
+                        None => {
+                            // end of element, switch state and emit EndElement
+                            *self = IntoEventsInner::Fin;
+                            return Ok(Some(Event::EndElement(EventMetrics::zero())));
+                        }
+                    }
+                }
+            }
+            IntoEventsInner::Fin => Ok(None),
+        }
+    }
+}
+
+/// Convert a [`minidom::Element`] into [`rxml::Event`]s.
+///
+/// This can be constructed from the
+/// [`IntoXml::into_event_iter`][`crate::IntoXml::into_event_iter`]
+/// implementation on [`minidom::Element`].
+pub struct IntoEvents(IntoEventsInner);
+
+impl Iterator for IntoEvents {
+    type Item = Result<Event, Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0.next().transpose()
+    }
+}
+
+impl IntoXml for Element {
+    type EventIter = IntoEvents;
+
+    fn into_event_iter(self) -> Result<Self::EventIter, Error> {
+        Ok(IntoEvents(IntoEventsInner::Header(self)))
+    }
+}
+
+/// Construct a [`minidom::Element`] from [`rxml::Event`]s
+///
+/// This can be constructed from the
+/// [`FromXml::from_events`][`crate::FromXml::from_events`]
+/// implementation on [`minidom::Element`].
+pub struct ElementFromEvents {
+    inner: Option<Element>,
+    nested: Option<Box<ElementFromEvents>>,
+}
+
+impl FromEventsBuilder for ElementFromEvents {
+    type Output = minidom::Element;
+
+    fn feed(&mut self, ev: Event) -> Result<Option<Self::Output>, Error> {
+        let inner = self
+            .inner
+            .as_mut()
+            .expect("feed() called after it finished");
+        if let Some(nested) = self.nested.as_mut() {
+            match nested.feed(ev)? {
+                Some(v) => {
+                    inner.append_child(v);
+                    self.nested = None;
+                    return Ok(None);
+                }
+                None => return Ok(None),
+            }
+        }
+        match ev {
+            Event::XmlDeclaration(_, _) => Ok(None),
+            Event::StartElement(_, qname, attrs) => {
+                let nested = match Element::from_events(qname, attrs) {
+                    Ok(v) => v,
+                    Err(FromEventsError::Invalid(e)) => return Err(e),
+                    Err(FromEventsError::Mismatch { .. }) => {
+                        unreachable!("<Element as FromXml>::from_events should accept everything!")
+                    }
+                };
+                self.nested = Some(Box::new(nested));
+                Ok(None)
+            }
+            Event::Text(_, text) => {
+                inner.append_text_node(text);
+                Ok(None)
+            }
+            Event::EndElement(_) => Ok(Some(self.inner.take().unwrap())),
+        }
+    }
+}
+
+impl FromXml for Element {
+    type Builder = ElementFromEvents;
+
+    fn from_events(
+        qname: rxml::QName,
+        attrs: rxml::AttrMap,
+    ) -> Result<Self::Builder, FromEventsError> {
+        let mut prefixes = SimpleNamespaces::new();
+        let mut builder = Element::builder(qname.1, qname.0);
+        for ((namespace, name), value) in attrs.into_iter() {
+            if namespace.is_none() {
+                builder = builder.attr(name, String::from(value));
+            } else {
+                let (is_new, prefix) = prefixes.declare_with_auto_prefix(namespace.clone());
+                let name = prefix.with_suffix(&name);
+                if is_new {
+                    builder = builder
+                        .prefix(
+                            Some(prefix.as_str().to_owned()),
+                            namespace.as_str().to_owned(),
+                        )
+                        .unwrap();
+                }
+                builder = builder.attr(name, String::from(value));
+            }
+        }
+
+        let element = builder.build();
+        Ok(Self::Builder {
+            inner: Some(element),
+            nested: None,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn transform_element_is_equivalent() {
+        let el: Element = "<foo xmlns='urn:a' a='b' c='d'><child a='x'/><child a='y'>some text</child><child xmlns='urn:b' a='z'/></foo>".parse().unwrap();
+        let transformed: Element = crate::transform(el.clone()).unwrap();
+        assert_eq!(el, transformed);
+    }
+}