xso: create library for streamed XML parsing

This library provides the traits to parse structs from XML and
serialise them into XML without having to buffer the document object
model in memory.

The only implementations it provides are for minidom, basically
providing a lower-level interface to `minidom::Element::from_reader` and
`minidom::Element::to_writer`.

This is the first stepping stone into a world where `xmpp_parsers` can
parse the structs directly from XML.
This commit is contained in:
Jonas Schäfer 2024-06-16 10:14:13 +02:00
parent 998d2825f8
commit 14a1d66bf8
6 changed files with 497 additions and 0 deletions

View file

@ -6,6 +6,7 @@ members = [ # alphabetically sorted
"sasl",
"tokio-xmpp",
"xmpp",
"xso",
]
resolver = "2"

View file

@ -427,6 +427,11 @@ impl Element {
Ok(())
}
/// Extracts all children into a collection.
///
/// The returned vector contains every child node in document order. After
/// this call, the element itself has no child nodes left.
pub fn take_nodes(&mut self) -> Vec<Node> {
    // Hand over the existing buffer instead of draining it into a freshly
    // allocated vector; the element is left with an empty child list.
    std::mem::take(&mut self.children)
}
/// Returns an iterator over references to every child node of this element.
///
/// # Examples

14
xso/Cargo.toml Normal file
View file

@ -0,0 +1,14 @@
[package]
name = "xso"
version = "0.0.2"
edition = "2021"
description = "XML Streamed Objects: similar to serde, but XML-native."
homepage = "https://xmpp.rs"
repository = "https://gitlab.com/xmpp-rs/xmpp-rs"
keywords = ["xmpp", "xml", "serialization"]
categories = ["encoding"]
license = "MPL-2.0"
[dependencies]
rxml = { version = "0.11.0", default-features = false }
minidom = { version = "^0.15" }

106
xso/src/error.rs Normal file
View file

@ -0,0 +1,106 @@
/*!
# Error types for XML parsing
This module contains the error types used throughout the `xso` crate.
*/
// Copyright (c) 2024 Jonas Schäfer <jonas@zombofant.net>
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
use std::fmt;
use rxml::error::XmlError;
/// Error variants generated while parsing or serialising XML data.
///
/// This is the error type used by the traits of this crate for failures
/// which occur while converting between XML events and Rust types.
#[derive(Debug)]
pub enum Error {
/// Invalid XML data encountered.
///
/// Wraps the underlying [`rxml::error::XmlError`]. Values of
/// [`rxml::strings::Error`] are also converted into this variant.
XmlError(XmlError),
/// Attempt to parse text data failed with the provided nested error.
///
/// The nested error is exposed through
/// [`std::error::Error::source`].
TextParseError(Box<dyn std::error::Error + Send + Sync + 'static>),
/// An element header did not match an expected element.
///
/// This is only rarely generated: most of the time, a mismatch of element
/// types is reported as either an unexpected or a missing child element,
/// errors which are generally more specific.
TypeMismatch,
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::XmlError(ref e) => write!(f, "xml parse error: {}", e),
Self::TextParseError(ref e) => write!(f, "text parse error: {}", e),
Self::TypeMismatch => f.write_str("mismatch between expected and actual XML data"),
}
}
}
impl std::error::Error for Error {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::XmlError(ref e) => Some(e),
Self::TextParseError(ref e) => Some(&**e),
_ => None,
}
}
}
impl From<rxml::error::XmlError> for Error {
fn from(other: rxml::error::XmlError) -> Error {
Error::XmlError(other)
}
}
impl From<rxml::strings::Error> for Error {
fn from(other: rxml::strings::Error) -> Error {
Error::XmlError(other.into())
}
}
/// Error returned from
/// [`FromXml::from_events`][`crate::FromXml::from_events`].
///
/// It distinguishes between an element header which does not belong to the
/// requested type at all (`Mismatch`) and a header which belongs to the type
/// but carries invalid data (`Invalid`).
#[derive(Debug)]
pub enum FromEventsError {
/// The `name` and/or `attrs` passed to `FromXml::from_events` did not
/// match the element's type.
///
/// The arguments are handed back to the caller inside this variant.
Mismatch {
/// The `name` passed to `from_events`.
name: rxml::QName,
/// The `attrs` passed to `from_events`.
attrs: rxml::AttrMap,
},
/// The `name` and `attrs` passed to `FromXml::from_events` matched the
/// element's type, but the data was invalid. Details are in the inner
/// error.
Invalid(Error),
}
impl From<Error> for FromEventsError {
fn from(other: Error) -> Self {
Self::Invalid(other)
}
}
impl fmt::Display for FromEventsError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::Mismatch { .. } => f.write_str("element header did not match"),
Self::Invalid(ref e) => fmt::Display::fmt(e, f),
}
}
}
impl std::error::Error for FromEventsError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::Mismatch { .. } => None,
Self::Invalid(ref e) => Some(e),
}
}
}

123
xso/src/lib.rs Normal file
View file

@ -0,0 +1,123 @@
#![forbid(missing_docs, unsafe_code)]
/*!
# XML Streamed Objects -- serde-like parsing for XML
This crate provides the traits for parsing XML data into Rust structs, and
vice versa.
While it is in 0.0.x versions, many features still need to be developed, but
rest assured that there is a solid plan to get it fully usable for even
advanced XML scenarios.
XSO is an acronym for XML Stream(ed) Objects, referring to the main field of
use of this library in parsing XML streams as specified in RFC 6120.
*/
// Copyright (c) 2024 Jonas Schäfer <jonas@zombofant.net>
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
pub mod error;
pub mod minidom_compat;
/// Trait for types which can be consumed and iterated as a sequence of
/// serialisable [`rxml::Event`] items.
///
/// Note that [`transform`] expects the first item emitted by the iterator
/// to be an [`rxml::Event::StartElement`] event (or an error) and panics
/// otherwise.
pub trait IntoXml {
/// The iterator type.
///
/// Each item is either the next event or an error which occurred while
/// generating it.
type EventIter: Iterator<Item = Result<rxml::Event, self::error::Error>>;
/// Return an iterator which emits the contents of the struct or enum as
/// serialisable [`rxml::Event`] items.
///
/// The value is consumed by this call. Constructing the iterator may
/// itself fail, in which case `Err(_)` is returned.
fn into_event_iter(self) -> Result<Self::EventIter, self::error::Error>;
}
/// Trait for a temporary object which constructs a struct from
/// [`rxml::Event`] items.
///
/// Objects of this type are generally constructed through
/// [`FromXml::from_events`] and are used to build Rust structs or enums from
/// XML data. The XML data must be fed as `rxml::Event` to the
/// [`feed`][`Self::feed`] method.
pub trait FromEventsBuilder {
/// The type which will be constructed by this builder.
type Output;
/// Feed another [`rxml::Event`] into the element construction
/// process.
///
/// Once the construction process completes, `Ok(Some(_))` is returned.
/// When valid data has been fed but more events are needed to fully
/// construct the resulting struct, `Ok(None)` is returned.
///
/// # Errors
///
/// If the construction fails, `Err(_)` is returned. Errors are generally
/// fatal and the builder should be assumed to be broken at that point.
/// Feeding more events after an error may result in panics, errors or
/// inconsistent result data, though it may never result in unsound or
/// unsafe behaviour.
fn feed(&mut self, ev: rxml::Event) -> Result<Option<Self::Output>, self::error::Error>;
}
/// Trait for types which can be constructed from a stream of
/// [`rxml::Event`] items.
///
/// To use this, first call [`FromXml::from_events`] with the qualified
/// name and the attributes of the corresponding
/// [`rxml::Event::StartElement`] event. If the call succeeds, the
/// returned builder object must be fed with the events representing the
/// contents of the element, and then with the `EndElement` event.
///
/// The `StartElement` passed to `from_events` must not be passed to `feed`.
///
/// **Important:** Changing the [`Builder`][`Self::Builder`] associated type
/// is considered a non-breaking change for any given implementation of this
/// trait. Always refer to a type's builder type using fully-qualified
/// notation, for example: `<T as xso::FromXml>::Builder`.
pub trait FromXml {
/// A builder type used to construct the element.
///
/// **Important:** Changing this type is considered a non-breaking change
/// for any given implementation of this trait. Always refer to a type's
/// builder type using fully-qualified notation, for example:
/// `<T as xso::FromXml>::Builder`.
type Builder: FromEventsBuilder<Output = Self>;
/// Attempt to initiate the streamed construction of this struct from XML.
///
/// If the passed qualified `name` and `attrs` match the element's type,
/// the [`Self::Builder`] is returned and should be fed with XML events
/// by the caller.
///
/// # Errors
///
/// Otherwise, an appropriate error is returned: see
/// [`FromEventsError`][`self::error::FromEventsError`] for the distinction
/// between a mismatching element header and invalid data.
fn from_events(
name: rxml::QName,
attrs: rxml::AttrMap,
) -> Result<Self::Builder, self::error::FromEventsError>;
}
/// Attempt to transform a type implementing [`IntoXml`] into another
/// type which implements [`FromXml`].
pub fn transform<T: FromXml, F: IntoXml>(from: F) -> Result<T, self::error::Error> {
let mut iter = from.into_event_iter()?;
let (qname, attrs) = match iter.next() {
Some(Ok(rxml::Event::StartElement(_, qname, attrs))) => (qname, attrs),
Some(Err(e)) => return Err(e),
_ => panic!("into_event_iter did not start with StartElement event!"),
};
let mut sink = match T::from_events(qname, attrs) {
Ok(v) => v,
Err(self::error::FromEventsError::Mismatch { .. }) => {
return Err(self::error::Error::TypeMismatch)
}
Err(self::error::FromEventsError::Invalid(e)) => return Err(e),
};
for event in iter {
let event = event?;
match sink.feed(event)? {
Some(v) => return Ok(v),
None => (),
}
}
Err(self::error::Error::XmlError(
rxml::error::XmlError::InvalidEof("during transform"),
))
}

248
xso/src/minidom_compat.rs Normal file
View file

@ -0,0 +1,248 @@
//! Implementations of traits from this crate for minidom types
// Copyright (c) 2024 Jonas Schäfer <jonas@zombofant.net>
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
use std::vec::IntoIter;
use minidom::{Element, Node};
use rxml::{
parser::EventMetrics,
writer::{SimpleNamespaces, TrackNamespace},
AttrMap, Event, Name, Namespace, NcName,
};
use crate::{
error::{Error, FromEventsError},
FromEventsBuilder, FromXml, IntoXml,
};
/// State machine for converting a minidom Element into rxml events.
///
/// The states are traversed in declaration order: `Header`, then `Nodes`,
/// then `Fin`.
enum IntoEventsInner {
/// Element header: the element is still intact and we need to generate
/// the [`rxml::Event::StartElement`] event from the namespace, name, and
/// attributes.
Header(Element),
/// Content: The contents of the element are streamed as events.
Nodes {
/// Remaining child nodes (text and/or children) to emit.
remaining: IntoIter<Node>,
/// When emitting a child element, this is a nested [`IntoEvents`]
/// instance for that child element.
nested: Option<Box<IntoEvents>>,
},
/// End of iteration: this state generates an end-of-iterator state.
///
/// Note that the [`rxml::Event::EndElement`] event for the element itself
/// is generated by the iterator already in the `Nodes` state, when
/// `nested` is None and `remaining` returns `None` from its `next()`
/// implementation.
Fin,
}
/// Build the qualified name and attribute map needed for a
/// [`rxml::Event::StartElement`] from a [`minidom::Element`].
///
/// The element's namespace, name and attributes are copied rather than
/// moved. This is due to limitations in the [`minidom::Element`] API.
///
/// # Errors
///
/// Returns an error if the element name or any attribute name is rejected
/// by the `rxml` name types.
///
/// # Panics
///
/// Panics if an attribute uses a namespace prefix (other than `xml`) which
/// is not declared on the element.
// NOTE to developers: The limitations are not fully trivial to overcome:
// the attributes use a BTreeMap internally, which does not offer a `drain`
// iterator.
fn make_start_ev_parts(el: &Element) -> Result<(rxml::QName, AttrMap), Error> {
    let element_name = NcName::try_from(el.name())?;
    let element_namespace = Namespace::from(el.ns());

    let mut attrs = AttrMap::new();
    for (raw_name, value) in el.attrs() {
        let attr_name = Name::try_from(raw_name)?;
        let (prefix, local_name) = attr_name.split_name()?;
        let attr_namespace = match prefix {
            // Unprefixed attributes are not in any namespace.
            None => Namespace::NONE,
            // The `xml` prefix is built in and never declared explicitly.
            Some(prefix) if prefix == "xml" => Namespace::XML,
            Some(prefix) => {
                let uri = el
                    .prefixes
                    .get(&Some(prefix.into()))
                    .unwrap_or_else(|| {
                        panic!("undeclared xml namespace prefix in minidom::Element")
                    });
                Namespace::from(uri.to_owned())
            }
        };
        attrs.insert(attr_namespace, local_name, value.to_owned());
    }

    Ok(((element_namespace, element_name), attrs))
}
impl IntoEventsInner {
    /// Advance the state machine and produce the next event, if any.
    ///
    /// Returns `Ok(Some(_))` for each event, `Ok(None)` once the element
    /// (including its `EndElement` event) has been emitted completely, and
    /// `Err(_)` if generating an event failed.
    fn next(&mut self) -> Result<Option<Event>, Error> {
        match self {
            IntoEventsInner::Header(el) => {
                let (qname, attrs) = make_start_ev_parts(el)?;
                let event = Event::StartElement(EventMetrics::zero(), qname, attrs);
                // The header has been copied out; move the children into the
                // next state so that they can be streamed one by one.
                *self = IntoEventsInner::Nodes {
                    remaining: el.take_nodes().into_iter(),
                    nested: None,
                };
                Ok(Some(event))
            }
            IntoEventsInner::Nodes { nested, remaining } => loop {
                // A child element is in progress: drain it first.
                if let Some(child) = nested.as_mut() {
                    if let Some(ev) = child.next() {
                        return ev.map(Some);
                    }
                    // The child is exhausted. Drop it so that it is neither
                    // kept alive nor polled again on later calls.
                    *nested = None;
                }
                match remaining.next() {
                    Some(Node::Text(text)) => {
                        return Ok(Some(Event::Text(EventMetrics::zero(), text)));
                    }
                    Some(Node::Element(el)) => {
                        // Start streaming the child element; its events are
                        // picked up by the next loop iteration.
                        *nested = Some(Box::new(el.into_event_iter()?));
                    }
                    None => {
                        // End of element: switch state and emit EndElement.
                        *self = IntoEventsInner::Fin;
                        return Ok(Some(Event::EndElement(EventMetrics::zero())));
                    }
                }
            },
            IntoEventsInner::Fin => Ok(None),
        }
    }
}
/// Convert a [`minidom::Element`] into [`rxml::Event`]s.
///
/// This can be constructed from the
/// [`IntoXml::into_event_iter`][`crate::IntoXml::into_event_iter`]
/// implementation on [`minidom::Element`].
///
/// The type is an [`Iterator`] whose items are either the next event or an
/// error which occurred while generating it.
pub struct IntoEvents(IntoEventsInner);
impl Iterator for IntoEvents {
    type Item = Result<Event, Error>;

    /// Advance the inner state machine, mapping its
    /// `Result<Option<_>, _>` onto the `Option<Result<_, _>>` shape
    /// required by [`Iterator`].
    fn next(&mut self) -> Option<Self::Item> {
        match self.0.next() {
            Ok(Some(event)) => Some(Ok(event)),
            Ok(None) => None,
            Err(err) => Some(Err(err)),
        }
    }
}
impl IntoXml for Element {
    type EventIter = IntoEvents;

    /// Wrap the element in an [`IntoEvents`] iterator.
    ///
    /// This call itself always succeeds; the element header is only
    /// converted when the first event is requested.
    fn into_event_iter(self) -> Result<Self::EventIter, Error> {
        let initial_state = IntoEventsInner::Header(self);
        Ok(IntoEvents(initial_state))
    }
}
/// Construct a [`minidom::Element`] from [`rxml::Event`]s
///
/// This can be constructed from the
/// [`FromXml::from_events`][`crate::FromXml::from_events`]
/// implementation on [`minidom::Element`].
pub struct ElementFromEvents {
// The element under construction. It is taken out (leaving `None`) when
// the closing `EndElement` event is fed.
inner: Option<Element>,
// Builder for the child element currently being parsed, if any. While
// this is set, fed events are forwarded to it.
nested: Option<Box<ElementFromEvents>>,
}
impl FromEventsBuilder for ElementFromEvents {
    type Output = minidom::Element;

    /// Feed the next event into the element under construction.
    ///
    /// While a child element is being parsed, events are forwarded to the
    /// child's builder. Otherwise, text is appended, a `StartElement` opens
    /// a new child builder, and `EndElement` completes this element.
    ///
    /// # Panics
    ///
    /// Panics if called again after the finished element has been returned.
    fn feed(&mut self, ev: Event) -> Result<Option<Self::Output>, Error> {
        let element = self
            .inner
            .as_mut()
            .expect("feed() called after it finished");

        // A child element is in progress: it receives the event.
        if let Some(child_builder) = self.nested.as_mut() {
            if let Some(child) = child_builder.feed(ev)? {
                element.append_child(child);
                self.nested = None;
            }
            return Ok(None);
        }

        match ev {
            Event::XmlDeclaration(_, _) => {}
            Event::StartElement(_, qname, attrs) => {
                let child_builder = match Element::from_events(qname, attrs) {
                    Ok(builder) => builder,
                    Err(FromEventsError::Invalid(e)) => return Err(e),
                    Err(FromEventsError::Mismatch { .. }) => {
                        unreachable!("<Element as FromXml>::from_events should accept everything!")
                    }
                };
                self.nested = Some(Box::new(child_builder));
            }
            Event::Text(_, text) => {
                element.append_text_node(text);
            }
            Event::EndElement(_) => return Ok(Some(self.inner.take().unwrap())),
        }
        Ok(None)
    }
}
impl FromXml for Element {
type Builder = ElementFromEvents;
fn from_events(
qname: rxml::QName,
attrs: rxml::AttrMap,
) -> Result<Self::Builder, FromEventsError> {
let mut prefixes = SimpleNamespaces::new();
let mut builder = Element::builder(qname.1, qname.0);
for ((namespace, name), value) in attrs.into_iter() {
if namespace.is_none() {
builder = builder.attr(name, String::from(value));
} else {
let (is_new, prefix) = prefixes.declare_with_auto_prefix(namespace.clone());
let name = prefix.with_suffix(&name);
if is_new {
builder = builder
.prefix(
Some(prefix.as_str().to_owned()),
namespace.as_str().to_owned(),
)
.unwrap();
}
builder = builder.attr(name, String::from(value));
}
}
let element = builder.build();
Ok(Self::Builder {
inner: Some(element),
nested: None,
})
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Streaming an element through `transform` must yield an element
    /// which compares equal to the original.
    #[test]
    fn transform_element_is_equivalent() {
        let source = "<foo xmlns='urn:a' a='b' c='d'><child a='x'/><child a='y'>some text</child><child xmlns='urn:b'><nested-child/></child></foo>";
        let el: Element = source.parse().unwrap();
        let round_tripped: Element = crate::transform(el.clone()).unwrap();
        assert_eq!(el, round_tripped);
    }
}