From 9cec9fce9b8d203cd5b4c6edb7b5a0b78b8e129a Mon Sep 17 00:00:00 2001 From: Bastien Orivel Date: Wed, 7 Jun 2017 22:40:53 +0200 Subject: [PATCH] Replace xml-rs by quick_xml quick_xml is way faster than xml-rs Here is an example with a quick atom parser: With xml-rs: test parse_factorio_atom ... bench: 3,295,678 ns/iter (+/- 165,851) With quick_xml: test parse_factorio_atom ... bench: 203,215 ns/iter (+/- 13,485) Unfortunately I had to break the API for this change to happen. * Element::from_reader now takes `R: BufRead` instead of `R: Read` * Element::write_to now takes `W: io::Write` instead of `EventWriter` This migration also allow us to have a write_to function which assumes we're already in a given namespace (see `write_to_in_namespace`). --- Cargo.toml | 6 +- src/element.rs | 229 +++++++++++++++++++++++++++---------------------- src/error.rs | 22 ++--- src/lib.rs | 2 +- src/tests.rs | 24 ++++-- 5 files changed, 156 insertions(+), 127 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0ddb025..ebb09d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "minidom" -version = "0.4.3" +version = "0.5.0" authors = ["lumi ", "Emmanuel Gil Peyrot ", "Bastien Orivel "] -description = "A small, simple DOM implementation on top of xml-rs." +description = "A small, simple DOM implementation on top of quick-xml" homepage = "https://gitlab.com/lumi/minidom-rs" repository = "https://gitlab.com/lumi/minidom-rs" documentation = "https://docs.rs/minidom" @@ -14,5 +14,5 @@ license = "MIT" gitlab = { repository = "lumi/minidom-rs" } [dependencies] -xml-rs = "0.4.1" +quick-xml = "0.7.3" error-chain = "0.10.0" diff --git a/src/element.rs b/src/element.rs index 3c5c9d5..ee24850 100644 --- a/src/element.rs +++ b/src/element.rs @@ -1,18 +1,16 @@ //! Provides an `Element` type, which represents DOM nodes, and a builder to create them with. -use std::io::prelude::*; -use std::io::Cursor; -use std::collections::BTreeMap; -use std::collections::btree_map; +use std::io:: Write; +use std::collections::{btree_map, BTreeMap}; -use std::fmt; +use std::str; use error::{Error, ErrorKind, Result}; -use xml::reader::{XmlEvent as ReaderEvent, EventReader}; -use xml::writer::{XmlEvent as WriterEvent, EventWriter, EmitterConfig}; -use xml::name::Name; -use xml::namespace::NS_NO_PREFIX; +use quick_xml::reader::Reader as EventReader; +use quick_xml::events::{Event, BytesStart}; + +use std::io::BufRead; use std::str::FromStr; @@ -69,9 +67,18 @@ impl Node { Node::Text(ref s) => Some(s), } } + + fn write_to_inner(&self, writer: &mut W, last_namespace: &mut Option) -> Result<()>{ + match *self { + Node::Element(ref elmt) => elmt.write_to_inner(writer, last_namespace)?, + Node::Text(ref s) => write!(writer, "{}", s)?, + } + + Ok(()) + } } -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq, Eq, Debug)] /// A struct representing a DOM Element. pub struct Element { name: String, @@ -82,26 +89,18 @@ pub struct Element { impl<'a> From<&'a Element> for String { fn from(elem: &'a Element) -> String { - let mut out = Vec::new(); - let config = EmitterConfig::new() - .write_document_declaration(false); - elem.write_to(&mut EventWriter::new_with_config(&mut out, config)).unwrap(); - String::from_utf8(out).unwrap() + let mut writer = Vec::new(); + elem.write_to(&mut writer).unwrap(); + String::from_utf8(writer).unwrap() } } -impl fmt::Debug for Element { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - write!(fmt, "{}", String::from(self))?; - Ok(()) - } -} impl FromStr for Element { type Err = Error; fn from_str(s: &str) -> Result { - let mut reader = EventReader::new(Cursor::new(s)); + let mut reader = EventReader::from_str(s); Element::from_reader(&mut reader) } } @@ -246,106 +245,105 @@ impl Element { } /// Parse a document from an `EventReader`. - pub fn from_reader(reader: &mut EventReader) -> Result { - loop { - let e = reader.next()?; - match e { - ReaderEvent::StartElement { name, attributes, namespace } => { - let attributes = attributes.into_iter() - .map(|o| { - (match o.name.prefix { - Some(prefix) => format!("{}:{}", prefix, o.name.local_name), - None => o.name.local_name - }, - o.value) - }) - .collect(); - let ns = if let Some(ref prefix) = name.prefix { - namespace.get(prefix) - } - else { - namespace.get(NS_NO_PREFIX) - }.map(|s| s.to_owned()); + pub fn from_reader(reader: &mut EventReader) -> Result { + let mut buf = Vec::new(); + let root: Element; - let mut root = Element::new(name.local_name, ns, attributes, Vec::new()); - root.from_reader_inner(reader)?; - return Ok(root); + loop { + let e = reader.read_event(&mut buf)?; + match e { + Event::Empty(ref e) | Event::Start(ref e) => { + root = build_element(e)?; // FIXME: could be break build_element(e)? when break value is stable + break; }, - ReaderEvent::EndDocument => { + Event::Eof => { bail!(ErrorKind::EndOfDocument); }, _ => () // TODO: may need more errors } - } - } + }; + + let mut stack = vec![root]; - #[cfg_attr(feature = "cargo-clippy", allow(wrong_self_convention))] - fn from_reader_inner(&mut self, reader: &mut EventReader) -> Result<()> { loop { - let e = reader.next()?; - match e { - ReaderEvent::StartElement { name, attributes, namespace } => { - let attributes = attributes.into_iter() - .map(|o| { - (match o.name.prefix { - Some(prefix) => format!("{}:{}", prefix, o.name.local_name), - None => o.name.local_name - }, - o.value) - }) - .collect(); - let ns = if let Some(ref prefix) = name.prefix { - namespace.get(prefix) + match reader.read_event(&mut buf)? { + Event::Empty(ref e) => { + let elem = build_element(e)?; + // Since there is no Event::End after, directly append it to the current node + stack.last_mut().unwrap().append_child(elem); + }, + Event::Start(ref e) => { + let elem = build_element(e)?; + stack.push(elem); + }, + Event::End(ref e) => { + if stack.len() <= 1 { + break; + } + let elem = stack.pop().unwrap(); + if let Some(to) = stack.last_mut() { + if elem.name().as_bytes() != e.name() { + bail!(ErrorKind::InvalidElementClosed); + } + to.append_child(elem); } - else { - namespace.get(NS_NO_PREFIX) - }.map(|s| s.to_owned()); - let elem = Element::new(name.local_name, ns, attributes, Vec::with_capacity(1)); - let elem_ref = self.append_child(elem); - elem_ref.from_reader_inner(reader)?; }, - ReaderEvent::EndElement { .. } => { - // TODO: may want to check whether we're closing the correct element - return Ok(()); + Event::Text(s) | Event::CData(s) => { + let text = s.unescape_and_decode(reader)?; + if text != "" { + let mut current_elem = stack.last_mut().unwrap(); + current_elem.append_text_node(text); + } }, - ReaderEvent::Characters(s) | ReaderEvent::CData(s) => { - self.append_text_node(s); - }, - ReaderEvent::EndDocument => { - bail!(ErrorKind::EndOfDocument); + Event::Eof => { + break; }, _ => (), // TODO: may need to implement more } } + Ok(stack.pop().unwrap()) } - /// Output a document to an `EventWriter`. - pub fn write_to(&self, writer: &mut EventWriter) -> Result<()> { - let name = if let Some(ref ns) = self.namespace { - Name::qualified(&self.name, ns, None) - } - else { - Name::local(&self.name) - }; - let mut start = WriterEvent::start_element(name); + /// Output a document to a `Writer`. + pub fn write_to(&self, writer: &mut W) -> Result<()> { + let mut last_namespace = None; + write!(writer, "")?; + self.write_to_inner(writer, &mut last_namespace) + } + + /// Output a document to a `Writer` assuming you're already in the provided namespace + pub fn write_to_in_namespace(&self, writer: &mut W, namespace: &str) -> Result<()> { + write!(writer, "")?; + self.write_to_inner(writer, &mut Some(namespace.to_owned())) + } + + fn write_to_inner(&self, writer: &mut W, last_namespace: &mut Option) -> Result<()> { + write!(writer, "<")?; + write!(writer, "{}", self.name)?; + if let Some(ref ns) = self.namespace { - start = start.default_ns(ns.clone()); - } - for attr in &self.attributes { // TODO: I think this could be done a lot more efficiently - start = start.attr(Name::local(attr.0), attr.1); - } - writer.write(start)?; - for child in &self.children { - match *child { - Node::Element(ref e) => { - e.write_to(writer)?; - }, - Node::Text(ref s) => { - writer.write(WriterEvent::characters(s))?; - }, + if *last_namespace != self.namespace { + write!(writer, " xmlns=\"{}\"", ns)?; + *last_namespace = Some(ns.clone()); } } - writer.write(WriterEvent::end_element())?; + + for (key, value) in &self.attributes { + write!(writer, " {}=\"{}\"", key, value)?; + } + + if self.children.is_empty() { + write!(writer, " />")?; + return Ok(()) + } + + write!(writer, ">")?; + + for child in &self.children { + child.write_to_inner(writer, last_namespace)?; + } + + write!(writer, "", self.name)?; Ok(()) } @@ -354,7 +352,7 @@ impl Element { /// # Examples /// /// ```rust - /// use minidom::{Element, Node}; + /// use minidom::Element; /// /// let elem: Element = "abc".parse().unwrap(); /// @@ -592,6 +590,31 @@ impl Element { } } +fn build_element(event: &BytesStart) -> Result { + let mut attributes = event.attributes() + .map(|o| { + let o = o?; + let key = str::from_utf8(o.key)?.to_owned(); + let value = str::from_utf8(o.value)?.to_owned(); + Ok((key, value)) + } + ) + .collect::>>()?; + let mut ns_key = None; + for (key, _) in &attributes { + if key == "xmlns" || key.starts_with("xmlns:") { + ns_key = Some(key.clone()); + } + } + + let ns = match ns_key { + None => None, + Some(key) => attributes.remove(&key), + }; + let name = str::from_utf8(event.name())?.to_owned(); + Ok(Element::new(name, ns, attributes, Vec::new())) +} + /// An iterator over references to child elements of an `Element`. pub struct Children<'a> { iter: slice::Iter<'a, Node>, diff --git a/src/error.rs b/src/error.rs index 4e76c6a..0b8eb23 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,30 +1,30 @@ //! Provides an error type for this crate. -use std::io; - use std::convert::From; -use xml::writer::Error as WriterError; -use xml::reader::Error as ReaderError; - error_chain! { foreign_links { - XmlWriterError(WriterError) - /// An error with writing an XML event, from xml::writer::EventWriter. + XmlError(::quick_xml::errors::Error) + /// An error from quick_xml. ; - XmlReaderError(ReaderError) - /// An error with reading an XML event, from xml::reader::EventReader. + Utf8Error(::std::str::Utf8Error) + /// An UTF-8 conversion error. ; - IoError(io::Error) + IoError(::std::io::Error) /// An I/O error, from std::io. ; } errors { - /// En error which is returned when the end of the document was reached prematurely. + /// An error which is returned when the end of the document was reached prematurely. EndOfDocument { description("the end of the document has been reached prematurely") display("the end of the document has been reached prematurely") } + /// An error which is returned when an element is closed when it shouldn't be + InvalidElementClosed { + description("The XML is invalid, an element was wrongly closed") + display("the XML is invalid, an element was wrongly closed") + } } } diff --git a/src/lib.rs b/src/lib.rs index d51de43..2e00b9f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,7 +64,7 @@ //! minidom = "*" //! ``` -extern crate xml; +extern crate quick_xml; #[macro_use] extern crate error_chain; pub mod error; diff --git a/src/tests.rs b/src/tests.rs index f58bb6e..bc71ca2 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,9 +1,6 @@ -use std::io::Cursor; - use std::iter::Iterator; -use xml::reader::EventReader; -use xml::writer::EventWriter; +use quick_xml::reader::Reader; use element::Element; @@ -32,19 +29,18 @@ fn build_test_tree() -> Element { #[test] fn reader_works() { - let mut reader = EventReader::new(Cursor::new(TEST_STRING)); + let mut reader = Reader::from_str(TEST_STRING); assert_eq!(Element::from_reader(&mut reader).unwrap(), build_test_tree()); } #[test] fn writer_works() { let root = build_test_tree(); - let mut out = Vec::new(); + let mut writer = Vec::new(); { - let mut writer = EventWriter::new(&mut out); root.write_to(&mut writer).unwrap(); } - assert_eq!(String::from_utf8(out).unwrap(), TEST_STRING); + assert_eq!(String::from_utf8(writer).unwrap(), TEST_STRING); } #[test] @@ -110,8 +106,18 @@ fn two_elements_with_same_arguments_different_order_are_equal() { #[test] fn namespace_attributes_works() { - let mut reader = EventReader::new(Cursor::new(TEST_STRING)); + let mut reader = Reader::from_str(TEST_STRING); let root = Element::from_reader(&mut reader).unwrap(); assert_eq!("en", root.attr("xml:lang").unwrap()); assert_eq!("fr", root.get_child("child", "child_ns").unwrap().attr("xml:lang").unwrap()); } + +#[test] +fn wrongly_closed_elements_error() { + let elem1 = "".parse::(); + assert!(elem1.is_err()); + let elem1 = "".parse::(); + assert!(elem1.is_err()); + let elem1 = "".parse::(); + assert!(elem1.is_ok()); +}