From 1887fdd1b512c2b05fb3ecc22de1535294979801 Mon Sep 17 00:00:00 2001 From: Astro Date: Wed, 23 Mar 2022 21:43:34 +0100 Subject: [PATCH] minidom: add tree_builder --- minidom/src/element.rs | 220 +++++------------------------------- minidom/src/error.rs | 11 ++ minidom/src/lib.rs | 1 + minidom/src/parser.rs | 11 +- minidom/src/tests.rs | 8 +- minidom/src/tokenizer.rs | 12 +- minidom/src/tree_builder.rs | 127 +++++++++++++++++++++ 7 files changed, 187 insertions(+), 203 deletions(-) create mode 100644 minidom/src/tree_builder.rs diff --git a/minidom/src/element.rs b/minidom/src/element.rs index b04d752..0d6ce36 100644 --- a/minidom/src/element.rs +++ b/minidom/src/element.rs @@ -17,19 +17,18 @@ use crate::error::{Error, Result}; use crate::namespaces::NSChoice; use crate::node::Node; use crate::prefixes::{Namespace, Prefix, Prefixes}; +use crate::tokenizer::Tokenizer; +use crate::tree_builder::TreeBuilder; use std::collections::{btree_map, BTreeMap}; -use std::io::Write; +use std::io::{Cursor, Read, Write}; use std::borrow::Cow; use std::str; use quick_xml::events::{BytesDecl, BytesEnd, BytesStart, Event}; -use quick_xml::Reader as EventReader; use quick_xml::Writer as EventWriter; -use std::io::BufRead; - use std::str::FromStr; use std::slice; @@ -102,7 +101,7 @@ impl FromStr for Element { type Err = Error; fn from_str(s: &str) -> Result { - let mut reader = EventReader::from_str(s); + let mut reader = Cursor::new(s); Element::from_reader(&mut reader) } } @@ -128,7 +127,7 @@ fn ensure_no_prefix>(s: &S) -> Result<()> { } impl Element { - fn new>( + pub(crate) fn new>( name: String, namespace: String, prefix: Option, @@ -310,123 +309,28 @@ impl Element { namespace.into().compare(self.namespace.as_ref()) } - /// Parse a document from an `EventReader`. - pub fn from_reader(reader: &mut EventReader) -> Result { - let mut buf = Vec::new(); - - let mut prefixes = BTreeMap::new(); - let root: Element = loop { - let e = reader.read_event(&mut buf)?; - match e { - Event::Empty(ref e) | Event::Start(ref e) => { - break build_element(reader, e, &mut prefixes)?; - } - Event::Eof => { - return Err(Error::EndOfDocument); - } - Event::Comment { .. } => { - return Err(Error::NoComments); - } - Event::Text { .. } - | Event::End { .. } - | Event::CData { .. } - | Event::Decl { .. } - | Event::PI { .. } - | Event::DocType { .. } => (), // TODO: may need more errors - } - }; - - let mut stack = vec![root]; - let mut prefix_stack = vec![prefixes]; + /// Parse a document from a `Read`. + pub fn from_reader(mut reader: R) -> Result { + const CHUNK_SIZE: usize = 65536; + let mut buf = [0; CHUNK_SIZE]; + let mut tokenizer = Tokenizer::new(); + let mut tree_builder = TreeBuilder::new(); loop { - match reader.read_event(&mut buf)? { - Event::Empty(ref e) => { - let mut prefixes = prefix_stack.last().unwrap().clone(); - let elem = build_element(reader, e, &mut prefixes)?; - // Since there is no Event::End after, directly append it to the current node - stack.last_mut().unwrap().append_child(elem); + let len = reader.read(&mut buf)?; + if len == 0 { + break; + } + tokenizer.push(&buf[0..len]); + while let Some(token) = tokenizer.pull()? { + tree_builder.process_token(token); + + if let Some(root) = tree_builder.root.take() { + return Ok(root); } - Event::Start(ref e) => { - let mut prefixes = prefix_stack.last().unwrap().clone(); - let elem = build_element(reader, e, &mut prefixes)?; - stack.push(elem); - prefix_stack.push(prefixes); - } - Event::End(ref e) => { - if stack.len() <= 1 { - break; - } - let prefixes = prefix_stack.pop().unwrap(); - let elem = stack.pop().unwrap(); - if let Some(to) = stack.last_mut() { - // TODO: check whether this is correct, we are comparing &[u8]s, not &strs - let elem_name = e.name(); - let mut split_iter = elem_name.splitn(2, |u| *u == 0x3A); - let possible_prefix = split_iter.next().unwrap(); // Can't be empty. - let opening_prefix = { - let mut tmp: Option> = None; - for (prefix, ns) in prefixes { - if ns == elem.namespace { - tmp = Some(prefix.clone()); - break; - } - } - match tmp { - Some(prefix) => prefix, - None => return Err(Error::InvalidPrefix), - } - }; - match split_iter.next() { - // There is a prefix on the closing tag - Some(name) => { - // Does the closing prefix match the opening prefix? - match opening_prefix { - Some(prefix) if possible_prefix == prefix.as_bytes() => (), - _ => return Err(Error::InvalidElementClosed), - } - // Does the closing tag name match the opening tag name? - if name != elem.name().as_bytes() { - return Err(Error::InvalidElementClosed); - } - } - // There was no prefix on the closing tag - None => { - // Is there a prefix on the opening tag? - if opening_prefix.is_some() { - return Err(Error::InvalidElementClosed); - } - // Does the opening tag name match the closing one? - if possible_prefix != elem.name().as_bytes() { - return Err(Error::InvalidElementClosed); - } - } - } - to.append_child(elem); - } - } - Event::Text(s) => { - let text = s.unescape_and_decode(reader)?; - if !text.is_empty() { - let current_elem = stack.last_mut().unwrap(); - current_elem.append_text_node(text); - } - } - Event::CData(s) => { - let text = s.unescape_and_decode(&reader)?; - if !text.is_empty() { - let current_elem = stack.last_mut().unwrap(); - current_elem.append_text_node(text); - } - } - Event::Eof => { - break; - } - Event::Comment(_) => return Err(Error::NoComments), - Event::Decl { .. } | Event::PI { .. } | Event::DocType { .. } => (), } } - Ok(stack.pop().unwrap()) + Err(Error::EndOfDocument) } /// Output a document to a `Writer`. @@ -824,68 +728,6 @@ impl Element { } } -fn split_element_name>(s: S) -> Result<(Option, String)> { - let name_parts = s.as_ref().split(':').collect::>(); - match name_parts.len() { - 2 => Ok((Some(name_parts[0].to_owned()), name_parts[1].to_owned())), - 1 => Ok((None, name_parts[0].to_owned())), - _ => Err(Error::InvalidElement), - } -} - -fn build_element( - reader: &EventReader, - event: &BytesStart, - prefixes: &mut BTreeMap, -) -> Result { - let (prefix, name) = split_element_name(str::from_utf8(event.name())?)?; - let mut local_prefixes = BTreeMap::new(); - - let attributes = event - .attributes() - .map(|o| { - let o = o?; - let key = str::from_utf8(o.key)?.to_owned(); - let value = o.unescape_and_decode_value(reader)?; - Ok((key, value)) - }) - .filter(|o| match *o { - Ok((ref key, ref value)) if key == "xmlns" => { - local_prefixes.insert(None, value.clone()); - prefixes.insert(None, value.clone()); - false - } - Ok((ref key, ref value)) if key.starts_with("xmlns:") => { - local_prefixes.insert(Some(key[6..].to_owned()), value.to_owned()); - prefixes.insert(Some(key[6..].to_owned()), value.to_owned()); - false - } - _ => true, - }) - .collect::>>()?; - - let namespace: &String = { - if let Some(namespace) = local_prefixes.get(&prefix) { - namespace - } else if let Some(namespace) = prefixes.get(&prefix) { - namespace - } else { - return Err(Error::MissingNamespace); - } - }; - - Ok(Element::new( - name, - namespace.clone(), - // Note that this will always be Some(_) as we can't distinguish between the None case and - // Some(None). At least we make sure the prefix has a namespace associated. - Some(prefix), - local_prefixes, - attributes, - Vec::new(), - )) -} - /// An iterator over references to child elements of an `Element`. pub struct Children<'a> { iter: slice::Iter<'a, Node>, @@ -1068,7 +910,7 @@ mod tests { #[test] fn test_from_reader_simple() { let xml = ""; - let mut reader = EventReader::from_str(xml); + let mut reader = Cursor::new(xml); let elem = Element::from_reader(&mut reader); let elem2 = Element::builder("foo", "ns1").build(); @@ -1079,7 +921,7 @@ mod tests { #[test] fn test_from_reader_nested() { let xml = ""; - let mut reader = EventReader::from_str(xml); + let mut reader = Cursor::new(xml); let elem = Element::from_reader(&mut reader); let nested = Element::builder("bar", "ns1").attr("baz", "qxx").build(); @@ -1091,7 +933,7 @@ mod tests { #[test] fn test_from_reader_with_prefix() { let xml = ""; - let mut reader = EventReader::from_str(xml); + let mut reader = Cursor::new(xml); let elem = Element::from_reader(&mut reader); let nested = Element::builder("bar", "ns1").attr("baz", "qxx").build(); @@ -1103,7 +945,7 @@ mod tests { #[test] fn test_from_reader_split_prefix() { let xml = ""; - let mut reader = EventReader::from_str(xml); + let mut reader = Cursor::new(xml); let elem = Element::from_reader(&mut reader).unwrap(); assert_eq!(elem.name(), String::from("bar")); @@ -1123,14 +965,14 @@ mod tests { "#; - let mut reader = EventReader::from_str(xml); + let mut reader = Cursor::new(xml); let _ = Element::from_reader(&mut reader).unwrap(); } #[test] fn does_not_unescape_cdata() { let xml = "]]>"; - let mut reader = EventReader::from_str(xml); + let mut reader = Cursor::new(xml); let elem = Element::from_reader(&mut reader).unwrap(); assert_eq!(elem.text(), "'>blah"); } @@ -1138,17 +980,17 @@ mod tests { #[test] fn test_compare_all_ns() { let xml = ""; - let mut reader = EventReader::from_str(xml); + let mut reader = Cursor::new(xml); let elem = Element::from_reader(&mut reader).unwrap(); let elem2 = elem.clone(); let xml3 = ""; - let mut reader3 = EventReader::from_str(xml3); + let mut reader3 = Cursor::new(xml3); let elem3 = Element::from_reader(&mut reader3).unwrap(); let xml4 = ""; - let mut reader4 = EventReader::from_str(xml4); + let mut reader4 = Cursor::new(xml4); let elem4 = Element::from_reader(&mut reader4).unwrap(); assert_eq!(elem, elem2); diff --git a/minidom/src/error.rs b/minidom/src/error.rs index b34375a..05158de 100644 --- a/minidom/src/error.rs +++ b/minidom/src/error.rs @@ -20,6 +20,9 @@ pub enum Error { /// An error from quick_xml. XmlError(::quick_xml::Error), + /// Error from the Tokenizer + TokenizerError(crate::tokenizer::TokenizerError), + /// An UTF-8 conversion error. Utf8Error(::std::str::Utf8Error), @@ -53,6 +56,7 @@ impl StdError for Error { fn cause(&self) -> Option<&dyn StdError> { match self { Error::XmlError(e) => Some(e), + Error::TokenizerError(e) => Some(e), Error::Utf8Error(e) => Some(e), Error::IoError(e) => Some(e), Error::EndOfDocument => None, @@ -70,6 +74,7 @@ impl std::fmt::Display for Error { fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Error::XmlError(e) => write!(fmt, "XML error: {}", e), + Error::TokenizerError(e) => write!(fmt, "XML tokenizer error: {}", e), Error::Utf8Error(e) => write!(fmt, "UTF-8 error: {}", e), Error::IoError(e) => write!(fmt, "IO error: {}", e), Error::EndOfDocument => { @@ -96,6 +101,12 @@ impl From<::quick_xml::Error> for Error { } } +impl From for Error { + fn from(err: crate::tokenizer::TokenizerError) -> Error { + Error::TokenizerError(err) + } +} + impl From<::std::str::Utf8Error> for Error { fn from(err: ::std::str::Utf8Error) -> Error { Error::Utf8Error(err) diff --git a/minidom/src/lib.rs b/minidom/src/lib.rs index d62691c..beea39a 100644 --- a/minidom/src/lib.rs +++ b/minidom/src/lib.rs @@ -85,6 +85,7 @@ pub mod node; mod prefixes; pub mod token; pub mod tokenizer; +pub mod tree_builder; #[cfg(test)] mod tests; diff --git a/minidom/src/parser.rs b/minidom/src/parser.rs index 49896c3..cda6a4c 100644 --- a/minidom/src/parser.rs +++ b/minidom/src/parser.rs @@ -10,16 +10,16 @@ use crate::element::Element; use crate::error::{Error, ParserError, Result}; +use crate::tokenizer::Tokenizer; +use crate::tree_builder::TreeBuilder; -use bytes::BytesMut; -use quick_xml::Reader as EventReader; -use std::cell::RefCell; use std::str; /// Parser #[derive(Debug)] pub struct Parser { - buffer: RefCell, + tokenizer: Tokenizer, + tree_builder: TreeBuilder, state: ParserState, } @@ -90,7 +90,8 @@ impl Parser { /// Creates a new Parser pub fn new() -> Parser { Parser { - buffer: RefCell::new(BytesMut::new()), + tokenizer: Tokenizer::new(), + tree_builder: TreeBuilder::new(), state: ParserState::Empty, } } diff --git a/minidom/src/tests.rs b/minidom/src/tests.rs index 726379f..ce43ecb 100644 --- a/minidom/src/tests.rs +++ b/minidom/src/tests.rs @@ -10,11 +10,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. +use std::io::Cursor; + use crate::element::Element; use crate::error::Error; -use quick_xml::Reader; - const TEST_STRING: &'static str = r#"meownya"#; fn build_test_tree() -> Element { @@ -36,7 +36,7 @@ fn build_test_tree() -> Element { #[test] fn reader_works() { - let mut reader = Reader::from_str(TEST_STRING); + let mut reader = Cursor::new(TEST_STRING); assert_eq!( Element::from_reader(&mut reader).unwrap(), build_test_tree() @@ -348,7 +348,7 @@ fn two_elements_with_same_arguments_different_order_are_equal() { #[test] fn namespace_attributes_works() { - let mut reader = Reader::from_str(TEST_STRING); + let mut reader = Cursor::new(TEST_STRING); let root = Element::from_reader(&mut reader).unwrap(); assert_eq!("en", root.attr("xml:lang").unwrap()); assert_eq!( diff --git a/minidom/src/tokenizer.rs b/minidom/src/tokenizer.rs index ea13cae..cf67f89 100644 --- a/minidom/src/tokenizer.rs +++ b/minidom/src/tokenizer.rs @@ -4,7 +4,7 @@ use bytes::BytesMut; use super::Token; /// `Result::Err` type returned from `Tokenizer` -pub type TokenizerError = nom::error::Error<()>; +pub type TokenizerError = nom::error::Error; /// Streaming tokenizer (SAX parser) pub struct Tokenizer { @@ -33,9 +33,11 @@ impl Tokenizer { pub fn pull(&mut self) -> Result, TokenizerError> { /// cannot return an error with location info that points to /// our buffer that we still want to mutate - fn erase_location(e: nom::error::Error) -> TokenizerError { + fn with_input_to_owned(e: nom::error::Error<&[u8]>) -> TokenizerError { nom::error::Error { - input: (), + input: std::str::from_utf8(e.input) + .unwrap_or("invalud UTF-8") + .to_owned(), code: e.code, } } @@ -46,9 +48,9 @@ impl Tokenizer { Result::Err(nom::Err::Incomplete(_)) => None, Result::Err(nom::Err::Error(e)) => - return Err(erase_location(e)), + return Err(with_input_to_owned(e)), Result::Err(nom::Err::Failure(e)) => - return Err(erase_location(e)), + return Err(with_input_to_owned(e)), } }; match result { Some((s_len, token)) => { diff --git a/minidom/src/tree_builder.rs b/minidom/src/tree_builder.rs new file mode 100644 index 0000000..7787d92 --- /dev/null +++ b/minidom/src/tree_builder.rs @@ -0,0 +1,127 @@ +//! SAX events to DOM tree conversion + +use std::collections::BTreeMap; +use crate::Element; +use crate::prefixes::Prefixes; +use crate::token::{Attribute, LocalName, Token}; + +/// Tree-building parser state +pub struct TreeBuilder { + /// Parsing stack + stack: Vec, + /// Namespace set stack by prefix + prefixes_stack: Vec, + /// Document root element if finished + pub root: Option, +} + +impl TreeBuilder { + /// Create a new one + pub fn new() -> Self { + TreeBuilder { + stack: vec![], + prefixes_stack: vec![], + root: None, + } + } + + /// Stack depth + pub fn depth(&self) -> usize { + self.stack.len() + } + + /// Pop the top-most element from the stack + pub fn pop(&mut self) -> Option { + self.prefixes_stack.pop(); + self.stack.pop() + } + + /// Lookup XML namespace declaration for given prefix (or no prefix) + fn lookup_prefix(&self, prefix: &Option) -> Option<&str> { + for nss in self.prefixes_stack.iter().rev() { + if let Some(ns) = nss.get(prefix) { + return Some(ns); + } + } + + None + } + + fn process_start_tag(&mut self, name: LocalName, attrs: Vec) { + let mut prefixes = Prefixes::default(); + let mut attributes = BTreeMap::new(); + for attr in attrs.into_iter() { + match (attr.name.prefix, attr.name.name) { + (None, xmlns) if xmlns == "xmlns" => { + prefixes.insert(None, attr.value); + } + (Some(xmlns), prefix) if xmlns == "xmlns" => { + prefixes.insert(Some(prefix), attr.value); + } + (Some(prefix), name) => { + attributes.insert(format!("{}:{}", prefix, name), attr.value); + } + (None, name) => { + attributes.insert(name, attr.value); + } + } + } + self.prefixes_stack.push(prefixes.clone()); + + let el = Element::new( + name.name, + self.lookup_prefix(&name.prefix).unwrap_or("").to_owned(), + Some(name.prefix), + prefixes, + attributes, + vec![] + ); + self.stack.push(el); + } + + fn process_end_tag(&mut self) { + if let Some(el) = self.pop() { + if self.depth() > 0 { + let top = self.stack.len() - 1; + self.stack[top].append_child(el); + } else { + self.root = Some(el); + } + } + } + + fn process_text(&mut self, text: String) { + if self.depth() > 0 { + let top = self.stack.len() - 1; + self.stack[top].append_text_node(text); + } + } + + /// Process a Token that you got out of a Tokenizer + pub fn process_token(&mut self, token: Token) { + match token { + Token::XmlDecl { .. } => {}, + + Token::StartTag { + name, + attrs, + self_closing: false, + } => self.process_start_tag(name, attrs), + + Token::StartTag { + name, + attrs, + self_closing: true, + } => { + self.process_start_tag(name, attrs); + self.process_end_tag(); + } + + Token::EndTag { .. } => + self.process_end_tag(), + + Token::Text(text) => + self.process_text(text), + } + } +}