minidom: add tree_builder

Astro 2022-03-23 21:43:34 +01:00
parent 4e5630d03c
commit 1887fdd1b5
7 changed files with 187 additions and 203 deletions

@@ -17,19 +17,18 @@ use crate::error::{Error, Result};
 use crate::namespaces::NSChoice;
 use crate::node::Node;
 use crate::prefixes::{Namespace, Prefix, Prefixes};
+use crate::tokenizer::Tokenizer;
+use crate::tree_builder::TreeBuilder;
 use std::collections::{btree_map, BTreeMap};
-use std::io::Write;
+use std::io::{Cursor, Read, Write};
 use std::borrow::Cow;
 use std::str;
 use quick_xml::events::{BytesDecl, BytesEnd, BytesStart, Event};
-use quick_xml::Reader as EventReader;
 use quick_xml::Writer as EventWriter;
-use std::io::BufRead;
 use std::str::FromStr;
 use std::slice;
@@ -102,7 +101,7 @@ impl FromStr for Element {
     type Err = Error;

     fn from_str(s: &str) -> Result<Element> {
-        let mut reader = EventReader::from_str(s);
+        let mut reader = Cursor::new(s);
         Element::from_reader(&mut reader)
     }
 }
@@ -128,7 +127,7 @@ fn ensure_no_prefix<S: AsRef<str>>(s: &S) -> Result<()> {
 }

 impl Element {
-    fn new<P: Into<Prefixes>>(
+    pub(crate) fn new<P: Into<Prefixes>>(
         name: String,
         namespace: String,
         prefix: Option<Prefix>,
@@ -310,123 +309,28 @@ impl Element {
         namespace.into().compare(self.namespace.as_ref())
     }

-    /// Parse a document from an `EventReader`.
-    pub fn from_reader<R: BufRead>(reader: &mut EventReader<R>) -> Result<Element> {
-        let mut buf = Vec::new();
-        let mut prefixes = BTreeMap::new();
-        let root: Element = loop {
-            let e = reader.read_event(&mut buf)?;
-            match e {
-                Event::Empty(ref e) | Event::Start(ref e) => {
-                    break build_element(reader, e, &mut prefixes)?;
-                }
-                Event::Eof => {
-                    return Err(Error::EndOfDocument);
-                }
-                Event::Comment { .. } => {
-                    return Err(Error::NoComments);
-                }
-                Event::Text { .. }
-                | Event::End { .. }
-                | Event::CData { .. }
-                | Event::Decl { .. }
-                | Event::PI { .. }
-                | Event::DocType { .. } => (), // TODO: may need more errors
-            }
-        };
-        let mut stack = vec![root];
-        let mut prefix_stack = vec![prefixes];
+    /// Parse a document from a `Read`.
+    pub fn from_reader<R: Read>(mut reader: R) -> Result<Element> {
+        const CHUNK_SIZE: usize = 65536;
+        let mut buf = [0; CHUNK_SIZE];
+        let mut tokenizer = Tokenizer::new();
+        let mut tree_builder = TreeBuilder::new();
         loop {
-            match reader.read_event(&mut buf)? {
-                Event::Empty(ref e) => {
-                    let mut prefixes = prefix_stack.last().unwrap().clone();
-                    let elem = build_element(reader, e, &mut prefixes)?;
-                    // Since there is no Event::End after, directly append it to the current node
-                    stack.last_mut().unwrap().append_child(elem);
+            let len = reader.read(&mut buf)?;
+            if len == 0 {
+                break;
+            }
+            tokenizer.push(&buf[0..len]);
+            while let Some(token) = tokenizer.pull()? {
+                tree_builder.process_token(token);
+                if let Some(root) = tree_builder.root.take() {
+                    return Ok(root);
                 }
-                Event::Start(ref e) => {
-                    let mut prefixes = prefix_stack.last().unwrap().clone();
-                    let elem = build_element(reader, e, &mut prefixes)?;
-                    stack.push(elem);
-                    prefix_stack.push(prefixes);
-                }
-                Event::End(ref e) => {
-                    if stack.len() <= 1 {
-                        break;
-                    }
-                    let prefixes = prefix_stack.pop().unwrap();
-                    let elem = stack.pop().unwrap();
-                    if let Some(to) = stack.last_mut() {
-                        // TODO: check whether this is correct, we are comparing &[u8]s, not &strs
-                        let elem_name = e.name();
-                        let mut split_iter = elem_name.splitn(2, |u| *u == 0x3A);
-                        let possible_prefix = split_iter.next().unwrap(); // Can't be empty.
-                        let opening_prefix = {
-                            let mut tmp: Option<Option<String>> = None;
-                            for (prefix, ns) in prefixes {
-                                if ns == elem.namespace {
-                                    tmp = Some(prefix.clone());
-                                    break;
-                                }
-                            }
-                            match tmp {
-                                Some(prefix) => prefix,
-                                None => return Err(Error::InvalidPrefix),
-                            }
-                        };
-                        match split_iter.next() {
-                            // There is a prefix on the closing tag
-                            Some(name) => {
-                                // Does the closing prefix match the opening prefix?
-                                match opening_prefix {
-                                    Some(prefix) if possible_prefix == prefix.as_bytes() => (),
-                                    _ => return Err(Error::InvalidElementClosed),
-                                }
-                                // Does the closing tag name match the opening tag name?
-                                if name != elem.name().as_bytes() {
-                                    return Err(Error::InvalidElementClosed);
-                                }
-                            }
-                            // There was no prefix on the closing tag
-                            None => {
-                                // Is there a prefix on the opening tag?
-                                if opening_prefix.is_some() {
-                                    return Err(Error::InvalidElementClosed);
-                                }
-                                // Does the opening tag name match the closing one?
-                                if possible_prefix != elem.name().as_bytes() {
-                                    return Err(Error::InvalidElementClosed);
-                                }
-                            }
-                        }
-                        to.append_child(elem);
-                    }
-                }
-                Event::Text(s) => {
-                    let text = s.unescape_and_decode(reader)?;
-                    if !text.is_empty() {
-                        let current_elem = stack.last_mut().unwrap();
-                        current_elem.append_text_node(text);
-                    }
-                }
-                Event::CData(s) => {
-                    let text = s.unescape_and_decode(&reader)?;
-                    if !text.is_empty() {
-                        let current_elem = stack.last_mut().unwrap();
-                        current_elem.append_text_node(text);
-                    }
-                }
-                Event::Eof => {
-                    break;
-                }
-                Event::Comment(_) => return Err(Error::NoComments),
-                Event::Decl { .. } | Event::PI { .. } | Event::DocType { .. } => (),
             }
         }
-        Ok(stack.pop().unwrap())
+        Err(Error::EndOfDocument)
     }

     /// Output a document to a `Writer`.
@@ -824,68 +728,6 @@ impl Element {
     }
 }

-fn split_element_name<S: AsRef<str>>(s: S) -> Result<(Option<String>, String)> {
-    let name_parts = s.as_ref().split(':').collect::<Vec<&str>>();
-    match name_parts.len() {
-        2 => Ok((Some(name_parts[0].to_owned()), name_parts[1].to_owned())),
-        1 => Ok((None, name_parts[0].to_owned())),
-        _ => Err(Error::InvalidElement),
-    }
-}
-
-fn build_element<R: BufRead>(
-    reader: &EventReader<R>,
-    event: &BytesStart,
-    prefixes: &mut BTreeMap<Prefix, Namespace>,
-) -> Result<Element> {
-    let (prefix, name) = split_element_name(str::from_utf8(event.name())?)?;
-    let mut local_prefixes = BTreeMap::new();
-    let attributes = event
-        .attributes()
-        .map(|o| {
-            let o = o?;
-            let key = str::from_utf8(o.key)?.to_owned();
-            let value = o.unescape_and_decode_value(reader)?;
-            Ok((key, value))
-        })
-        .filter(|o| match *o {
-            Ok((ref key, ref value)) if key == "xmlns" => {
-                local_prefixes.insert(None, value.clone());
-                prefixes.insert(None, value.clone());
-                false
-            }
-            Ok((ref key, ref value)) if key.starts_with("xmlns:") => {
-                local_prefixes.insert(Some(key[6..].to_owned()), value.to_owned());
-                prefixes.insert(Some(key[6..].to_owned()), value.to_owned());
-                false
-            }
-            _ => true,
-        })
-        .collect::<Result<BTreeMap<String, String>>>()?;
-    let namespace: &String = {
-        if let Some(namespace) = local_prefixes.get(&prefix) {
-            namespace
-        } else if let Some(namespace) = prefixes.get(&prefix) {
-            namespace
-        } else {
-            return Err(Error::MissingNamespace);
-        }
-    };
-    Ok(Element::new(
-        name,
-        namespace.clone(),
-        // Note that this will always be Some(_) as we can't distinguish between the None case and
-        // Some(None). At least we make sure the prefix has a namespace associated.
-        Some(prefix),
-        local_prefixes,
-        attributes,
-        Vec::new(),
-    ))
-}
-
 /// An iterator over references to child elements of an `Element`.
 pub struct Children<'a> {
     iter: slice::Iter<'a, Node>,
@@ -1068,7 +910,7 @@ mod tests {
     #[test]
     fn test_from_reader_simple() {
         let xml = "<foo xmlns='ns1'></foo>";
-        let mut reader = EventReader::from_str(xml);
+        let mut reader = Cursor::new(xml);
         let elem = Element::from_reader(&mut reader);

         let elem2 = Element::builder("foo", "ns1").build();
@@ -1079,7 +921,7 @@ mod tests {
     #[test]
     fn test_from_reader_nested() {
         let xml = "<foo xmlns='ns1'><bar xmlns='ns1' baz='qxx' /></foo>";
-        let mut reader = EventReader::from_str(xml);
+        let mut reader = Cursor::new(xml);
         let elem = Element::from_reader(&mut reader);

         let nested = Element::builder("bar", "ns1").attr("baz", "qxx").build();
@@ -1091,7 +933,7 @@ mod tests {
     #[test]
     fn test_from_reader_with_prefix() {
         let xml = "<foo xmlns='ns1'><prefix:bar xmlns:prefix='ns1' baz='qxx' /></foo>";
-        let mut reader = EventReader::from_str(xml);
+        let mut reader = Cursor::new(xml);
         let elem = Element::from_reader(&mut reader);

         let nested = Element::builder("bar", "ns1").attr("baz", "qxx").build();
@@ -1103,7 +945,7 @@ mod tests {
     #[test]
     fn test_from_reader_split_prefix() {
         let xml = "<foo:bar xmlns:foo='ns1'/>";
-        let mut reader = EventReader::from_str(xml);
+        let mut reader = Cursor::new(xml);
         let elem = Element::from_reader(&mut reader).unwrap();

         assert_eq!(elem.name(), String::from("bar"));
@@ -1123,14 +965,14 @@ mod tests {
 <rng:name xmlns:rng="http://relaxng.org/ns/structure/1.0"></rng:name>
 </rng:grammar>
 "#;
-        let mut reader = EventReader::from_str(xml);
+        let mut reader = Cursor::new(xml);
         let _ = Element::from_reader(&mut reader).unwrap();
     }

     #[test]
     fn does_not_unescape_cdata() {
         let xml = "<test xmlns='test'><![CDATA[&apos;&gt;blah<blah>]]></test>";
-        let mut reader = EventReader::from_str(xml);
+        let mut reader = Cursor::new(xml);
         let elem = Element::from_reader(&mut reader).unwrap();
         assert_eq!(elem.text(), "&apos;&gt;blah<blah>");
     }
@@ -1138,17 +980,17 @@ mod tests {
     #[test]
     fn test_compare_all_ns() {
         let xml = "<foo xmlns='foo' xmlns:bar='baz'><bar:meh xmlns:bar='baz' /></foo>";
-        let mut reader = EventReader::from_str(xml);
+        let mut reader = Cursor::new(xml);
         let elem = Element::from_reader(&mut reader).unwrap();
         let elem2 = elem.clone();

         let xml3 = "<foo xmlns='foo'><bar:meh xmlns:bar='baz'/></foo>";
-        let mut reader3 = EventReader::from_str(xml3);
+        let mut reader3 = Cursor::new(xml3);
         let elem3 = Element::from_reader(&mut reader3).unwrap();

         let xml4 = "<prefix:foo xmlns:prefix='foo'><bar:meh xmlns:bar='baz'/></prefix:foo>";
-        let mut reader4 = EventReader::from_str(xml4);
+        let mut reader4 = Cursor::new(xml4);
        let elem4 = Element::from_reader(&mut reader4).unwrap();

         assert_eq!(elem, elem2);
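The tests above only needed their EventReader swapped for a Cursor; with the new Read-based signature any reader works: from_reader pulls the input in 64 KiB chunks, pushes each chunk into the Tokenizer, and lets the TreeBuilder assemble the tree. A minimal caller sketch (the file name is illustrative, not part of this change):

use std::fs::File;
use std::io::BufReader;

use minidom::Element;

fn main() {
    // Anything implementing std::io::Read can be handed to from_reader now;
    // extra buffering is optional since from_reader already reads large chunks.
    let file = File::open("stream.xml").expect("open stream.xml");
    let root = Element::from_reader(BufReader::new(file)).expect("well-formed XML");
    println!("root element <{}> in namespace {}", root.name(), root.ns());
}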

@@ -20,6 +20,9 @@ pub enum Error {
     /// An error from quick_xml.
     XmlError(::quick_xml::Error),

+    /// Error from the Tokenizer
+    TokenizerError(crate::tokenizer::TokenizerError),
+
     /// An UTF-8 conversion error.
     Utf8Error(::std::str::Utf8Error),
@@ -53,6 +56,7 @@ impl StdError for Error {
     fn cause(&self) -> Option<&dyn StdError> {
         match self {
             Error::XmlError(e) => Some(e),
+            Error::TokenizerError(e) => Some(e),
             Error::Utf8Error(e) => Some(e),
             Error::IoError(e) => Some(e),
             Error::EndOfDocument => None,
@@ -70,6 +74,7 @@ impl std::fmt::Display for Error {
     fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
         match self {
             Error::XmlError(e) => write!(fmt, "XML error: {}", e),
+            Error::TokenizerError(e) => write!(fmt, "XML tokenizer error: {}", e),
             Error::Utf8Error(e) => write!(fmt, "UTF-8 error: {}", e),
             Error::IoError(e) => write!(fmt, "IO error: {}", e),
             Error::EndOfDocument => {
@@ -96,6 +101,12 @@ impl From<::quick_xml::Error> for Error {
     }
 }

+impl From<crate::tokenizer::TokenizerError> for Error {
+    fn from(err: crate::tokenizer::TokenizerError) -> Error {
+        Error::TokenizerError(err)
+    }
+}
+
 impl From<::std::str::Utf8Error> for Error {
     fn from(err: ::std::str::Utf8Error) -> Error {
         Error::Utf8Error(err)
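With this From impl, tokenizer failures propagate through ? into the crate-wide Error just like quick_xml and IO errors already do. A small sketch of that conversion (the drain helper is illustrative, not part of the crate):

use minidom::token::Token;
use minidom::tokenizer::Tokenizer;
use minidom::Error;

// Illustrative helper: collect whatever complete tokens are currently
// available; the `?` converts TokenizerError into minidom::Error via From.
fn drain(tokenizer: &mut Tokenizer) -> Result<Vec<Token>, Error> {
    let mut tokens = Vec::new();
    while let Some(token) = tokenizer.pull()? {
        tokens.push(token);
    }
    Ok(tokens)
}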

@@ -85,6 +85,7 @@ pub mod node;
 mod prefixes;
 pub mod token;
 pub mod tokenizer;
+pub mod tree_builder;

 #[cfg(test)]
 mod tests;

@@ -10,16 +10,16 @@
 use crate::element::Element;
 use crate::error::{Error, ParserError, Result};
-use bytes::BytesMut;
-use quick_xml::Reader as EventReader;
-use std::cell::RefCell;
+use crate::tokenizer::Tokenizer;
+use crate::tree_builder::TreeBuilder;
 use std::str;

 /// Parser
 #[derive(Debug)]
 pub struct Parser {
-    buffer: RefCell<BytesMut>,
+    tokenizer: Tokenizer,
+    tree_builder: TreeBuilder,
     state: ParserState,
 }
@@ -90,7 +90,8 @@ impl Parser {
     /// Creates a new Parser
     pub fn new() -> Parser {
         Parser {
-            buffer: RefCell::new(BytesMut::new()),
+            tokenizer: Tokenizer::new(),
+            tree_builder: TreeBuilder::new(),
             state: ParserState::Empty,
         }
     }

@@ -10,11 +10,11 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at http://mozilla.org/MPL/2.0/.

+use std::io::Cursor;
+
 use crate::element::Element;
 use crate::error::Error;
-use quick_xml::Reader;

 const TEST_STRING: &'static str = r#"<root xmlns="root_ns" a="b" xml:lang="en">meow<child c="d"/><child xmlns="child_ns" d="e" xml:lang="fr"/>nya</root>"#;

 fn build_test_tree() -> Element {
@@ -36,7 +36,7 @@ fn build_test_tree() -> Element {
 #[test]
 fn reader_works() {
-    let mut reader = Reader::from_str(TEST_STRING);
+    let mut reader = Cursor::new(TEST_STRING);
     assert_eq!(
         Element::from_reader(&mut reader).unwrap(),
         build_test_tree()
@@ -348,7 +348,7 @@ fn two_elements_with_same_arguments_different_order_are_equal() {
 #[test]
 fn namespace_attributes_works() {
-    let mut reader = Reader::from_str(TEST_STRING);
+    let mut reader = Cursor::new(TEST_STRING);
     let root = Element::from_reader(&mut reader).unwrap();
     assert_eq!("en", root.attr("xml:lang").unwrap());
     assert_eq!(

@@ -4,7 +4,7 @@ use bytes::BytesMut;
 use super::Token;

 /// `Result::Err` type returned from `Tokenizer`
-pub type TokenizerError = nom::error::Error<()>;
+pub type TokenizerError = nom::error::Error<String>;

 /// Streaming tokenizer (SAX parser)
 pub struct Tokenizer {
@@ -33,9 +33,11 @@ impl Tokenizer {
     pub fn pull(&mut self) -> Result<Option<Token>, TokenizerError> {
         /// cannot return an error with location info that points to
         /// our buffer that we still want to mutate
-        fn erase_location<T>(e: nom::error::Error<T>) -> TokenizerError {
+        fn with_input_to_owned(e: nom::error::Error<&[u8]>) -> TokenizerError {
             nom::error::Error {
-                input: (),
+                input: std::str::from_utf8(e.input)
+                    .unwrap_or("invalid UTF-8")
+                    .to_owned(),
                 code: e.code,
             }
         }
@@ -46,9 +48,9 @@ impl Tokenizer {
             Result::Err(nom::Err::Incomplete(_)) =>
                 None,
             Result::Err(nom::Err::Error(e)) =>
-                return Err(erase_location(e)),
+                return Err(with_input_to_owned(e)),
             Result::Err(nom::Err::Failure(e)) =>
-                return Err(erase_location(e)),
+                return Err(with_input_to_owned(e)),
         } };
         match result {
             Some((s_len, token)) => {
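Because the error now owns a String copy of the offending input instead of (), callers can show where tokenizing stopped without borrowing the tokenizer's internal buffer. A rough sketch, assuming push is public like pull and that the malformed input below is indeed rejected:

use minidom::tokenizer::{Tokenizer, TokenizerError};

fn report(err: &TokenizerError) {
    // `input` is an owned String, so the internal buffer it was copied from
    // can keep being mutated by later push() calls.
    let snippet: String = err.input.chars().take(40).collect();
    eprintln!("tokenizer failed ({:?}) near: {}", err.code, snippet);
}

fn main() {
    let mut tokenizer = Tokenizer::new();
    // A '<' immediately followed by another '<' cannot start a tag name.
    tokenizer.push(b"<foo xmlns='ns'><<oops>");
    loop {
        match tokenizer.pull() {
            Ok(Some(_token)) => continue,
            Ok(None) => break, // tokenizer wants more input
            Err(err) => {
                report(&err);
                break;
            }
        }
    }
}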

minidom/src/tree_builder.rs (new file, 127 lines)

@@ -0,0 +1,127 @@
//! SAX events to DOM tree conversion

use std::collections::BTreeMap;

use crate::Element;
use crate::prefixes::Prefixes;
use crate::token::{Attribute, LocalName, Token};

/// Tree-building parser state
pub struct TreeBuilder {
    /// Parsing stack
    stack: Vec<Element>,

    /// Namespace set stack by prefix
    prefixes_stack: Vec<Prefixes>,

    /// Document root element if finished
    pub root: Option<Element>,
}

impl TreeBuilder {
    /// Create a new one
    pub fn new() -> Self {
        TreeBuilder {
            stack: vec![],
            prefixes_stack: vec![],
            root: None,
        }
    }

    /// Stack depth
    pub fn depth(&self) -> usize {
        self.stack.len()
    }

    /// Pop the top-most element from the stack
    pub fn pop(&mut self) -> Option<Element> {
        self.prefixes_stack.pop();
        self.stack.pop()
    }

    /// Lookup XML namespace declaration for given prefix (or no prefix)
    fn lookup_prefix(&self, prefix: &Option<String>) -> Option<&str> {
        for nss in self.prefixes_stack.iter().rev() {
            if let Some(ns) = nss.get(prefix) {
                return Some(ns);
            }
        }

        None
    }

    fn process_start_tag(&mut self, name: LocalName, attrs: Vec<Attribute>) {
        let mut prefixes = Prefixes::default();
        let mut attributes = BTreeMap::new();
        for attr in attrs.into_iter() {
            match (attr.name.prefix, attr.name.name) {
                (None, xmlns) if xmlns == "xmlns" => {
                    prefixes.insert(None, attr.value);
                }
                (Some(xmlns), prefix) if xmlns == "xmlns" => {
                    prefixes.insert(Some(prefix), attr.value);
                }
                (Some(prefix), name) => {
                    attributes.insert(format!("{}:{}", prefix, name), attr.value);
                }
                (None, name) => {
                    attributes.insert(name, attr.value);
                }
            }
        }
        self.prefixes_stack.push(prefixes.clone());

        let el = Element::new(
            name.name,
            self.lookup_prefix(&name.prefix).unwrap_or("").to_owned(),
            Some(name.prefix),
            prefixes,
            attributes,
            vec![]
        );
        self.stack.push(el);
    }

    fn process_end_tag(&mut self) {
        if let Some(el) = self.pop() {
            if self.depth() > 0 {
                let top = self.stack.len() - 1;
                self.stack[top].append_child(el);
            } else {
                self.root = Some(el);
            }
        }
    }

    fn process_text(&mut self, text: String) {
        if self.depth() > 0 {
            let top = self.stack.len() - 1;
            self.stack[top].append_text_node(text);
        }
    }

    /// Process a Token that you got out of a Tokenizer
    pub fn process_token(&mut self, token: Token) {
        match token {
            Token::XmlDecl { .. } => {},

            Token::StartTag {
                name,
                attrs,
                self_closing: false,
            } => self.process_start_tag(name, attrs),

            Token::StartTag {
                name,
                attrs,
                self_closing: true,
            } => {
                self.process_start_tag(name, attrs);
                self.process_end_tag();
            }

            Token::EndTag { .. } =>
                self.process_end_tag(),

            Token::Text(text) =>
                self.process_text(text),
        }
    }
}
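Together with the tokenizer, this gives the push-style pipeline that the new Element::from_reader wraps: push bytes, pull tokens, feed each token to the builder, and take root once it is set. A minimal end-to-end sketch of that loop, reusing the Tokenizer API shown in the earlier hunks:

use minidom::tokenizer::Tokenizer;
use minidom::tree_builder::TreeBuilder;

fn main() {
    let mut tokenizer = Tokenizer::new();
    let mut tree_builder = TreeBuilder::new();

    // In from_reader these bytes would arrive in 64 KiB chunks from any Read.
    tokenizer.push(b"<foo xmlns='ns1'><bar>hello</bar></foo>");

    while let Some(token) = tokenizer.pull().expect("tokenizer error") {
        // Each SAX token either opens an element, closes one, or adds text.
        tree_builder.process_token(token);
    }

    let root = tree_builder.root.take().expect("incomplete document");
    assert_eq!(root.name(), "foo");
    assert_eq!(root.children().next().map(|child| child.name()), Some("bar"));
}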