From 8c1b3b9c4a86836aaf2099f6b7b7e2cc12920862 Mon Sep 17 00:00:00 2001
From: Astro <astro@spaceboyz.net>
Date: Tue, 22 Mar 2022 23:29:25 +0100
Subject: [PATCH] minidom: add tokenizer

---
 minidom/Cargo.toml       |   2 +
 minidom/src/lib.rs       |   4 +
 minidom/src/token.rs     | 288 +++++++++++++++++++++++++++++++++++++++
 minidom/src/tokenizer.rs | 102 ++++++++++++++
 4 files changed, 396 insertions(+)
 create mode 100644 minidom/src/token.rs
 create mode 100644 minidom/src/tokenizer.rs

diff --git a/minidom/Cargo.toml b/minidom/Cargo.toml
index bd951ed..1ae6631 100644
--- a/minidom/Cargo.toml
+++ b/minidom/Cargo.toml
@@ -22,3 +22,5 @@ gitlab = { repository = "xmpp-rs/xmpp-rs" }
 
 [dependencies]
 quick-xml = "0.22.0"
+nom = "7"
+bytes = "1"
diff --git a/minidom/src/lib.rs b/minidom/src/lib.rs
index 4ecf36f..d62691c 100644
--- a/minidom/src/lib.rs
+++ b/minidom/src/lib.rs
@@ -83,6 +83,8 @@ pub mod error;
 mod namespaces;
 pub mod node;
 mod prefixes;
+pub mod token;
+pub mod tokenizer;
 
 #[cfg(test)]
 mod tests;
@@ -92,3 +94,5 @@ pub use element::{Children, ChildrenMut, Element, ElementBuilder};
 pub use error::{Error, Result};
 pub use namespaces::NSChoice;
 pub use node::Node;
+pub use token::Token;
+pub use tokenizer::{Tokenizer, TokenizerError};
diff --git a/minidom/src/token.rs b/minidom/src/token.rs
new file mode 100644
index 0000000..853f64f
--- /dev/null
+++ b/minidom/src/token.rs
@@ -0,0 +1,288 @@
+//! Parsed XML token
+
+use nom::{
+    branch::alt,
+    bytes::streaming::{tag, take_while1},
+    character::{is_space, streaming::{char, digit1, one_of, space0}},
+    combinator::{not, peek, value},
+    multi::many0,
+    number::streaming::hex_u32,
+    IResult,
+};
+
+/// Parsed XML token
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Token {
+    /// XML element opening tag
+    StartTag {
+        /// Element name
+        name: String,
+        /// List of attributes
+        attrs: Vec<(String, String)>,
+        /// Is this tag self-closing (`/>`)?
+        self_closing: bool,
+    },
+    /// XML element closing tag
+    EndTag {
+        /// Element name
+        name: String,
+    },
+    /// Child text
+    Text(String),
+}
+
+impl Token {
+    /// Parse one token.
+    ///
+    /// All sub-parsers come from `nom`'s *streaming* flavor: running
+    /// out of input yields `Err(nom::Err::Incomplete(_))` rather than
+    /// a false negative, so a caller may buffer more bytes and retry.
+    pub fn parse(s: &[u8]) -> IResult<&[u8], Token> {
+        alt((
+            Self::parse_tag,
+            |s| {
+                // Anything that is not a tag is child text up to the
+                // next `<`.
+                let (s, text) = Self::parse_text('<', s)?;
+                Ok((s, Token::Text(text)))
+            },
+        ))(s)
+    }
+
+    /// Parse anything that starts with `<`: a CDATA section, an end
+    /// tag, or a start tag (possibly self-closing).
+    fn parse_tag(s: &[u8]) -> IResult<&[u8], Token> {
+        let (s, _) = tag("<")(s)?;
+        alt((|s| -> IResult<&[u8], Token> {
+            // <![CDATA[ ... ]]> — verbatim text, no entity expansion
+            let (s, _) = tag("![CDATA[")(s)?;
+            // Scan for the "]]>" terminator. `windows(3)` yields
+            // nothing for slices shorter than three bytes, so this
+            // cannot underflow on a short streaming buffer (the old
+            // `0..s.len() - 2` loop panicked for len < 2).
+            match s.windows(3).position(|w| w == b"]]>") {
+                Some(end) => {
+                    let text = Self::str_from_utf8(&s[..end])?;
+                    Ok((&s[end + 3..], Token::Text(text.to_string())))
+                }
+                None => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
+            }
+        }, |s| {
+            // </ name >
+            let (s, _) = tag("/")(s)?;
+            let (s, _) = space0(s)?;
+            let (s, name) = take_while1(|b| !(is_space(b) || b == b'>'))(s)?;
+            let (s, _) = space0(s)?;
+            let (s, _) = tag(">")(s)?;
+            let name = Self::str_from_utf8(name)?;
+            Ok((s, Token::EndTag { name: name.to_string() }))
+        }, |s| {
+            // <name attr='value' ...> or <name ... />
+            let (s, _) = space0(s)?;
+            let (s, name) = take_while1(|b| !(is_space(b) || b == b'>' || b == b'/'))(s)?;
+            let mut attrs = vec![];
+            let mut self_closing = false;
+            let mut s_ = s;
+            loop {
+                let (s, _) = space0(s_)?;
+                let (s, attr) = alt((|s| {
+                    // `/>` terminates a self-closing tag
+                    let (s, _) = tag("/")(s)?;
+                    let (s, _) = space0(s)?;
+                    let (s, _) = tag(">")(s)?;
+                    self_closing = true;
+                    Ok((s, None))
+                }, |s| {
+                    // plain `>` terminates the start tag
+                    let (s, _) = tag(">")(s)?;
+                    Ok((s, None))
+                }, |s| {
+                    // otherwise expect one more attribute
+                    let (s, (name, value)) = Self::parse_attr(s)?;
+                    Ok((s, Some((name, value))))
+                }))(s)?;
+                s_ = s;
+                if let Some(attr) = attr {
+                    attrs.push(attr);
+                } else {
+                    break;
+                }
+            }
+            Ok((s_, Token::StartTag {
+                name: Self::str_from_utf8(name)?
+                    .to_owned(),
+                attrs: attrs.into_iter()
+                    .map(|(name, value)| (name.to_owned(), value.to_owned()))
+                    .collect(),
+                self_closing,
+            }))
+        }))(s)
+    }
+
+    /// Parse one `name='value'` attribute. Either quote character is
+    /// accepted as delimiter; entities inside the value are resolved.
+    fn parse_attr(s: &[u8]) -> IResult<&[u8], (&str, String)> {
+        let (s, name) = take_while1(|b| !(is_space(b) || b == b'='))(s)?;
+        let name = Self::str_from_utf8(name)?;
+        let (s, _) = space0(s)?;
+        let (s, _) = tag("=")(s)?;
+        let (s, _) = space0(s)?;
+        let (s, delim) = one_of("'\"")(s)?;
+        let (s, value) = Self::parse_text(delim, s)?;
+        let (s, _) = char(delim)(s)?;
+        Ok((s, (name, value)))
+    }
+
+    /// Parse text up to (but not consuming) the `until` delimiter,
+    /// resolving numeric (`&#…;`, `&#x…;`) and the five predefined
+    /// named character references along the way.
+    fn parse_text(until: char, s: &[u8]) -> IResult<&[u8], String> {
+        let (s, results) = many0(
+            alt(
+                (|s| {
+                    // decimal character reference
+                    let (s, _) = tag("&#")(s)?;
+                    let (s, num) = digit1(s)?;
+                    let (s, _) = char(';')(s)?;
+                    let num: u32 = Self::str_from_utf8(num)?
+                        .parse()
+                        .map_err(|_| nom::Err::Failure(nom::error::Error::new(s, nom::error::ErrorKind::Fail)))?;
+                    if let Some(c) = std::char::from_u32(num) {
+                        Ok((s, c.to_string()))
+                    } else {
+                        // invalid codepoints are silently dropped
+                        Ok((s, String::new()))
+                    }
+                }, |s| {
+                    // hexadecimal character reference; tried after the
+                    // decimal branch, which backtracks on the `x`
+                    let (s, _) = tag("&#x")(s)?;
+                    let (s, num) = hex_u32(s)?;
+                    let (s, _) = char(';')(s)?;
+                    if let Some(c) = std::char::from_u32(num) {
+                        Ok((s, c.to_string()))
+                    } else {
+                        Ok((s, String::new()))
+                    }
+                }, |s| {
+                    // the five predefined XML entities
+                    let (s, _) = char('&')(s)?;
+                    let (s, c) = alt((
+                        value('&', tag("amp")),
+                        value('<', tag("lt")),
+                        value('>', tag("gt")),
+                        value('"', tag("quot")),
+                        value('\'', tag("apos")),
+                    ))(s)?;
+                    let (s, _) = char(';')(s)?;
+                    Ok((s, c.to_string()))
+                }, |s| {
+                    // plain text run up to the delimiter or the next
+                    // entity reference
+                    let (s, _) = not(peek(char(until)))(s)?;
+                    let (s, text) = take_while1(|b| b != until as u8 && b != b'&')(s)?;
+                    let text = Self::str_from_utf8(text)?;
+                    // TODO: CoW
+                    Ok((s, text.to_owned()))
+                })
+            )
+        )(s)?;
+
+        let result = results.join("");
+        Ok((s, result))
+    }
+
+    /// Convert raw bytes to `&str`, mapping UTF-8 errors to an
+    /// unrecoverable `nom::Err::Failure`.
+    fn str_from_utf8(s: &[u8]) -> Result<&str, nom::Err<nom::error::Error<&[u8]>>> {
+        std::str::from_utf8(s)
+            .map_err(|_| nom::Err::Failure(nom::error::Error::new(s, nom::error::ErrorKind::Fail)))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_text() {
+        assert_eq!(
+            Ok((&b""[..], Token::Text("'<foo&bar>'".to_string()))),
+            Token::parse(b"<![CDATA['<foo&bar>']]>")
+        );
+    }
+
+    #[test]
+    fn test_tag() {
+        assert_eq!(
+            Ok((&b""[..], Token::StartTag {
+                name: "foobar".to_string(),
+                attrs: vec![],
+                self_closing: false,
+            })),
+            Token::parse(b"<foobar>")
+        );
+    }
+
+    #[test]
+    fn test_attrs() {
+        assert_eq!(
+            Ok((&b""[..], Token::StartTag {
+                name: "a".to_string(),
+                attrs: vec![
+                    ("a".to_owned(), "2'3".to_owned()),
+                    ("b".to_owned(), "4\"2".to_owned()),
+                    ("c".to_owned(), "".to_owned()),
+                ],
+                self_closing: false,
+            })),
+            Token::parse(b"<a a=\"2'3\" b='4\"2' c=''>")
+        );
+    }
+
+    #[test]
+    fn test_attrs_entities() {
+        assert_eq!(
+            Ok((&b""[..], Token::StartTag {
+                name: "a".to_string(),
+                attrs: vec![
+                    ("a".to_owned(), "<3".to_owned()),
+                ],
+                self_closing: false,
+            })),
+            Token::parse(b"<a a='&lt;3'>")
+        );
+    }
+
+    #[test]
+    fn test_self_closing_tag() {
+        assert_eq!(
+            Ok((&b""[..], Token::StartTag {
+                name: "foobar".to_string(),
+                attrs: vec![],
+                self_closing: true,
+            })),
+            Token::parse(b"<foobar/>")
+        );
+    }
+
+    #[test]
+    fn test_end_tag() {
+        assert_eq!(
+            Ok((&b""[..], Token::EndTag {
+                name: "foobar".to_string(),
+            })),
+            Token::parse(b"</foobar>")
+        );
+    }
+
+    // TODO:
+    // - DOCTYPE
+    // - xmldecl
+}
diff --git a/minidom/src/tokenizer.rs b/minidom/src/tokenizer.rs
new file mode 100644
index 0000000..e35b228
--- /dev/null
+++ b/minidom/src/tokenizer.rs
@@ -0,0 +1,102 @@
+//! Streaming tokenizer (SAX parser)
+
+use bytes::BytesMut;
+use super::Token;
+
+/// `Result::Err` type returned from `Tokenizer`
+pub type TokenizerError = nom::error::Error<()>;
+
+/// Streaming tokenizer (SAX parser)
+pub struct Tokenizer {
+    buffer: BytesMut,
+}
+
+impl Tokenizer {
+    /// Construct a new tokenizer
+    pub fn new() -> Self {
+        Tokenizer {
+            buffer: BytesMut::new(),
+        }
+    }
+
+    /// Add content to the inner buffer
+    pub fn push(&mut self, bytes: &[u8]) {
+        self.buffer.extend_from_slice(bytes);
+    }
+
+    /// Is the internal buffer empty?
+    pub fn is_empty(&self) -> bool {
+        self.buffer.is_empty()
+    }
+
+    /// Parse the next document fragment.
+    ///
+    /// Returns `Ok(None)` when the buffered input is still
+    /// incomplete; `push()` more bytes and call `pull()` again.
+    pub fn pull(&mut self) -> Result<Option<Token>, TokenizerError> {
+        // We cannot return an error whose location info borrows the
+        // buffer we still want to mutate, so strip the location.
+        fn erase_location<T>(e: nom::error::Error<T>) -> TokenizerError {
+            nom::error::Error {
+                input: (),
+                code: e.code,
+            }
+        }
+
+        let result: Option<(usize, Token)> = match Token::parse(&self.buffer) {
+            Ok((s, token)) =>
+                Some((s.len(), token)),
+            Result::Err(nom::Err::Incomplete(_)) =>
+                None,
+            Result::Err(nom::Err::Error(e)) =>
+                return Err(erase_location(e)),
+            Result::Err(nom::Err::Failure(e)) =>
+                return Err(erase_location(e)),
+        };
+        match result {
+            Some((s_len, token)) => {
+                // Drop the consumed front of the buffer; `s_len` is
+                // what the parser left over.
+                let _ = self.buffer.split_to(self.buffer.len() - s_len);
+                Ok(Some(token))
+            }
+            None => Ok(None),
+        }
+    }
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test() {
+        // Feed the input in chunks of every possible size to exercise
+        // all Incomplete/retry paths of the streaming parser.
+        fn run(chunk_size: usize, buf: &[u8]) -> Vec<Token> {
+            let mut tokenizer = Tokenizer::new();
+            let mut tokens = vec![];
+
+            let mut pos = 0;
+            while pos < buf.len() {
+                tokenizer.push(&buf[pos..(pos + chunk_size).min(buf.len())]);
+                pos += chunk_size;
+
+                while let Some(token) = tokenizer.pull().unwrap() {
+                    tokens.push(token)
+                }
+            }
+
+            tokens
+        }
+
+        let buf = b"<foo bar='baz'>quux</foo>";
+        for chunk_size in 1..=buf.len() {
+            assert_eq!(vec![
+                Token::StartTag {
+                    name: "foo".to_owned(),
+                    attrs: vec![("bar".to_owned(), "baz".to_owned())],
+                    self_closing: false,
+                },
+                Token::Text("quux".to_owned()),
+                Token::EndTag {
+                    name: "foo".to_owned(),
+                },
+            ], run(chunk_size, buf));
+        }
+    }
+}