mirror of
https://gitlab.com/xmpp-rs/xmpp-rs.git
synced 2024-07-12 22:21:53 +00:00
minidom: add tokenizer
This commit is contained in:
parent
8e2224bc0f
commit
8c1b3b9c4a
4 changed files with 396 additions and 0 deletions
|
@ -22,3 +22,5 @@ gitlab = { repository = "xmpp-rs/xmpp-rs" }
|
|||
|
||||
[dependencies]
|
||||
quick-xml = "0.22.0"
|
||||
nom = "7"
|
||||
bytes = "1"
|
||||
|
|
|
@ -83,6 +83,8 @@ pub mod error;
|
|||
mod namespaces;
|
||||
pub mod node;
|
||||
mod prefixes;
|
||||
pub mod token;
|
||||
pub mod tokenizer;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
@ -92,3 +94,5 @@ pub use element::{Children, ChildrenMut, Element, ElementBuilder};
|
|||
pub use error::{Error, Result};
|
||||
pub use namespaces::NSChoice;
|
||||
pub use node::Node;
|
||||
pub use token::Token;
|
||||
pub use tokenizer::{Tokenizer, TokenizerError};
|
||||
|
|
288
minidom/src/token.rs
Normal file
288
minidom/src/token.rs
Normal file
|
@ -0,0 +1,288 @@
|
|||
//! Parsed XML token
|
||||
|
||||
use nom::{
|
||||
branch::alt,
|
||||
bytes::streaming::{tag, take_while1},
|
||||
character::{is_space, streaming::{char, digit1, one_of, space0}},
|
||||
combinator::{not, peek, value},
|
||||
multi::many0,
|
||||
number::streaming::hex_u32,
|
||||
IResult,
|
||||
};
|
||||
|
||||
/// Parsed XML token
///
/// Text content (both `Text` and attribute values) is produced by
/// `parse_text`, which decodes the predefined XML entities and
/// numeric character references.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Token {
    /// XML element opening tag
    StartTag {
        /// Element name
        name: String,
        /// List of attributes as `(name, value)` pairs, in document order
        attrs: Vec<(String, String)>,
        /// Is this tag self-closing (`/>`)?
        self_closing: bool,
    },
    /// XML element closing tag
    EndTag {
        /// Element name
        name: String,
    },
    /// Child text
    Text(String),
}
|
||||
|
||||
impl Token {
|
||||
/// Parse one token
|
||||
pub fn parse(s: &[u8]) -> IResult<&[u8], Token> {
|
||||
alt((
|
||||
Self::parse_tag,
|
||||
|s| {
|
||||
let (s, text) = Self::parse_text('<', s)?;
|
||||
Ok((s, Token::Text(text)))
|
||||
},
|
||||
))(s)
|
||||
}
|
||||
|
||||
fn parse_tag(s: &[u8]) -> IResult<&[u8], Token> {
|
||||
let (s, _) = tag("<")(s)?;
|
||||
alt((|s| -> IResult<&[u8], Token> {
|
||||
let (s, _) = tag("![CDATA[")(s)?;
|
||||
let mut end = None;
|
||||
for i in 0..s.len() - 2 {
|
||||
if &s[i..i + 3] == b"]]>" {
|
||||
end = Some(i);
|
||||
break
|
||||
}
|
||||
}
|
||||
if let Some(end) = end {
|
||||
let text = Self::str_from_utf8(&s[..end])?;
|
||||
Ok((&s[end + 3..], Token::Text(text.to_string())))
|
||||
} else {
|
||||
Err(nom::Err::Incomplete(nom::Needed::Unknown))
|
||||
}
|
||||
}, |s| {
|
||||
let (s, _) = tag("/")(s)?;
|
||||
let (s, _) = space0(s)?;
|
||||
let (s, name) = take_while1(|b| !(is_space(b) || b == b'>'))(s)?;
|
||||
let (s, _) = space0(s)?;
|
||||
let (s, _) = tag(">")(s)?;
|
||||
let name = Self::str_from_utf8(name)?;
|
||||
Ok((s, Token::EndTag { name: name.to_string() }))
|
||||
}, |s| {
|
||||
let (s, _) = space0(s)?;
|
||||
let (s, name) = take_while1(|b| !(is_space(b) || b == b'>' || b == b'/'))(s)?;
|
||||
let mut attrs = vec![];
|
||||
let mut self_closing = false;
|
||||
let mut s_ = s;
|
||||
loop {
|
||||
let (s, _) = space0(s_)?;
|
||||
let (s, attr) = alt((|s| {
|
||||
let (s, _) = tag("/")(s)?;
|
||||
let (s, _) = space0(s)?;
|
||||
let (s, _) = tag(">")(s)?;
|
||||
self_closing = true;
|
||||
Ok((s, None))
|
||||
}, |s| {
|
||||
let (s, _) = tag(">")(s)?;
|
||||
Ok((s, None))
|
||||
}, |s| {
|
||||
let (s, (name, value)) = Self::parse_attr(s)?;
|
||||
Ok((s, Some((name, value))))
|
||||
}))(s)?;
|
||||
s_ = s;
|
||||
if let Some(attr) = attr {
|
||||
attrs.push(attr);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok((s_, Token::StartTag {
|
||||
name: Self::str_from_utf8(name)?
|
||||
.to_owned(),
|
||||
attrs: attrs.into_iter()
|
||||
.map(|(name, value)| (name.to_owned(), value.to_owned()))
|
||||
.collect(),
|
||||
self_closing,
|
||||
}))
|
||||
}))(s)
|
||||
}
|
||||
|
||||
fn parse_attr(s: &[u8]) -> IResult<&[u8], (&str, String)> {
|
||||
let (s, name) = take_while1(|b| !(is_space(b) || b == b'='))(s)?;
|
||||
let name = Self::str_from_utf8(name)?;
|
||||
let (s, _) = space0(s)?;
|
||||
let (s, _) = tag("=")(s)?;
|
||||
let (s, _) = space0(s)?;
|
||||
let (s, delim) = one_of("'\"")(s)?;
|
||||
let (s, value) = Self::parse_text(delim, s)?;
|
||||
let (s, _) = char(delim)(s)?;
|
||||
Ok((s, (name, value)))
|
||||
}
|
||||
|
||||
fn parse_text(until: char, s: &[u8]) -> IResult<&[u8], String> {
|
||||
let (s, results) = many0(
|
||||
alt(
|
||||
(|s| {
|
||||
let (s, _) = tag("&#")(s)?;
|
||||
let (s, num) = digit1(s)?;
|
||||
let (s, _) = char(';')(s)?;
|
||||
let num: u32 = Self::str_from_utf8(num)?
|
||||
.parse()
|
||||
.map_err(|_| nom::Err::Failure(nom::error::Error::new(s, nom::error::ErrorKind::Fail)))?;
|
||||
if let Some(c) = std::char::from_u32(num) {
|
||||
Ok((s, format!("{}", c)))
|
||||
} else {
|
||||
Ok((s, format!("")))
|
||||
}
|
||||
}, |s| {
|
||||
let (s, _) = tag("&#x")(s)?;
|
||||
let (s, num) = hex_u32(s)?;
|
||||
let (s, _) = char(';')(s)?;
|
||||
if let Some(c) = std::char::from_u32(num) {
|
||||
Ok((s, format!("{}", c)))
|
||||
} else {
|
||||
Ok((s, format!("")))
|
||||
}
|
||||
}, |s| {
|
||||
let (s, _) = char('&')(s)?;
|
||||
let (s, c) = alt((
|
||||
value('&', tag("amp")),
|
||||
value('<', tag("lt")),
|
||||
value('>', tag("gt")),
|
||||
value('"', tag("quot")),
|
||||
value('\'', tag("apos")),
|
||||
))(s)?;
|
||||
let (s, _) = char(';')(s)?;
|
||||
Ok((s, format!("{}", c)))
|
||||
}, |s| {
|
||||
let (s, _) = not(peek(char(until)))(s)?;
|
||||
let (s, text) = take_while1(|b| b != until as u8 && b != b'&')(s)?;
|
||||
let text = Self::str_from_utf8(text)?;
|
||||
// TODO: CoW
|
||||
Ok((s, text.to_owned()))
|
||||
})
|
||||
)
|
||||
)(s)?;
|
||||
|
||||
let result = results.join("");
|
||||
Ok((s, result))
|
||||
}
|
||||
|
||||
fn str_from_utf8(s: &[u8]) -> Result<&str, nom::Err<nom::error::Error<&[u8]>>> {
|
||||
std::str::from_utf8(s)
|
||||
.map_err(|_| nom::Err::Failure(nom::error::Error::new(s, nom::error::ErrorKind::Fail)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text() {
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("foobar".to_string()))),
            Token::parse(b"foobar</x")
        );
    }

    #[test]
    fn test_text_entities() {
        // The five predefined XML entities are decoded in child text.
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("\"<foo&bar>'".to_string()))),
            Token::parse(b"&quot;&lt;foo&amp;bar&gt;&apos;</x")
        );
    }

    #[test]
    fn test_text_entities_decimal() {
        // Decimal character references: CR LF
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("foo\r\n".to_string()))),
            Token::parse(b"foo&#13;&#10;</x")
        );
    }

    #[test]
    fn test_text_entities_hexadecimal() {
        // Hexadecimal character references: CR LF
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("foo\r\n".to_string()))),
            Token::parse(b"foo&#xD;&#xA;</x")
        );
    }

    #[test]
    fn test_cdata() {
        // CDATA contents pass through without entity decoding.
        assert_eq!(
            Ok((&b""[..], Token::Text("<a href='>".to_string()))),
            Token::parse(b"<![CDATA[<a href='>]]>")
        );
    }

    #[test]
    fn test_tag() {
        assert_eq!(
            Ok((&b""[..], Token::StartTag {
                name: "foobar".to_string(),
                attrs: vec![],
                self_closing: false,
            })),
            Token::parse(b"<foobar>")
        );
    }

    #[test]
    fn test_attrs() {
        // Mixed quote styles, optional whitespace around `=`, empty value.
        assert_eq!(
            Ok((&b""[..], Token::StartTag {
                name: "a".to_string(),
                attrs: vec![
                    ("a".to_owned(), "2'3".to_owned()),
                    ("b".to_owned(), "4\"2".to_owned()),
                    ("c".to_owned(), "".to_owned()),
                ],
                self_closing: false,
            })),
            Token::parse(b"<a a=\"2'3\" b = '4\"2' c = ''>")
        );
    }

    #[test]
    fn test_attrs_entities() {
        // Entities are decoded inside attribute values too.
        assert_eq!(
            Ok((&b""[..], Token::StartTag {
                name: "a".to_string(),
                attrs: vec![
                    ("a".to_owned(), "<3".to_owned()),
                ],
                self_closing: false,
            })),
            Token::parse(b"<a a='&lt;3'>")
        );
    }

    #[test]
    fn test_self_closing_tag() {
        assert_eq!(
            Ok((&b""[..], Token::StartTag {
                name: "foobar".to_string(),
                attrs: vec![],
                self_closing: true,
            })),
            Token::parse(b"<foobar/>")
        );
    }

    #[test]
    fn test_end_tag() {
        assert_eq!(
            Ok((&b""[..], Token::EndTag {
                name: "foobar".to_string(),
            })),
            Token::parse(b"</foobar>")
        );
    }

    // TODO:
    // - DOCTYPE
    // - xmldecl
}
|
102
minidom/src/tokenizer.rs
Normal file
102
minidom/src/tokenizer.rs
Normal file
|
@ -0,0 +1,102 @@
|
|||
//! Streaming tokenizer (SAX parser)
|
||||
|
||||
use bytes::BytesMut;
|
||||
use super::Token;
|
||||
|
||||
/// `Result::Err` type returned from `Tokenizer`
///
/// The error input type is `()`: location information is erased so the
/// error does not borrow from the tokenizer's internal buffer.
pub type TokenizerError = nom::error::Error<()>;

/// Streaming tokenizer (SAX parser)
pub struct Tokenizer {
    // Accumulates pushed bytes until they form at least one complete token.
    buffer: BytesMut,
}
|
||||
|
||||
impl Tokenizer {
|
||||
/// Construct a new tokenizer
|
||||
pub fn new() -> Self {
|
||||
Tokenizer {
|
||||
buffer: BytesMut::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add content to the inner buffer
|
||||
pub fn push(&mut self, bytes: &[u8]) {
|
||||
self.buffer.extend_from_slice(bytes);
|
||||
}
|
||||
|
||||
/// Is the internal buffer empty?
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.buffer.is_empty()
|
||||
}
|
||||
|
||||
/// Parse the next document fragment
|
||||
pub fn pull(&mut self) -> Result<Option<Token>, TokenizerError> {
|
||||
/// cannot return an error with location info that points to
|
||||
/// our buffer that we still want to mutate
|
||||
fn erase_location<T>(e: nom::error::Error<T>) -> TokenizerError {
|
||||
nom::error::Error {
|
||||
input: (),
|
||||
code: e.code,
|
||||
}
|
||||
}
|
||||
|
||||
let result: Option<(usize, Token)> = { match Token::parse(&self.buffer) {
|
||||
Ok((s, token)) =>
|
||||
Some((s.len(), token)),
|
||||
Result::Err(nom::Err::Incomplete(_)) =>
|
||||
None,
|
||||
Result::Err(nom::Err::Error(e)) =>
|
||||
return Err(erase_location(e)),
|
||||
Result::Err(nom::Err::Failure(e)) =>
|
||||
return Err(erase_location(e)),
|
||||
} };
|
||||
match result {
|
||||
Some((s_len, token)) => {
|
||||
let _ = self.buffer.split_to(self.buffer.len() - s_len);
|
||||
Ok(Some(token))
|
||||
}
|
||||
None => Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test() {
|
||||
fn run(chunk_size: usize, buf: &[u8]) -> Vec<Token> {
|
||||
let mut tokenizer = Tokenizer::new();
|
||||
let mut tokens = vec![];
|
||||
|
||||
let mut pos = 0;
|
||||
while pos < buf.len() {
|
||||
tokenizer.push(&buf[pos..(pos + chunk_size).min(buf.len())]);
|
||||
pos += chunk_size;
|
||||
|
||||
while let Some(token) = tokenizer.pull().unwrap() {
|
||||
tokens.push(token)
|
||||
}
|
||||
}
|
||||
|
||||
tokens
|
||||
}
|
||||
|
||||
let buf = b"<foo bar='baz'>quux</foo>";
|
||||
for chunk_size in 1..=buf.len() {
|
||||
assert_eq!(vec![
|
||||
Token::StartTag {
|
||||
name: "foo".to_owned(),
|
||||
attrs: vec![("bar".to_owned(), "baz".to_owned())],
|
||||
self_closing: false,
|
||||
},
|
||||
Token::Text("quux".to_owned()),
|
||||
Token::EndTag {
|
||||
name: "foo".to_owned(),
|
||||
},
|
||||
], run(chunk_size, buf));
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue