From 7ebfe3e88100fd91df72d6d25003f0a290a63b65 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Sun, 25 Aug 2019 19:01:51 +0200 Subject: [PATCH] New XHTML-IM parser (XEP-0071). --- ChangeLog | 3 +- doap.xml | 8 + src/lib.rs | 3 + src/ns.rs | 5 + src/xhtml.rs | 472 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 490 insertions(+), 1 deletion(-) create mode 100644 src/xhtml.rs diff --git a/ChangeLog b/ChangeLog index ea8f73c..eec3bbb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,9 @@ Version NEXT: DATE Emmanuel Gil Peyrot * New parsers/serialisers: - - Message Carbons (XEP-0280) + - XHTML-IM (XEP-0071) - Bits of Binary (XEP-0231) + - Message Carbons (XEP-0280) * Breaking changes: - Stop reexporting TryFrom and TryInto, they are available in std::convert nowadays. diff --git a/doap.xml b/doap.xml index b68a716..a3165dc 100644 --- a/doap.xml +++ b/doap.xml @@ -111,6 +111,14 @@ 0.5.0 + + + + complete + 1.5.4 + NEXT + + diff --git a/src/lib.rs b/src/lib.rs index 3a4a5dc..af2fc12 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -75,6 +75,9 @@ pub mod rsm; /// XEP-0060: Publish-Subscribe pub mod pubsub; +/// XEP-0071: XHTML-IM +pub mod xhtml; + /// XEP-0077: In-Band Registration pub mod ibr; diff --git a/src/ns.rs b/src/ns.rs index f5775d5..5c59ec0 100644 --- a/src/ns.rs +++ b/src/ns.rs @@ -53,6 +53,11 @@ pub const PUBSUB_EVENT: &str = "http://jabber.org/protocol/pubsub#event"; /// XEP-0060: Publish-Subscribe pub const PUBSUB_OWNER: &str = "http://jabber.org/protocol/pubsub#owner"; +/// XEP-0071: XHTML-IM +pub const XHTML_IM: &str = "http://jabber.org/protocol/xhtml-im"; +/// XEP-0071: XHTML-IM +pub const XHTML: &str = "http://www.w3.org/1999/xhtml"; + /// XEP-0077: In-Band Registration pub const REGISTER: &str = "jabber:iq:register"; diff --git a/src/xhtml.rs b/src/xhtml.rs new file mode 100644 index 0000000..c4fc0c0 --- /dev/null +++ b/src/xhtml.rs @@ -0,0 +1,472 @@ +// Copyright (c) 2019 Emmanuel Gil Peyrot +// +// This Source Code Form is 
subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +use crate::util::error::Error; +use crate::message::MessagePayload; +use crate::ns; +use minidom::{Element, Node}; +use std::convert::TryFrom; +use std::collections::HashMap; + +// TODO: Use a proper lang type. +type Lang = String; + +/// Container for formatted text. +#[derive(Debug, Clone)] +pub struct XhtmlIm { + /// Map of language to body element. + bodies: HashMap, +} + +impl XhtmlIm { + /// Serialise formatted text to HTML. + pub fn to_html(self) -> String { + let mut html = Vec::new(); + // TODO: use the best language instead. + /* NOTE(review): only the first entry yielded by the map is serialised (see the break below); HashMap iteration order is unspecified, so with several bodies the choice is arbitrary. */ for (lang, body) in self.bodies { + if let Tag::Body { style: _, xml_lang, children } = body { + /* NOTE(review): both assertions panic if the map key and the xml_lang stored in the body disagree; try_from below derives both from the same xml:lang attribute, so they presumably hold for parsed input - confirm. */ if lang.is_empty() { + assert!(xml_lang.is_none()); + } else { + assert_eq!(Some(lang), xml_lang); + } + for tag in children { + html.push(tag.to_html()); + } + break; + } else { + unreachable!(); + } + } + html.concat() + } +} + +impl MessagePayload for XhtmlIm {} + +impl TryFrom for XhtmlIm { + type Error = Error; + + /* Accepts only body children in ns::XHTML; each body is stored under its xml:lang value (the empty string when the attribute is absent) and a repeated key is rejected with a ParseError. */ fn try_from(elem: Element) -> Result { + check_self!(elem, "html", XHTML_IM); + check_no_attributes!(elem, "html"); + + let mut bodies = HashMap::new(); + for child in elem.children() { + if child.is("body", ns::XHTML) { + let child = child.clone(); + let lang = match child.attr("xml:lang") { + Some(lang) => lang, + None => "", + }.to_string(); + let body = Tag::try_from(child)?; + match bodies.insert(lang, body) { + None => (), + Some(_) => return Err(Error::ParseError("Two identical language bodies found in XHTML-IM.")) + } + } else { + return Err(Error::ParseError("Unknown element in XHTML-IM.")); + } + } + + Ok(XhtmlIm { bodies }) + } +} + +impl From for Element { + fn from(wrapper: XhtmlIm) -> Element { + Element::builder("html") + .ns(ns::XHTML_IM) + .append(wrapper.bodies.into_iter().map(|(ref lang, ref body)| { + if let Tag::Body { style, 
xml_lang, children } = body { + /* NOTE(review): a body parsed without xml:lang is keyed by the empty string with xml_lang set to None, so Some(lang) differs from None here and this assertion would panic - confirm. */ assert_eq!(Some(lang), xml_lang.as_ref()); + Element::builder("body") + /* NOTE(review): this emits the body in ns::XHTML_IM, while try_from only accepts body children in ns::XHTML - looks inconsistent, confirm. */ .ns(ns::XHTML_IM) + .attr("style", get_style_string(style.clone())) + .attr("xml:lang", xml_lang.clone()) + .append(children_to_nodes(children.clone())) + } else { + unreachable!(); + } + }).collect::>()) + .build() + } +} + +/* A node inside an element: either a nested tag or a run of text. */ #[derive(Debug, Clone)] +enum Child { + Tag(Tag), + Text(String), +} + +impl Child { + fn to_html(self) -> String { + match self { + Child::Tag(tag) => tag.to_html(), + /* NOTE(review): text is returned as stored, with no escaping applied in this function. */ Child::Text(text) => text, + } + } +} + +#[derive(Debug, Clone)] +struct Property { + key: String, + value: String, +} + +type Css = Vec; + +/* Renders each property as key: value and joins them with a semicolon and a space; returns None when there are no properties. */ fn get_style_string(style: Css) -> Option { + let mut result = vec![]; + for Property { key, value } in style { + result.push(format!("{}: {}", key, value)); + } + if result.is_empty() { + return None; + } + Some(result.join("; ")) +} + +/* Parsed element tree. Unknown carries only a list of children, with no element name or attributes. */ #[derive(Debug, Clone)] +enum Tag { + A { href: Option, style: Css, type_: Option, children: Vec }, + Blockquote { style: Css, children: Vec }, + Body { style: Css, xml_lang: Option, children: Vec }, + Br, + Cite { style: Css, children: Vec }, + Em { children: Vec }, + Img { src: Option, alt: Option }, // TODO: height, width, style + Li { style: Css, children: Vec }, + Ol { style: Css, children: Vec }, + P { style: Css, children: Vec }, + Span { style: Css, children: Vec }, + Strong { children: Vec }, + Ul { style: Css, children: Vec }, + Unknown(Vec), +} + +impl Tag { + /* NOTE(review): as shown here the format strings carry fewer placeholders than arguments (for example one placeholder for four arguments in the A arm); the markup inside them appears to have been lost from this copy of the patch - compare with the repository before relying on it. */ fn to_html(self) -> String { + match self { + Tag::A { href, style, type_, children } => { + let href = write_attr(href, "href"); + let style = write_attr(get_style_string(style), "style"); + let type_ = write_attr(type_, "type"); + format!("{}", href, style, type_, children_to_html(children)) + }, + Tag::Blockquote { style, children } => { + let style = write_attr(get_style_string(style), "style"); + format!("{}", style, children_to_html(children)) + }, + Tag::Body { style, xml_lang: _, children } => { + let style = 
write_attr(get_style_string(style), "style"); + format!("{}", style, children_to_html(children)) + }, + Tag::Br => String::from("
"), + Tag::Cite { style, children } => { + let style = write_attr(get_style_string(style), "style"); + format!("{}", style, children_to_html(children)) + }, + Tag::Em { children } => format!("{}", children_to_html(children)), + Tag::Img { src, alt } => { + let src = write_attr(src, "src"); + let alt = write_attr(alt, "alt"); + format!("", src, alt) + } + Tag::Li { style, children } => { + let style = write_attr(get_style_string(style), "style"); + format!("{}", style, children_to_html(children)) + } + Tag::Ol { style, children } => { + let style = write_attr(get_style_string(style), "style"); + format!("{}", style, children_to_html(children)) + } + Tag::P { style, children } => { + let style = write_attr(get_style_string(style), "style"); + format!("{}

", style, children_to_html(children)) + } + Tag::Span { style, children } => { + let style = write_attr(get_style_string(style), "style"); + format!("{}", style, children_to_html(children)) + } + Tag::Strong { children } => format!("{}", children.into_iter().map(|child| child.to_html()).collect::>().join("")), + Tag::Ul { style, children } => { + let style = write_attr(get_style_string(style), "style"); + format!("{}", style, children_to_html(children)) + } + /* Unknown elements contribute only the HTML of their children. */ Tag::Unknown(children) => children_to_html(children), + } + } +} + +impl TryFrom for Tag { + type Error = Error; + + /* Converts every child node recursively, then picks the variant from the element name alone; the namespace is not inspected in this function and any other name becomes Unknown. */ fn try_from(elem: Element) -> Result { + let mut children = vec![]; + for child in elem.nodes() { + match child { + Node::Element(child) => children.push(Child::Tag(Tag::try_from(child.clone())?)), + Node::Text(text) => children.push(Child::Text(text.clone())), + /* NOTE(review): a comment node in the input reaches unimplemented!() and panics. */ Node::Comment(_) => unimplemented!() // XXX: remove! + } + } + + Ok(match elem.name() { + "a" => Tag::A { href: elem.attr("href").map(|href| href.to_string()), style: parse_css(elem.attr("style")), type_: elem.attr("type").map(|type_| type_.to_string()), children }, + "blockquote" => Tag::Blockquote { style: parse_css(elem.attr("style")), children }, + "body" => Tag::Body { style: parse_css(elem.attr("style")), xml_lang: elem.attr("xml:lang").map(|xml_lang| xml_lang.to_string()), children }, + "br" => Tag::Br, + "cite" => Tag::Cite { style: parse_css(elem.attr("style")), children }, + "em" => Tag::Em { children }, + "img" => Tag::Img { src: elem.attr("src").map(|src| src.to_string()), alt: elem.attr("alt").map(|alt| alt.to_string()) }, + "li" => Tag::Li { style: parse_css(elem.attr("style")), children }, + "ol" => Tag::Ol { style: parse_css(elem.attr("style")), children }, + "p" => Tag::P { style: parse_css(elem.attr("style")), children }, + "span" => Tag::Span { style: parse_css(elem.attr("style")), children }, + "strong" => Tag::Strong { children }, + "ul" => Tag::Ul { style: parse_css(elem.attr("style")), children }, + _ => 
Tag::Unknown(children), + }) + } +} + +impl From for Element { + /* Builds an element in ns::XHTML from the variant name, the attributes that are present, and the converted children. */ fn from(tag: Tag) -> Element { + let (name, attrs, children) = match tag { + Tag::A { href, style, type_, children } => ("a", { + let mut attrs = vec![]; + if let Some(href) = href { + attrs.push(("href", href)); + } + if let Some(style) = get_style_string(style) { + attrs.push(("style", style)); + } + if let Some(type_) = type_ { + attrs.push(("type", type_)); + } + attrs + }, children), + Tag::Blockquote { style, children } => ("blockquote", match get_style_string(style) { + Some(style) => vec![("style", style)], + None => vec![], + }, children), + Tag::Body { style, xml_lang, children } => ("body", { + let mut attrs = vec![]; + if let Some(style) = get_style_string(style) { + attrs.push(("style", style)); + } + if let Some(xml_lang) = xml_lang { + attrs.push(("xml:lang", xml_lang)); + } + attrs + }, children), + Tag::Br => ("br", vec![], vec![]), + Tag::Cite { style, children } => ("cite", match get_style_string(style) { + Some(style) => vec![("style", style)], + None => vec![], + }, children), + Tag::Em { children } => ("em", vec![], children), + Tag::Img { src, alt } => { + let mut attrs = vec![]; + if let Some(src) = src { + attrs.push(("src", src)); + } + if let Some(alt) = alt { + attrs.push(("alt", alt)); + } + ("img", attrs, vec![]) + }, + Tag::Li { style, children } => ("li", match get_style_string(style) { + Some(style) => vec![("style", style)], + None => vec![], + }, children), + Tag::Ol { style, children } => ("ol", match get_style_string(style) { + Some(style) => vec![("style", style)], + None => vec![], + }, children), + Tag::P { style, children } => ("p", match get_style_string(style) { + Some(style) => vec![("style", style)], + None => vec![], + }, children), + Tag::Span { style, children } => ("span", match get_style_string(style) { + Some(style) => vec![("style", style)], + None => vec![], + }, children), + Tag::Strong { children } => ("strong", vec![], children), + Tag::Ul { 
style, children } => ("ul", match get_style_string(style) { + Some(style) => vec![("style", style)], + None => vec![], + }, children), + /* NOTE(review): Unknown is written out as an element literally named unknown, since the variant does not keep the original name. */ Tag::Unknown(children) => return Element::builder("unknown").ns(ns::XHTML).append(children_to_nodes(children)).build(), + }; + let mut builder = Element::builder(name) + .ns(ns::XHTML) + .append(children_to_nodes(children)); + for (key, value) in attrs { + builder = builder.attr(key, value); + } + builder.build() + } +} + +fn children_to_nodes(children: Vec) -> Vec { + children.into_iter().map(|child| match child { + Child::Tag(tag) => Node::Element(Element::from(tag)), + Child::Text(text) => Node::Text(text), + }).collect::>() +} + +fn children_to_html(children: Vec) -> String { + children.into_iter().map(|child| child.to_html()).collect::>().concat() +} + +/* Formats an optional attribute as a leading space, the name, an equals sign and the value between single quotes; None yields an empty string. NOTE(review): the value is inserted without escaping. */ fn write_attr(attr: Option, name: &str) -> String { + match attr { + Some(attr) => format!(" {}='{}'", name, attr), + None => String::new(), + } +} + +/* Splits a style attribute on semicolons and each part on its first colon; None yields an empty list. */ fn parse_css(style: Option<&str>) -> Css { + let mut properties = vec![]; + if let Some(style) = style { + // TODO: make that parser a bit more resilient to things. 
+ for part in style.split(";") { + let mut part = part.splitn(2, ":").map(|a| a.to_string()).collect::>(); + /* NOTE(review): the split yields the key first and the value second, but pop() takes from the end of the vector, so key receives the value text and value receives the key text; a part without a colon (including the empty part after a trailing semicolon) leaves a single element and the second unwrap panics. Whitespace around either piece is kept as is. */ let key = part.pop().unwrap(); + let value = part.pop().unwrap(); + properties.push(Property { key, value }); + } + } + properties +} + +/* NOTE(review): the XML string literals in the tests below appear to have lost their markup in this copy of the patch (several are empty); restore them from the repository before running. */ #[cfg(test)] +mod tests { + use super::*; + + /* Sizes are asserted only for 64-bit targets; the 32-bit variant is marked ignore and uses zeros - presumably placeholders, confirm. */ #[cfg(target_pointer_width = "32")] + #[test] + #[ignore] + fn test_size() { + assert_size!(XhtmlIm, 0); + assert_size!(Child, 0); + assert_size!(Tag, 0); + } + + #[cfg(target_pointer_width = "64")] + #[test] + fn test_size() { + assert_size!(XhtmlIm, 56); + assert_size!(Child, 112); + assert_size!(Tag, 104); + } + + #[test] + fn test_empty() { + let elem: Element = "" + .parse() + .unwrap(); + let xhtml = XhtmlIm::try_from(elem).unwrap(); + assert_eq!(xhtml.bodies.len(), 0); + + let elem: Element = "" + .parse() + .unwrap(); + let xhtml = XhtmlIm::try_from(elem).unwrap(); + assert_eq!(xhtml.bodies.len(), 1); + + let elem: Element = "" + .parse() + .unwrap(); + let xhtml = XhtmlIm::try_from(elem).unwrap(); + assert_eq!(xhtml.bodies.len(), 2); + } + + #[test] + fn invalid_two_same_langs() { + let elem: Element = "" + .parse() + .unwrap(); + let error = XhtmlIm::try_from(elem).unwrap_err(); + let message = match error { + Error::ParseError(string) => string, + _ => panic!(), + }; + assert_eq!(message, "Two identical language bodies found in XHTML-IM."); + } + + #[test] + fn test_tag() { + let elem: Element = "" + .parse() + .unwrap(); + let body = Tag::try_from(elem).unwrap(); + match body { + Tag::Body { style: _, xml_lang: _, children } => assert_eq!(children.len(), 0), + _ => panic!(), + } + + let elem: Element = "

Hello world!

" + .parse() + .unwrap(); + let body = Tag::try_from(elem).unwrap(); + let mut children = match body { + Tag::Body { style, xml_lang, children } => { + assert_eq!(style.len(), 0); + assert_eq!(xml_lang, None); + assert_eq!(children.len(), 1); + children + }, + _ => panic!(), + }; + let p = match children.pop() { + Some(Child::Tag(tag)) => tag, + _ => panic!(), + }; + let mut children = match p { + Tag::P { style, children } => { + assert_eq!(style.len(), 0); + assert_eq!(children.len(), 1); + children + }, + _ => panic!(), + }; + let text = match children.pop() { + Some(Child::Text(text)) => text, + _ => panic!(), + }; + assert_eq!(text, "Hello world!"); + } + + #[test] + fn test_unknown_element() { + let elem: Element = "Hello world!" + .parse() + .unwrap(); + let xhtml_im = XhtmlIm::try_from(elem).unwrap(); + let html = xhtml_im.to_html(); + assert_eq!(html, "Hello world!"); + } + + #[test] + fn test_generate_html() { + let elem: Element = "

Hello world!

" + .parse() + .unwrap(); + let xhtml_im = XhtmlIm::try_from(elem).unwrap(); + let html = xhtml_im.to_html(); + assert_eq!(html, "

Hello world!

"); + + let elem: Element = "

Hello world!

" + .parse() + .unwrap(); + let xhtml_im = XhtmlIm::try_from(elem).unwrap(); + let html = xhtml_im.to_html(); + assert_eq!(html, "

Hello world!

"); + } +}