poezio/poezio/xhtml.py
mathieui b83c113575 Fix #3194 (don’t highlight on embedded XHTML image)
Remove the link, which wasn’t really useful anyway
2016-08-26 22:44:07 +02:00

543 lines
16 KiB
Python

# Copyright 2010-2011 Florent Le Coz <louiz@louiz.org>
#
# This file is part of Poezio.
#
# Poezio is free software: you can redistribute it and/or modify
# it under the terms of the zlib license. See the COPYING file.
"""
Various methods to convert
shell colors to poezio colors,
xhtml code to shell colors,
poezio colors to xhtml code
"""
import base64
import curses
import hashlib
import re
from os import path
from slixmpp.xmlstream import ET
from urllib.parse import unquote
from io import BytesIO
from xml import sax
from xml.sax import saxutils
# Decimal digit characters, hard-coded here instead of being taken from a
# library module; poezio_colors_to_html() uses it to detect color numbers.
digits = '0123456789' # never trust the modules
# Namespace URI of XHTML elements; XHTMLHandler compares element namespaces
# against it.
XHTML_NS = 'http://www.w3.org/1999/xhtml'
# HTML named colors, mapped to the color number that parse_css() emits for
# them (the same numbering it computes for #RGB / #RRGGBB values).
colors = {
    'aliceblue': 231,
    'antiquewhite': 231,
    'aqua': 51,
    'aquamarine': 122,
    'azure': 231,
    'beige': 231,
    'bisque': 230,
    'black': 232,
    'blanchedalmond': 230,
    'blue': 21,
    'blueviolet': 135,
    'brown': 124,
    'burlywood': 223,
    'cadetblue': 109,
    'chartreuse': 118,
    'chocolate': 172,
    'coral': 209,
    'cornflowerblue': 111,
    'cornsilk': 231,
    'crimson': 197,
    'cyan': 51,
    'darkblue': 19,
    'darkcyan': 37,
    'darkgoldenrod': 178,
    'darkgray': 247,
    'darkgreen': 28,
    'darkgrey': 247,
    'darkkhaki': 186,
    'darkmagenta': 127,
    'darkolivegreen': 65,
    'darkorange': 214,
    'darkorchid': 134,
    'darkred': 124,
    'darksalmon': 216,
    'darkseagreen': 151,
    'darkslateblue': 61,
    'darkslategray': 59,
    'darkslategrey': 59,
    'darkturquoise': 44,
    'darkviolet': 128,
    'deeppink': 199,
    'deepskyblue': 45,
    'dimgray': 241,
    'dimgrey': 241,
    'dodgerblue': 39,
    'firebrick': 160,
    'floralwhite': 231,
    'forestgreen': 34,
    'fuchsia': 201,
    'gainsboro': 252,
    'ghostwhite': 231,
    'gold': 226,
    'goldenrod': 214,
    'gray': 244,
    'green': 34,
    'greenyellow': 191,
    'grey': 244,
    'honeydew': 231,
    'hotpink': 212,
    'indianred': 174,
    'indigo': 55,
    'ivory': 231,
    'khaki': 229,
    'lavender': 231,
    'lavenderblush': 231,
    'lawngreen': 118,
    'lemonchiffon': 230,
    'lightblue': 195,
    'lightcoral': 217,
    'lightcyan': 231,
    'lightgoldenrodyellow': 230,
    'lightgray': 251,
    'lightgreen': 157,
    'lightgrey': 251,
    'lightpink': 224,
    'lightsalmon': 216,
    'lightseagreen': 43,
    'lightskyblue': 153,
    'lightslategray': 109,
    'lightslategrey': 109,
    'lightsteelblue': 189,
    'lightyellow': 231,
    'lime': 46,
    'limegreen': 77,
    'linen': 231,
    'magenta': 201,
    'maroon': 124,
    'mediumaquamarine': 115,
    'mediumblue': 20,
    'mediumorchid': 170,
    'mediumpurple': 141,
    'mediumseagreen': 78,
    'mediumslateblue': 105,
    'mediumspringgreen': 49,
    'mediumturquoise': 80,
    'mediumvioletred': 163,
    'midnightblue': 18,
    'mintcream': 231,
    'mistyrose': 231,
    'moccasin': 230,
    'navajowhite': 230,
    'navy': 19,
    'oldlace': 231,
    'olive': 142,
    'olivedrab': 106,
    'orange': 214,
    'orangered': 202,
    'orchid': 213,
    'palegoldenrod': 229,
    'palegreen': 157,
    'paleturquoise': 195,
    'palevioletred': 211,
    'papayawhip': 231,
    'peachpuff': 230,
    'peru': 179,
    'pink': 224,
    'plum': 219,
    'powderblue': 195,
    'purple': 127,
    'red': 196,
    'rosybrown': 181,
    'royalblue': 69,
    'saddlebrown': 130,
    'salmon': 216,
    'sandybrown': 216,
    'seagreen': 72,
    'seashell': 231,
    'sienna': 131,
    'silver': 250,
    'skyblue': 153,
    'slateblue': 104,
    'slategray': 109,
    'slategrey': 109,
    'snow': 231,
    'springgreen': 48,
    'steelblue': 74,
    'tan': 187,
    'teal': 37,
    'thistle': 225,
    'tomato': 209,
    'turquoise': 86,
    'violet': 219,
    'wheat': 230,
    'white': 255,
    'whitesmoke': 255,
    'yellow': 226,
    'yellowgreen': 149
}
# One or more consecutive whitespace characters; trim() replaces each run
# with a single space.
whitespace_re = re.compile(r'\s+')
# A poezio attribute: 0x19, an optional '-', a digit and everything up to the
# next '}', or 0x19 followed by one of the letters b, u, a, i, o.
# clean_text() removes every match.
xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
# A base64 "data:" image URL: group 1 is the image subtype (lowercase
# letters only), group 2 the base64 payload.
xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
# Two or more consecutive color attributes; XHTMLHandler.result substitutes
# group 1 for the match so that only the last color of the run remains.
poezio_color_double = re.compile(r'(?:\x19\d+}|\x19\d)+(\x19\d|\x19\d+})')
# One or more attributes directly followed by the 0x19 'o' attribute;
# XHTMLHandler.result replaces the whole run with that single attribute.
poezio_format_trim = re.compile(r'(\x19\d+}|\x19\d|\x19[buaio]|\x19o)+\x19o')
# 0x19 followed by a single digit (color attribute without its closing '}');
# convert_simple_to_full_colors() appends the '}' to every match.
xhtml_simple_attr_re = re.compile(r'\x19\d')
def get_body_from_message_stanza(message, use_xhtml=False,
                                 tmp_dir=None, extract_images=False):
    """
    Returns a string with xhtml markups converted to
    poezio colors if there's an xhtml_im element, or
    the body (without any color) otherwise

    message: the message stanza; ``message['body']`` and
        ``message['html'].xml`` are read from it.
    use_xhtml: when false, the plain body is returned directly.
    tmp_dir, extract_images: passed through to xhtml_to_poezio_colors().

    When the XHTML body converts to an empty string, the plain body is
    used instead, and a single space if that one is empty too.
    """
    if use_xhtml:
        xhtml = message['html'].xml
        xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body')
        # Compare against None explicitly: an ElementTree element is falsy
        # when it has no child elements, which would wrongly discard a
        # <body> that contains only text.
        if xhtml_body is not None:
            content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
                                             extract_images=extract_images)
            return content or message['body'] or " "
    return message['body']
def ncurses_color_to_html(color):
    """
    Takes an int between 0 and 255 and returns
    a string of the form #XXXXXX representing an
    html color.

    Colors 0-15 are looked up through curses, 16-231 are decoded from the
    6x6x6 color cube (number = 16 + 36*r + 6*g + b, the same layout
    parse_css() produces) and 232-255 are treated as a gray ramp.
    The resulting RGB value is an approximation.
    """
    if color <= 15:
        try:
            (r, g, b) = curses.color_content(color)
        except Exception:  # fallback in faulty terminals (e.g. xterm)
            (r, g, b) = curses.color_content(color % 8)
        # curses reports each component in 0..1000; rescale it to stay
        # just below 6 so that the final conversion cannot exceed 0xFF
        r = r / 1000 * 6 - 0.01
        g = g / 1000 * 6 - 0.01
        b = b / 1000 * 6 - 0.01
    elif color <= 231:
        # Integer arithmetic is required here: red is the most significant
        # component of the cube index and blue the least significant one.
        r, rest = divmod(color - 16, 36)
        g, b = divmod(rest, 6)
    else:
        color -= 232
        r = g = b = color / 24 * 6
    return '#%02X%02X%02X' % (int(r*256/6), int(g*256/6), int(b*256/6))
def parse_css(css):
def get_color(value):
if value[0] == '#':
value = value[1:]
length = len(value)
if length != 3 and length != 6:
return -1
value = int(value, 16)
if length == 6:
r = int(value >> 16)
g = int((value >> 8) & 0xff)
b = int(value & 0xff)
if r == g == b:
return 232 + int(r/10.6251)
div = 42.51
else:
r = int(value >> 8)
g = int((value >> 4) & 0xf)
b = int(value & 0xf)
if r == g == b:
return 232 + int(1.54*r)
div = 2.51
return 6*6*int(r/div) + 6*int(g/div) + int(b/div) + 16
if value in colors:
return colors[value]
return -1
shell = ''
rules = css.split(';')
for rule in rules:
if ':' not in rule:
continue
key, value = rule.split(':', 1)
key = key.strip()
value = value.strip()
if key == 'background-color':
pass#shell += '\x191'
elif key == 'color':
color = get_color(value)
if color != -1:
shell += '\x19%d}' % color
elif key == 'font-style':
shell += '\x19i'
elif key == 'font-weight':
shell += '\x19b'
elif key == 'margin-left':
shell += ' '
elif key == 'text-align':
pass
elif key == 'text-decoration':
if value == 'underline':
shell += '\x19u'
elif value == 'blink':
shell += '\x19a'
return shell
def trim(string):
    """Return ``string`` with every run of whitespace replaced by one space."""
    return whitespace_re.sub(' ', string)
class XHTMLHandler(sax.ContentHandler):
    """
    SAX content handler converting an XHTML fragment to a string carrying
    poezio attributes (0x19-prefixed markers). Feed it through a
    namespace-aware SAX parser, then read ``result``.

    force_ns: when true, elements are processed whatever their namespace;
        otherwise only elements in the XHTML namespace are.
    tmp_dir: directory where embedded images are written.
    extract_images: when true, ``data:`` images are decoded and stored in
        ``tmp_dir``; otherwise the ``src`` value is displayed as is.
    """

    def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
        super().__init__()
        # output fragments, joined by ``result``
        self.builder = []
        # stack of the formatting strings currently in effect
        self.formatting = []
        # stack of the attribute dicts of the currently open elements
        self.attrs = []
        # stack of open lists: 'ul', or the next item number of an <ol>
        self.list_state = []
        self.is_pre = False
        # index in ``builder`` where the text of the current <a> starts
        self.a_start = 0
        # do not care about xhtml-in namespace
        self.force_ns = force_ns
        self.tmp_dir = tmp_dir
        self.extract_images = extract_images

    @property
    def result(self):
        """The converted text, with redundant attributes collapsed."""
        sanitized = re.sub(poezio_color_double, r'\1', ''.join(self.builder).strip())
        return re.sub(poezio_format_trim, '\x19o', sanitized)

    def append_formatting(self, formatting):
        """Activate ``formatting`` and emit it."""
        self.formatting.append(formatting)
        self.builder.append(formatting)

    def pop_formatting(self):
        """Drop the innermost formatting: reset, then re-emit the others."""
        self.formatting.pop()
        self.builder.append('\x19o' + ''.join(self.formatting))

    def characters(self, characters):
        """Append text, collapsing whitespace unless inside <pre>."""
        self.builder.append(characters if self.is_pre else trim(characters))

    def _append_image(self, attrs):
        """Emit the textual replacement of an <img> element."""
        # a missing src attribute must not abort the whole conversion
        src = attrs.get('src', '')
        match = xhtml_data_re.match(src) if self.extract_images else None
        if match is None:
            self.builder.append(trim(src))
        else:
            type_, data = match.groups()
            # Best effort: a decoding or filesystem problem is reported
            # in the text instead of propagating out of the parser.
            try:
                bin_data = base64.b64decode(unquote(data))
                filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
                filepath = path.join(self.tmp_dir, filename)
                if not path.exists(filepath):
                    with open(filepath, 'wb') as fd:
                        fd.write(bin_data)
                self.builder.append('[file stored as %s]' % filename)
            except Exception as e:
                self.builder.append('[Error while saving image: %s]' % e)
        if 'alt' in attrs:
            self.builder.append(' (%s)' % trim(attrs['alt']))

    def startElementNS(self, name, _, attrs):
        """Handle an opening tag; ``name`` is a (namespace, tag) pair."""
        if name[0] != XHTML_NS and not self.force_ns:
            return

        builder = self.builder
        # keep only the attributes that have no namespace
        attrs = {key: value for ((ns, key), value) in attrs.items() if ns is None}
        self.attrs.append(attrs)

        if 'style' in attrs:
            style = parse_css(attrs['style'])
            self.append_formatting(style)

        name = name[1]
        if name == 'a':
            self.append_formatting('\x19u')
            self.a_start = len(self.builder)
        elif name == 'blockquote':
            builder.append('')
        elif name == 'br':
            builder.append('\n')
        elif name == 'cite':
            self.append_formatting('\x19u')
        elif name == 'em':
            self.append_formatting('\x19i')
        elif name == 'img':
            self._append_image(attrs)
        elif name == 'ul':
            self.list_state.append('ul')
        elif name == 'ol':
            self.list_state.append(1)
        elif name == 'li':
            try:
                state = self.list_state[-1]
            except IndexError:
                # <li> outside of any list: treat it as unordered
                state = 'ul'
            if state == 'ul':
                builder.append('\n')
            else:
                builder.append('\n%d) ' % state)
                state += 1
                self.list_state[-1] = state
        elif name == 'p':
            builder.append('\n')
        elif name == 'pre':
            builder.append('\n')
            self.is_pre = True
        elif name == 'strong':
            self.append_formatting('\x19b')

    def endElementNS(self, name, _):
        """Handle a closing tag; ``name`` is a (namespace, tag) pair."""
        if name[0] != XHTML_NS and not self.force_ns:
            return

        builder = self.builder
        attrs = self.attrs.pop()
        name = name[1]

        if name == 'a':
            self.pop_formatting()
            # do not display the link twice
            text_elements = [x for x in self.builder[self.a_start:]
                             if not x.startswith('\x19')]
            link_text = ''.join(text_elements).strip()
            if 'href' in attrs and attrs['href'] != link_text:
                builder.append(' (%s)' % trim(attrs['href']))
        elif name == 'blockquote':
            builder.append('')
        elif name in ('cite', 'em', 'strong'):
            self.pop_formatting()
        elif name in ('ol', 'ul'):
            # leave the list, so that the items of an enclosing list
            # use their own state again
            if self.list_state:
                self.list_state.pop()
            builder.append('\n')
        elif name == 'p':
            builder.append('\n')
        elif name == 'pre':
            builder.append('\n')
            self.is_pre = False

        # the style was pushed before the element's own formatting,
        # so it is popped after it
        if 'style' in attrs:
            self.pop_formatting()

        if 'title' in attrs:
            builder.append(' [' + attrs['title'] + ']')
def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
    """
    Convert XHTML to a string with poezio attributes.

    xml: the document, as a str, as bytes, or as an element object that
        ``ET.tostring`` can serialize.
    force: process elements whatever their namespace (passed to
        XHTMLHandler as ``force_ns``).
    tmp_dir, extract_images: passed to XHTMLHandler.

    Returns the handler's ``result``. Errors raised by the SAX parser on
    malformed input are not caught here.
    """
    if isinstance(xml, str):
        xml = xml.encode('utf8')
    elif not isinstance(xml, bytes):
        xml = ET.tostring(xml)

    handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
                           extract_images=extract_images)
    parser = sax.make_parser()
    parser.setFeature(sax.handler.feature_namespaces, True)
    # The markup may come from a remote contact: never resolve external
    # general entities (this is only the default since Python 3.7.1).
    parser.setFeature(sax.handler.feature_external_ges, False)
    parser.setContentHandler(handler)
    parser.parse(BytesIO(xml))
    return handler.result
def clean_text(s):
    """
    Return ``s`` without its attributes in the complete format: a 0x19
    character followed either by a number ending with '}' or by one of
    the formatting letters.
    """
    return xhtml_attr_re.sub("", s)
def clean_text_simple(string):
    """
    Return ``string`` without its simple-format attributes: every 0x19
    character is removed together with the single character following it
    (if there is one).
    """
    # DOTALL so that a newline right after the marker is removed as well
    return re.sub(r'\x19.?', '', string, flags=re.DOTALL)
def convert_simple_to_full_colors(text):
    """
    Return ``text`` with its formatting control characters (0x0E to 0x19)
    replaced by 0x19-prefixed attributes, every color attribute being
    written in the complete format (0x19, digit, '}').
    """
    # TODO, have a single list of this. This is some sort of
    # duplicate from windows.format_chars
    control_to_attr = {
        '\x0E': '\x19b', '\x0F': '\x19o', '\x10': '\x19u',
        '\x11': '\x191', '\x12': '\x192', '\x13': '\x193',
        '\x14': '\x194', '\x15': '\x195', '\x16': '\x196',
        '\x17': '\x197', '\x18': '\x198', '\x19': '\x199',
    }
    translated = text.translate(str.maketrans(control_to_attr))
    # close every "0x19 + digit" attribute with a curly bracket
    return xhtml_simple_attr_re.sub(r'\g<0>}', translated)
# CSS color names used by poezio_colors_to_html() for the color numbers
# 1 to 7; any other number goes through ncurses_color_to_html() instead.
number_to_color_names = {
    1: 'red',
    2: 'green',
    3: 'yellow',
    4: 'blue',
    5: 'violet',
    6: 'turquoise',
    7: 'white'
}
def format_inline_css(_dict):
    """
    Serialize a mapping of CSS properties to an inline style string,
    each entry being rendered as "property: value;" in iteration order.
    """
    declarations = ['%s: %s;' % item for item in _dict.items()]
    return ''.join(declarations)
def poezio_colors_to_html(string):
    """
    Convert poezio colors to html
    (e.g. a "1}" color attribute becomes <span style='color: red'>)

    The text is wrapped in a <body><p>...</p></body> XHTML element,
    special characters are escaped and newlines become <br />.
    Supported attributes: 'o' (reset), 'b' (bold), 'u' (underline),
    'i' (italic) and colors in the complete "number}" format. Unknown or
    malformed attributes are dropped instead of raising.
    """
    # Maintain a list of the current css attributes used
    # And check if a tag is open (by design, we only open
    # spans tag, and they cannot be nested.
    current_attrs = {}
    tag_open = False
    build = ["<body xmlns='http://www.w3.org/1999/xhtml'><p>"]

    def check_property(key, value):
        """Record a style change, closing the span it invalidates."""
        nonlocal tag_open
        if current_attrs.get(key) == value:
            return
        current_attrs[key] = value
        if tag_open:
            tag_open = False
            build.append('</span>')

    def append_text(text):
        """Emit escaped text, opening a span first when one is needed."""
        nonlocal tag_open
        if not text:
            return
        if current_attrs and not tag_open:
            build.append('<span style="%s">' % format_inline_css(current_attrs))
            tag_open = True
        build.append(saxutils.escape(text))

    next_attr_char = string.find('\x19')
    while next_attr_char != -1:
        append_text(string[:next_attr_char])
        attr_char = string[next_attr_char+1:next_attr_char+2].lower()
        if not attr_char:
            # dangling marker at the very end of the string: drop it
            string = ''
            break

        # number of characters used by the attribute, marker included
        consumed = 2
        if attr_char == 'o':
            if tag_open:
                build.append('</span>')
                tag_open = False
            current_attrs.clear()
        elif attr_char == 'b':
            check_property('font-weight', 'bold')
        elif attr_char == 'u':
            check_property('text-decoration', 'underline')
        elif attr_char == 'i':
            check_property('font-style', 'italic')
        elif attr_char in digits:
            end = string.find('}', next_attr_char)
            number_str = string[next_attr_char+1:end] if end != -1 else ''
            # Without a well-formed "number}" only the marker and its
            # first digit are dropped.
            if number_str and all(char in digits for char in number_str):
                number = int(number_str)
                if number in number_to_color_names:
                    check_property('color', number_to_color_names[number])
                else:
                    check_property('color', ncurses_color_to_html(number))
                consumed = len(number_str) + 2

        string = string[next_attr_char+consumed:]
        next_attr_char = string.find('\x19')

    append_text(string)
    if tag_open:
        build.append('</span>')
    build.append("</p></body>")
    text = ''.join(build)
    return text.replace('\n', '<br />')