Extract XHTML-IM inline images by default

- Add two new options: tmp_image_dir and extract_inline_images
- tmp_image_dir is $XDG_CACHE_HOME(usually ~/.cache)/poezio/images if unset
- Name the images from a SHA-1 of their data and their mimetype
- Output file:// links inside the message
This commit is contained in:
mathieui 2014-10-16 18:49:32 +02:00
parent d4590949f7
commit a9f642f743
No known key found for this signature in database
GPG key ID: C59F84CEEFD616E3
6 changed files with 104 additions and 14 deletions

View file

@ -379,6 +379,14 @@ ack_message_receipts = true
# Ask for message delivery receipts (XEP-0184)
request_message_receipts = true
# Extract base64 images received in XHTML-IM messages
# if true.
extract_inline_images = true
# The directory where the images will be saved; if unset,
# defaults to $XDG_CACHE_HOME/poezio/images.
tmp_image_dir =
# Receive the tune notifications or not (in order to display information
# in the roster).
# If this is set to false, then the display_tune_notifications

View file

@ -851,6 +851,25 @@ Other
The lang some automated entities will use when replying to you.
extract_inline_images
**Default value:** ``true``
Some clients send inline images in base64 inside some messages, which results in
a useless wall of text. If this option is ``true``, then that base64 text will
be replaced with a :file:`file://` link to the image file extracted in
:term:`tmp_image_dir` or :file:`$XDG_CACHE_HOME/poezio/images` by default, which
is usually :file:`~/.cache/poezio/images`.
tmp_image_dir
**Default value:** ``[empty]``
The directory where poezio will save the images received, if
:term:`extract_inline_images` is set to true. If unset, poezio
will default to :file:`$XDG_CACHE_HOME/poezio/images` which is
usually :file:`~/.cache/poezio/images`.
muc_history_length
**Default value:** ``50``

View file

@ -361,7 +361,6 @@ def file_ok(filepath):
def check_create_config_dir():
"""
create the configuration directory if it doesn't exist
and copy the default config in it
"""
CONFIG_HOME = environ.get("XDG_CONFIG_HOME")
if not CONFIG_HOME:
@ -374,6 +373,23 @@ def check_create_config_dir():
pass
return CONFIG_PATH
def check_create_cache_dir():
    """
    create the cache directory if it doesn't exist
    also create the subdirectories

    The base directory is $XDG_CACHE_HOME, or $HOME/.cache when that
    variable is unset or empty; the poezio cache lives in its 'poezio'
    subdirectory, whose path is stored in the module-level CACHE_DIR.
    """
    global CACHE_DIR
    cache_home = environ.get("XDG_CACHE_HOME")
    if not cache_home:
        cache_home = path.join(environ.get('HOME'), '.cache')
    CACHE_DIR = path.join(cache_home, 'poezio')
    try:
        # makedirs() creates every missing parent, so asking for the leaf
        # directory is enough. Creating CACHE_DIR first inside the same
        # try block would skip the 'images' subdirectory whenever
        # CACHE_DIR already exists, because the first call would raise.
        makedirs(path.join(CACHE_DIR, 'images'))
    except OSError:
        # best effort: the directory already exists, or it cannot be
        # created (in which case saving an image reports the error later)
        pass
def run_cmdline_args(CONFIG_PATH):
"Parse the command line arguments"
global options
@ -495,3 +511,6 @@ safeJID = None
# the global log dir
LOG_DIR = ''
# the global cache dir
CACHE_DIR = ''

View file

@ -10,6 +10,7 @@ import ssl
import time
from hashlib import sha1, sha512
from gettext import gettext as _
from os import path
from sleekxmpp import InvalidJID
from sleekxmpp.stanza import Message
@ -24,7 +25,7 @@ import windows
import xhtml
import multiuserchat as muc
from common import safeJID
from config import config
from config import config, CACHE_DIR
from contact import Resource
from logger import logger
from roster import roster
@ -178,7 +179,11 @@ def on_normal_message(self, message):
return self.information('%s says: %s' % (message['from'], message['body']), 'Headline')
use_xhtml = config.get('enable_xhtml_im', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml)
tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images')
extract_images = config.get('extract_inline_images', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
if not body:
return
@ -223,7 +228,9 @@ def on_normal_message(self, message):
self.events.trigger('conversation_msg', message, conversation)
if not message['body']:
return
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
delayed, date = common.find_delayed_tag(message)
def try_modify():
@ -441,7 +448,11 @@ def on_groupchat_message(self, message):
self.events.trigger('muc_msg', message, tab)
use_xhtml = config.get('enable_xhtml_im', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml)
tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images')
extract_images = config.get('extract_inline_images', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
if not body:
return
@ -498,7 +509,11 @@ def on_groupchat_private_message(self, message):
room_from = jid.bare
use_xhtml = config.get('enable_xhtml_im', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml)
tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images')
extract_images = config.get('extract_inline_images', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
tab = self.get_tab_by_name(jid.full, tabs.PrivateTab) # get the tab with the private conversation
ignore = config.get_by_tabname('ignore_private', False, room_from)
if not tab: # It's the first message we receive: create the tab
@ -511,7 +526,9 @@ def on_groupchat_private_message(self, message):
self.xmpp.send_message(mto=jid.full, mbody=msg, mtype='chat')
return
self.events.trigger('private_msg', message, tab)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
if not body or not tab:
return
replaced_id = message['replace']['id']

View file

@ -30,6 +30,7 @@ def main():
config.run_cmdline_args(config_path)
config.create_global_config()
config.check_create_log_dir()
config.check_create_cache_dir()
config.setup_logging()
config.post_logging_setup()

View file

@ -12,9 +12,13 @@ xhtml code to shell colors,
poezio colors to xhtml code
"""
import re
import base64
import curses
import hashlib
import re
from os import path
from sleekxmpp.xmlstream import ET
from urllib.parse import unquote
from io import BytesIO
from xml import sax
@ -178,10 +182,12 @@ colors = {
whitespace_re = re.compile(r'\s+')
xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
xhtml_simple_attr_re = re.compile(r'\x19\d')
def get_body_from_message_stanza(message, use_xhtml=False):
def get_body_from_message_stanza(message, use_xhtml=False,
tmp_dir=None, extract_images=False):
"""
Returns a string with xhtml markups converted to
poezio colors if there's an xhtml_im element, or
@ -191,7 +197,8 @@ def get_body_from_message_stanza(message, use_xhtml=False):
xhtml = message['html'].xml
xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body')
if xhtml_body:
content = xhtml_to_poezio_colors(xhtml_body)
content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
extract_images=extract_images)
content = content if content else message['body']
return content or " "
return message['body']
@ -281,7 +288,7 @@ def trim(string):
return re.sub(whitespace_re, ' ', string)
class XHTMLHandler(sax.ContentHandler):
def __init__(self, force_ns=False):
def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
self.builder = []
self.formatting = []
self.attrs = []
@ -291,6 +298,9 @@ class XHTMLHandler(sax.ContentHandler):
# do not care about xhtml-in namespace
self.force_ns = force_ns
self.tmp_dir = tmp_dir
self.extract_images = extract_images
@property
def result(self):
return ''.join(self.builder).strip()
@ -331,7 +341,22 @@ class XHTMLHandler(sax.ContentHandler):
elif name == 'em':
self.append_formatting('\x19i')
elif name == 'img':
builder.append(trim(attrs['src']))
if re.match(xhtml_data_re, attrs['src']) and self.extract_images:
type_, data = [i for i in re.split(xhtml_data_re, attrs['src']) if i]
bin_data = base64.b64decode(unquote(data))
filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
filepath = path.join(self.tmp_dir, filename)
if not path.exists(filepath):
try:
with open(filepath, 'wb') as fd:
fd.write(bin_data)
builder.append('file://%s' % filepath)
except Exception as e:
builder.append('[Error while saving image: %s]' % e)
else:
builder.append('file://%s' % filepath)
else:
builder.append(trim(attrs['src']))
if 'alt' in attrs:
builder.append(' (%s)' % trim(attrs['alt']))
elif name == 'ul':
@ -389,13 +414,14 @@ class XHTMLHandler(sax.ContentHandler):
if 'title' in attrs:
builder.append(' [' + attrs['title'] + ']')
def xhtml_to_poezio_colors(xml, force=False):
def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
if isinstance(xml, str):
xml = xml.encode('utf8')
elif not isinstance(xml, bytes):
xml = ET.tostring(xml)
handler = XHTMLHandler(force_ns=force)
handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
extract_images=extract_images)
parser = sax.make_parser()
parser.setFeature(sax.handler.feature_namespaces, True)
parser.setContentHandler(handler)