From a9f642f7438fe4489cdb9cc5ac59c929054656c8 Mon Sep 17 00:00:00 2001 From: mathieui Date: Thu, 16 Oct 2014 18:49:32 +0200 Subject: [PATCH] Extract XHTML-IM inline images by default - Add two new options: tmp_image_dir and extract_inline_images - tmp_image_dir is $XDG_CACHE_HOME(usually ~/.cache)/poezio/images if unset - Name the images from a SHA-1 of their data and their mimetype - Output file:// links inside the message --- data/default_config.cfg | 8 ++++++++ doc/source/configuration.rst | 19 +++++++++++++++++ src/config.py | 21 ++++++++++++++++++- src/core/handlers.py | 29 ++++++++++++++++++++------ src/poezio.py | 1 + src/xhtml.py | 40 +++++++++++++++++++++++++++++------- 6 files changed, 104 insertions(+), 14 deletions(-) diff --git a/data/default_config.cfg b/data/default_config.cfg index c1f766b0..35bc498b 100644 --- a/data/default_config.cfg +++ b/data/default_config.cfg @@ -379,6 +379,14 @@ ack_message_receipts = true # Ask for message delivery receipts (XEP-0184) request_message_receipts = true +# Extract base64 images received in XHTML-IM messages +# if true. +extract_inline_images = true + +# The directory where the images will be saved; if unset, +# defaults to $XDG_CACHE_HOME/poezio/images. +tmp_image_dir = + # Receive the tune notifications or not (in order to display informations # in the roster). # If this is set to false, then the display_tune_notifications diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 32d82f7a..44fd8e11 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -851,6 +851,25 @@ Other The lang some automated entities will use when replying to you. + extract_inline_images + + **Default value:** ``true`` + + Some clients send inline images in base64 inside some messages, which results in + a useless wall of text. 
If this option is ``true``, then that base64 text will + be replaced with a :file:`file://` link to the image file extracted in + :term:`tmp_image_dir` or :file:`$XDG_CACHE_HOME/poezio/images` by default, which + is usually :file:`~/.cache/poezio/images` + + tmp_image_dir + + **Default value:** ``[empty]`` + + The directory where poezio will save the images received, if + :term:`extract_inline_images` is set to true. If unset, poezio + will default to :file:`$XDG_CACHE_HOME/poezio/images` which is + usually :file:`~/.cache/poezio/images`. + muc_history_length **Default value:** ``50`` diff --git a/src/config.py b/src/config.py index 354c3447..5bd1ac17 100644 --- a/src/config.py +++ b/src/config.py @@ -361,7 +361,6 @@ def file_ok(filepath): def check_create_config_dir(): """ create the configuration directory if it doesn't exist - and copy the default config in it """ CONFIG_HOME = environ.get("XDG_CONFIG_HOME") if not CONFIG_HOME: @@ -374,6 +373,23 @@ def check_create_config_dir(): pass return CONFIG_PATH +def check_create_cache_dir(): + """ + create the cache directory if it doesn't exist + also create the subdirectories + """ + global CACHE_DIR + CACHE_HOME = environ.get("XDG_CACHE_HOME") + if not CACHE_HOME: + CACHE_HOME = path.join(environ.get('HOME'), '.cache') + CACHE_DIR = path.join(CACHE_HOME, 'poezio') + + try: + makedirs(CACHE_DIR) + makedirs(path.join(CACHE_DIR, 'images')) + except OSError: + pass + def run_cmdline_args(CONFIG_PATH): "Parse the command line arguments" global options @@ -495,3 +511,6 @@ safeJID = None # the global log dir LOG_DIR = '' + +# the global cache dir +CACHE_DIR = '' diff --git a/src/core/handlers.py b/src/core/handlers.py index 75c372bb..87aaecd5 100644 --- a/src/core/handlers.py +++ b/src/core/handlers.py @@ -10,6 +10,7 @@ import ssl import time from hashlib import sha1, sha512 from gettext import gettext as _ +from os import path from sleekxmpp import InvalidJID from sleekxmpp.stanza import Message @@ -24,7 +25,7 @@ import 
windows import xhtml import multiuserchat as muc from common import safeJID -from config import config +from config import config, CACHE_DIR from contact import Resource from logger import logger from roster import roster @@ -178,7 +179,11 @@ def on_normal_message(self, message): return self.information('%s says: %s' % (message['from'], message['body']), 'Headline') use_xhtml = config.get('enable_xhtml_im', True) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images') + extract_images = config.get('extract_inline_images', True) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) if not body: return @@ -223,7 +228,9 @@ def on_normal_message(self, message): self.events.trigger('conversation_msg', message, conversation) if not message['body']: return - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) delayed, date = common.find_delayed_tag(message) def try_modify(): @@ -441,7 +448,11 @@ def on_groupchat_message(self, message): self.events.trigger('muc_msg', message, tab) use_xhtml = config.get('enable_xhtml_im', True) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images') + extract_images = config.get('extract_inline_images', True) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) if not body: return @@ -498,7 +509,11 @@ def on_groupchat_private_message(self, message): room_from = jid.bare use_xhtml = config.get('enable_xhtml_im', True) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images') 
+ extract_images = config.get('extract_inline_images', True) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) tab = self.get_tab_by_name(jid.full, tabs.PrivateTab) # get the tab with the private conversation ignore = config.get_by_tabname('ignore_private', False, room_from) if not tab: # It's the first message we receive: create the tab @@ -511,7 +526,9 @@ def on_groupchat_private_message(self, message): self.xmpp.send_message(mto=jid.full, mbody=msg, mtype='chat') return self.events.trigger('private_msg', message, tab) - body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) + body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml, + tmp_dir=tmp_dir, + extract_images=extract_images) if not body or not tab: return replaced_id = message['replace']['id'] diff --git a/src/poezio.py b/src/poezio.py index 1baf10eb..f82f103f 100644 --- a/src/poezio.py +++ b/src/poezio.py @@ -30,6 +30,7 @@ def main(): config.run_cmdline_args(config_path) config.create_global_config() config.check_create_log_dir() + config.check_create_cache_dir() config.setup_logging() config.post_logging_setup() diff --git a/src/xhtml.py b/src/xhtml.py index 48664311..69519f8d 100644 --- a/src/xhtml.py +++ b/src/xhtml.py @@ -12,9 +12,13 @@ xhtml code to shell colors, poezio colors to xhtml code """ -import re +import base64 import curses +import hashlib +import re +from os import path from sleekxmpp.xmlstream import ET +from urllib.parse import unquote from io import BytesIO from xml import sax @@ -178,10 +182,12 @@ colors = { whitespace_re = re.compile(r'\s+') xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]') +xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)') xhtml_simple_attr_re = re.compile(r'\x19\d') -def get_body_from_message_stanza(message, use_xhtml=False): +def get_body_from_message_stanza(message, use_xhtml=False, + tmp_dir=None, extract_images=False): """ Returns a 
string with xhtml markups converted to poezio colors if there's an xhtml_im element, or @@ -191,7 +197,8 @@ def get_body_from_message_stanza(message, use_xhtml=False): xhtml = message['html'].xml xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body') if xhtml_body: - content = xhtml_to_poezio_colors(xhtml_body) + content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir, + extract_images=extract_images) content = content if content else message['body'] return content or " " return message['body'] @@ -281,7 +288,7 @@ def trim(string): return re.sub(whitespace_re, ' ', string) class XHTMLHandler(sax.ContentHandler): - def __init__(self, force_ns=False): + def __init__(self, force_ns=False, tmp_dir=None, extract_images=False): self.builder = [] self.formatting = [] self.attrs = [] @@ -291,6 +298,9 @@ class XHTMLHandler(sax.ContentHandler): # do not care about xhtml-in namespace self.force_ns = force_ns + self.tmp_dir = tmp_dir + self.extract_images = extract_images + @property def result(self): return ''.join(self.builder).strip() @@ -331,7 +341,22 @@ class XHTMLHandler(sax.ContentHandler): elif name == 'em': self.append_formatting('\x19i') elif name == 'img': - builder.append(trim(attrs['src'])) + if re.match(xhtml_data_re, attrs['src']) and self.extract_images: + type_, data = [i for i in re.split(xhtml_data_re, attrs['src']) if i] + bin_data = base64.b64decode(unquote(data)) + filename = hashlib.sha1(bin_data).hexdigest() + '.' 
+ type_ + filepath = path.join(self.tmp_dir, filename) + if not path.exists(filepath): + try: + with open(filepath, 'wb') as fd: + fd.write(bin_data) + builder.append('file://%s' % filepath) + except Exception as e: + builder.append('[Error while saving image: %s]' % e) + else: + builder.append('file://%s' % filepath) + else: + builder.append(trim(attrs['src'])) if 'alt' in attrs: builder.append(' (%s)' % trim(attrs['alt'])) elif name == 'ul': @@ -389,13 +414,14 @@ class XHTMLHandler(sax.ContentHandler): if 'title' in attrs: builder.append(' [' + attrs['title'] + ']') -def xhtml_to_poezio_colors(xml, force=False): +def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None): if isinstance(xml, str): xml = xml.encode('utf8') elif not isinstance(xml, bytes): xml = ET.tostring(xml) - handler = XHTMLHandler(force_ns=force) + handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir, + extract_images=extract_images) parser = sax.make_parser() parser.setFeature(sax.handler.feature_namespaces, True) parser.setContentHandler(handler)