Extract XHTML-IM inline images by default

- Add two new options: tmp_image_dir and extract_inline_images
- tmp_image_dir is $XDG_CACHE_HOME(usually ~/.cache)/poezio/images if unset
- Name the images from a SHA-1 of their data and their mimetype
- Output file:// links inside the message
This commit is contained in:
mathieui 2014-10-16 18:49:32 +02:00
parent d4590949f7
commit a9f642f743
No known key found for this signature in database
GPG key ID: C59F84CEEFD616E3
6 changed files with 104 additions and 14 deletions

View file

@ -379,6 +379,14 @@ ack_message_receipts = true
# Ask for message delivery receipts (XEP-0184) # Ask for message delivery receipts (XEP-0184)
request_message_receipts = true request_message_receipts = true
# Extract base64 images received in XHTML-IM messages
# if true.
extract_inline_images = true
# The directory where the images will be saved; if unset,
# defaults to $XDG_CACHE_HOME/poezio/images.
tmp_image_dir =
# Receive the tune notifications or not (in order to display informations # Receive the tune notifications or not (in order to display informations
# in the roster). # in the roster).
# If this is set to false, then the display_tune_notifications # If this is set to false, then the display_tune_notifications

View file

@ -851,6 +851,25 @@ Other
The lang some automated entities will use when replying to you. The lang some automated entities will use when replying to you.
extract_inline_images
**Default value:** ``true``
Some clients send inline images in base64 inside some messages, which results in
a useless wall of text. If this option is ``true``, then that base64 text will
be replaced with a :file:`file://` link to the image file extracted in
:term:`tmp_image_dir` or :file:`$XDG_CACHE_HOME/poezio/images` by default, which
is usually :file:`~/.cache/poezio/images`.
tmp_image_dir
**Default value:** ``[empty]``
The directory where poezio will save the images received, if
:term:`extract_inline_images` is set to true. If unset, poezio
will default to :file:`$XDG_CACHE_HOME/poezio/images` which is
usually :file:`~/.cache/poezio/images`.
muc_history_length muc_history_length
**Default value:** ``50`` **Default value:** ``50``

View file

@ -361,7 +361,6 @@ def file_ok(filepath):
def check_create_config_dir(): def check_create_config_dir():
""" """
create the configuration directory if it doesn't exist create the configuration directory if it doesn't exist
and copy the default config in it
""" """
CONFIG_HOME = environ.get("XDG_CONFIG_HOME") CONFIG_HOME = environ.get("XDG_CONFIG_HOME")
if not CONFIG_HOME: if not CONFIG_HOME:
@ -374,6 +373,23 @@ def check_create_config_dir():
pass pass
return CONFIG_PATH return CONFIG_PATH
def check_create_cache_dir():
    """
    Create the poezio cache directory and its subdirectories if needed.

    The cache root is $XDG_CACHE_HOME when that variable is set and
    non-empty, otherwise ~/.cache. The resulting <root>/poezio path is
    stored in the module-level CACHE_DIR, and an 'images' subdirectory
    is created inside it.

    Directory creation is best-effort: an OSError (e.g. permission
    denied) is ignored and nothing is returned.
    """
    global CACHE_DIR
    cache_home = environ.get("XDG_CACHE_HOME")
    if not cache_home:
        # expanduser() honours $HOME when it is set, and still resolves a
        # home directory when it is not, whereas path.join(None, '.cache')
        # would raise TypeError.
        cache_home = path.join(path.expanduser('~'), '.cache')
    CACHE_DIR = path.join(cache_home, 'poezio')
    try:
        # A single recursive call creates CACHE_DIR and 'images' together.
        # exist_ok=True matters: without it, an already-existing CACHE_DIR
        # raises OSError and the 'images' subdirectory is never created.
        makedirs(path.join(CACHE_DIR, 'images'), exist_ok=True)
    except OSError:
        # Deliberately best-effort: an unusable cache dir must not
        # prevent the program from starting.
        pass
def run_cmdline_args(CONFIG_PATH): def run_cmdline_args(CONFIG_PATH):
"Parse the command line arguments" "Parse the command line arguments"
global options global options
@ -495,3 +511,6 @@ safeJID = None
# the global log dir # the global log dir
LOG_DIR = '' LOG_DIR = ''
# the global cache dir
CACHE_DIR = ''

View file

@ -10,6 +10,7 @@ import ssl
import time import time
from hashlib import sha1, sha512 from hashlib import sha1, sha512
from gettext import gettext as _ from gettext import gettext as _
from os import path
from sleekxmpp import InvalidJID from sleekxmpp import InvalidJID
from sleekxmpp.stanza import Message from sleekxmpp.stanza import Message
@ -24,7 +25,7 @@ import windows
import xhtml import xhtml
import multiuserchat as muc import multiuserchat as muc
from common import safeJID from common import safeJID
from config import config from config import config, CACHE_DIR
from contact import Resource from contact import Resource
from logger import logger from logger import logger
from roster import roster from roster import roster
@ -178,7 +179,11 @@ def on_normal_message(self, message):
return self.information('%s says: %s' % (message['from'], message['body']), 'Headline') return self.information('%s says: %s' % (message['from'], message['body']), 'Headline')
use_xhtml = config.get('enable_xhtml_im', True) use_xhtml = config.get('enable_xhtml_im', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images')
extract_images = config.get('extract_inline_images', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
if not body: if not body:
return return
@ -223,7 +228,9 @@ def on_normal_message(self, message):
self.events.trigger('conversation_msg', message, conversation) self.events.trigger('conversation_msg', message, conversation)
if not message['body']: if not message['body']:
return return
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
delayed, date = common.find_delayed_tag(message) delayed, date = common.find_delayed_tag(message)
def try_modify(): def try_modify():
@ -441,7 +448,11 @@ def on_groupchat_message(self, message):
self.events.trigger('muc_msg', message, tab) self.events.trigger('muc_msg', message, tab)
use_xhtml = config.get('enable_xhtml_im', True) use_xhtml = config.get('enable_xhtml_im', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images')
extract_images = config.get('extract_inline_images', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
if not body: if not body:
return return
@ -498,7 +509,11 @@ def on_groupchat_private_message(self, message):
room_from = jid.bare room_from = jid.bare
use_xhtml = config.get('enable_xhtml_im', True) use_xhtml = config.get('enable_xhtml_im', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) tmp_dir = config.get('tmp_image_dir', '') or path.join(CACHE_DIR, 'images')
extract_images = config.get('extract_inline_images', True)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
tab = self.get_tab_by_name(jid.full, tabs.PrivateTab) # get the tab with the private conversation tab = self.get_tab_by_name(jid.full, tabs.PrivateTab) # get the tab with the private conversation
ignore = config.get_by_tabname('ignore_private', False, room_from) ignore = config.get_by_tabname('ignore_private', False, room_from)
if not tab: # It's the first message we receive: create the tab if not tab: # It's the first message we receive: create the tab
@ -511,7 +526,9 @@ def on_groupchat_private_message(self, message):
self.xmpp.send_message(mto=jid.full, mbody=msg, mtype='chat') self.xmpp.send_message(mto=jid.full, mbody=msg, mtype='chat')
return return
self.events.trigger('private_msg', message, tab) self.events.trigger('private_msg', message, tab)
body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml) body = xhtml.get_body_from_message_stanza(message, use_xhtml=use_xhtml,
tmp_dir=tmp_dir,
extract_images=extract_images)
if not body or not tab: if not body or not tab:
return return
replaced_id = message['replace']['id'] replaced_id = message['replace']['id']

View file

@ -30,6 +30,7 @@ def main():
config.run_cmdline_args(config_path) config.run_cmdline_args(config_path)
config.create_global_config() config.create_global_config()
config.check_create_log_dir() config.check_create_log_dir()
config.check_create_cache_dir()
config.setup_logging() config.setup_logging()
config.post_logging_setup() config.post_logging_setup()

View file

@ -12,9 +12,13 @@ xhtml code to shell colors,
poezio colors to xhtml code poezio colors to xhtml code
""" """
import re import base64
import curses import curses
import hashlib
import re
from os import path
from sleekxmpp.xmlstream import ET from sleekxmpp.xmlstream import ET
from urllib.parse import unquote
from io import BytesIO from io import BytesIO
from xml import sax from xml import sax
@ -178,10 +182,12 @@ colors = {
whitespace_re = re.compile(r'\s+') whitespace_re = re.compile(r'\s+')
xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]') xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
xhtml_simple_attr_re = re.compile(r'\x19\d') xhtml_simple_attr_re = re.compile(r'\x19\d')
def get_body_from_message_stanza(message, use_xhtml=False): def get_body_from_message_stanza(message, use_xhtml=False,
tmp_dir=None, extract_images=False):
""" """
Returns a string with xhtml markups converted to Returns a string with xhtml markups converted to
poezio colors if there's an xhtml_im element, or poezio colors if there's an xhtml_im element, or
@ -191,7 +197,8 @@ def get_body_from_message_stanza(message, use_xhtml=False):
xhtml = message['html'].xml xhtml = message['html'].xml
xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body') xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body')
if xhtml_body: if xhtml_body:
content = xhtml_to_poezio_colors(xhtml_body) content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
extract_images=extract_images)
content = content if content else message['body'] content = content if content else message['body']
return content or " " return content or " "
return message['body'] return message['body']
@ -281,7 +288,7 @@ def trim(string):
return re.sub(whitespace_re, ' ', string) return re.sub(whitespace_re, ' ', string)
class XHTMLHandler(sax.ContentHandler): class XHTMLHandler(sax.ContentHandler):
def __init__(self, force_ns=False): def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
self.builder = [] self.builder = []
self.formatting = [] self.formatting = []
self.attrs = [] self.attrs = []
@ -291,6 +298,9 @@ class XHTMLHandler(sax.ContentHandler):
# do not care about xhtml-in namespace # do not care about xhtml-in namespace
self.force_ns = force_ns self.force_ns = force_ns
self.tmp_dir = tmp_dir
self.extract_images = extract_images
@property @property
def result(self): def result(self):
return ''.join(self.builder).strip() return ''.join(self.builder).strip()
@ -331,7 +341,22 @@ class XHTMLHandler(sax.ContentHandler):
elif name == 'em': elif name == 'em':
self.append_formatting('\x19i') self.append_formatting('\x19i')
elif name == 'img': elif name == 'img':
builder.append(trim(attrs['src'])) if re.match(xhtml_data_re, attrs['src']) and self.extract_images:
type_, data = [i for i in re.split(xhtml_data_re, attrs['src']) if i]
bin_data = base64.b64decode(unquote(data))
filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
filepath = path.join(self.tmp_dir, filename)
if not path.exists(filepath):
try:
with open(filepath, 'wb') as fd:
fd.write(bin_data)
builder.append('file://%s' % filepath)
except Exception as e:
builder.append('[Error while saving image: %s]' % e)
else:
builder.append('file://%s' % filepath)
else:
builder.append(trim(attrs['src']))
if 'alt' in attrs: if 'alt' in attrs:
builder.append(' (%s)' % trim(attrs['alt'])) builder.append(' (%s)' % trim(attrs['alt']))
elif name == 'ul': elif name == 'ul':
@ -389,13 +414,14 @@ class XHTMLHandler(sax.ContentHandler):
if 'title' in attrs: if 'title' in attrs:
builder.append(' [' + attrs['title'] + ']') builder.append(' [' + attrs['title'] + ']')
def xhtml_to_poezio_colors(xml, force=False): def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
if isinstance(xml, str): if isinstance(xml, str):
xml = xml.encode('utf8') xml = xml.encode('utf8')
elif not isinstance(xml, bytes): elif not isinstance(xml, bytes):
xml = ET.tostring(xml) xml = ET.tostring(xml)
handler = XHTMLHandler(force_ns=force) handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
extract_images=extract_images)
parser = sax.make_parser() parser = sax.make_parser()
parser.setFeature(sax.handler.feature_namespaces, True) parser.setFeature(sax.handler.feature_namespaces, True)
parser.setContentHandler(handler) parser.setContentHandler(handler)