poezio/plugins/untrackme.py

"""
    UntrackMe wannabe plugin
"""

from typing import Callable, Dict, List, Tuple, Union

import re
import logging
from slixmpp import Message
from poezio import tabs
from poezio.plugin import BasePlugin
from urllib.parse import quote as urlquote


# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Union of the tab classes used to annotate the ``tab`` argument of
# ``Plugin.handle_msg``.
ChatTabs = Union[
    tabs.MucTab,
    tabs.DynamicConversationTab,
    tabs.StaticConversationTab,
    tabs.PrivateTab,
]

# Matches an http(s) link and splits it into:
#   host: everything after the scheme up to the first '/', '?', '#' or space
#   rest: whatever follows the host, up to the next space (may be empty)
# The host must not be allowed to contain spaces: otherwise a bare-domain
# link followed by more text ("https://example.com and https://other/x")
# would swallow the text up to the next '/', including any later link.
RE_URL: re.Pattern = re.compile(
    r'https?://(?P<host>[^/?# ]+)(?P<rest>[^ ]*)'
)

# Lookup table of hosts that get rewritten, keyed by the exact hostname found
# in a link. Each value is ``(service, proxy)``: ``service`` is the option name
# read from the plugin's 'services' config section, and ``proxy`` tells whether
# the original link is url-encoded and appended to the service (True) or only
# the domain is swapped (False).
SERVICES: Dict[str, Tuple[str, bool]] = {
    host: (service, proxied)
    for service, proxied, hosts in (
        ('invidious', False, (
            'm.youtube.com',
            'www.youtube.com',
            'youtube.com',
            'youtu.be',
            'youtube-nocookie.com',
        )),
        ('nitter', False, (
            'mobile.twitter.com',
            'www.twitter.com',
            'twitter.com',
        )),
        ('nitter_img', True, (
            'pic.twitter.com',
            'pbs.twimg.com',
        )),
        ('bibliogram', False, (
            'instagram.com',
            'www.instagram.com',
            'm.instagram.com',
        )),
    )
    for host in hosts
}

def proxy(service: str) -> Callable[[str], str]:
    """
        Build a rewriter for services that need the original url.

        The returned callable takes the original link, url-encodes it and
        appends it to ``service``.
    """
    def _append_quoted(origin: str) -> str:
        encoded = urlquote(origin)
        return service + encoded

    return _append_quoted


class Plugin(BasePlugin):
    """
        UntrackMe

        Rewrites links found in message bodies: tracking query parameters
        are removed ('cleanup' option) and links to the hosts listed in
        SERVICES are pointed at the alternative service configured in the
        'services' section ('redirect' option). The same handler is
        registered for the '*_say' and '*_msg' events of MUC, conversation
        and private tabs.
    """

    default_config: Dict[str, Dict[str, Union[bool, str]]] = {
        'default': {
            'cleanup': True,
            'redirect': True,
            'display_corrections': False,
        },
        'services': {
            'invidious': 'https://invidious.snopyta.org',
            'nitter': 'https://nitter.net',
            'bibliogram': 'https://bibliogram.art',
        },
    }

    # Query parameters removed by cleanup_url():
    # fbclid: used globally (Facebook)
    # utm_*: used globally https://en.wikipedia.org/wiki/UTM_parameters
    # ncid: DoubleClick (Google)
    # ref_src, ref_url: twitter
    # Others exist but are excluded because they are not common.
    _TRACKERS = frozenset((
        'fbclid',
        'dclid',
        'ncid',
        'utm_source',
        'utm_medium',
        'utm_campaign',
        'utm_term',
        'utm_content',
        'ref_src',
        'ref_url',
    ))

    def init(self):
        """
            Derive the 'nitter_img' service from the configured 'nitter'
            one and register handle_msg for every supported event.
        """
        nitter_img = self.config.get('nitter', section='services') + '/pic/'
        self.config.set('nitter_img', nitter_img, section='services')

        self.api.add_event_handler('muc_say', self.handle_msg)
        self.api.add_event_handler('conversation_say', self.handle_msg)
        self.api.add_event_handler('private_say', self.handle_msg)

        self.api.add_event_handler('muc_msg', self.handle_msg)
        self.api.add_event_handler('conversation_msg', self.handle_msg)
        self.api.add_event_handler('private_msg', self.handle_msg)

    def map_services(self, match: re.Match) -> str:
        """
            If it matches a host that we know about, change the domain for the
            alternative service. Some hosts needs to be proxied instead (such
            as twitter pictures), so they're url encoded and appended to the
            proxy service.

            ``match`` must expose the 'host' and 'rest' groups of RE_URL.
            Links to unknown hosts are returned unchanged.
        """

        host = match.group('host')

        dest = SERVICES.get(host)
        if dest is None:
            return match.group(0)

        # Named ``proxied`` so the module-level proxy() helper isn't shadowed.
        destname, proxied = dest
        replaced = self.config.get(destname, section='services')
        if not proxied:
            return replaced + match.group('rest')

        result = replaced + urlquote(match.group(0))

        # TODO: count parenthesis?
        # Restores a comma at the end of a link: it was most likely
        # punctuation caught by the url regex, and got encoded as '%2C'.
        # The whole three-character suffix has to be compared; a single
        # character can never equal '%2C'.
        if result.endswith('%2C'):
            result = result[:-3] + ','

        return result

    def handle_msg(self, msg: Message, tab: ChatTabs) -> None:
        """
            Rewrite msg['body'] in place according to the 'cleanup' and
            'redirect' options. When 'display_corrections' is set and the
            body changed, log both versions and show them in the
            information buffer.
        """
        orig = msg['body']

        if self.config.get('cleanup', section='default'):
            msg['body'] = self.cleanup_url(msg['body'])
        if self.config.get('redirect', section='default'):
            msg['body'] = self.redirect_url(msg['body'])

        if self.config.get('display_corrections', section='default') and \
           msg['body'] != orig:
            log.debug(
                'UntrackMe in tab \'%s\':\nOriginal: %s\nModified: %s',
                tab.name, orig, msg['body'],
            )

            self.api.information(
                'UntrackMe in tab \'{}\':\nOriginal: {}\nModified: {}'.format(
                    tab.name, orig, msg['body']
                ),
                'Info',
            )

    def cleanup_url(self, txt: str) -> str:
        """
            Return ``txt`` with the tracking parameters listed in _TRACKERS
            removed from the query string of every http(s) link.

            Every tracker of a link is removed (not only the last one), the
            remaining parameters are kept byte-for-byte, and no dangling
            '?' or '&' is left behind. Only the query string is examined;
            the path and the fragment are never touched.
        """
        return re.sub('https?://[^ ]+', self._strip_trackers, txt)

    def _strip_trackers(self, match: re.Match) -> str:
        """Drop tracking parameters from the single link held by ``match``."""
        url = match.group(0)

        # Set the fragment aside first: a '?' after '#' is not a query.
        head, hash_sep, fragment = url.partition('#')
        base, query_sep, query = head.partition('?')
        if not query_sep:
            return url

        params = query.split('&')
        kept = []
        for param in params:
            name, equals, _ = param.partition('=')
            # Only 'name=value' pairs with an exact tracker name are dropped.
            if equals and name in self._TRACKERS:
                continue
            kept.append(param)

        # Nothing removed: hand the link back exactly as it was written.
        if len(kept) == len(params):
            return url

        cleaned = base
        if any(kept):
            cleaned += '?' + '&'.join(kept)
        return cleaned + hash_sep + fragment

    def redirect_url(self, txt: str) -> str:
        """Return ``txt`` with every link passed through map_services."""
        return RE_URL.sub(self.map_services, txt)
plugins/untrackme: new plugin, based on remove_get_trackers Thanks Perdu for the original work! Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-08-13 23:05:25 +00:00			`"""`
			`UntrackMe wannabe plugin`
			`"""`

			`from typing import Callable, Dict, List, Tuple, Union`

			`import re`
			`import logging`
			`from slixmpp import Message`
			`from poezio import tabs`
			`from poezio.plugin import BasePlugin`
			`from urllib.parse import quote as urlquote`


			`log = logging.getLogger(__name__)`

			`ChatTabs = Union[`
			`tabs.MucTab,`
			`tabs.DynamicConversationTab,`
			`tabs.StaticConversationTab,`
			`tabs.PrivateTab,`
			`]`

			`RE_URL: re.Pattern = re.compile('https?://(?P<host>[^/]+)(?P<rest>[^ ]*)')`

			`SERVICES: Dict[str, Tuple[str, bool]] = { # host: (service, proxy)`
			`'m.youtube.com': ('invidious', False),`
			`'www.youtube.com': ('invidious', False),`
			`'youtube.com': ('invidious', False),`
			`'youtu.be': ('invidious', False),`
			`'youtube-nocookie.com': ('invidious', False),`
			`'mobile.twitter.com': ('nitter', False),`
			`'www.twitter.com': ('nitter', False),`
			`'twitter.com': ('nitter', False),`
			`'pic.twitter.com': ('nitter_img', True),`
			`'pbs.twimg.com': ('nitter_img', True),`
			`'instagram.com': ('bibliogram', False),`
			`'www.instagram.com': ('bibliogram', False),`
			`'m.instagram.com': ('bibliogram', False),`
			`}`

			`def proxy(service: str) -> Callable[[str], str]:`
			`"""Some services require the original url"""`
			`def inner(origin: str) -> str:`
			`return service + urlquote(origin)`
			`return inner`


			`class Plugin(BasePlugin):`
			`"""UntrackMe"""`

			`default_config: Dict[str, str] = {`
			`'default': {`
			`'cleanup': True,`
			`'redirect': True,`
plugins/untrackme: toggle display_corrections off Seems to work ok, and it's quite noisy in the information buffer. Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-09-08 07:51:06 +00:00			`'display_corrections': False,`
plugins/untrackme: new plugin, based on remove_get_trackers Thanks Perdu for the original work! Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-08-13 23:05:25 +00:00			`},`
			`'services': {`
plugins/untrackme: invidio.us closed. Changed default instance Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-09-08 13:56:43 +00:00			`'invidious': 'https://invidious.snopyta.org',`
plugins/untrackme: new plugin, based on remove_get_trackers Thanks Perdu for the original work! Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-08-13 23:05:25 +00:00			`'nitter': 'https://nitter.net',`
			`'bibliogram': 'https://bibliogram.art',`
			`},`
			`}`

			`def init(self):`
			`nitter_img = self.config.get('nitter', section='services') + '/pic/'`
			`self.config.set('nitter_img', nitter_img, section='services')`

			`self.api.add_event_handler('muc_say', self.handle_msg)`
			`self.api.add_event_handler('conversation_say', self.handle_msg)`
			`self.api.add_event_handler('private_say', self.handle_msg)`

			`self.api.add_event_handler('muc_msg', self.handle_msg)`
			`self.api.add_event_handler('conversation_msg', self.handle_msg)`
			`self.api.add_event_handler('private_msg', self.handle_msg)`

			`def map_services(self, match: re.Match) -> str:`
plugins/untrackme: docstring to map_services method Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-08-13 23:13:12 +00:00			`"""`
			`If it matches a host that we know about, change the domain for the`
			`alternative service. Some hosts needs to be proxied instead (such`
			`as twitter pictures), so they're url encoded and appended to the`
			`proxy service.`
			`"""`

plugins/untrackme: new plugin, based on remove_get_trackers Thanks Perdu for the original work! Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-08-13 23:05:25 +00:00			`host = match.group('host')`

			`dest = SERVICES.get(host)`
			`if dest is None:`
			`return match.group(0)`

			`destname, proxy = dest`
			`replaced = self.config.get(destname, section='services')`
			`result = replaced + match.group('rest')`

			`if proxy:`
			`url = urlquote(match.group(0))`
			`result = replaced + url`

			`# TODO: count parenthesis?`
			`# Removes comma at the end of a link.`
			`if result[-3] == '%2C':`
			`result = result[:-3] + ','`

			`return result`

			`def handle_msg(self, msg: Message, tab: ChatTabs) -> None:`
			`orig = msg['body']`

			`if self.config.get('cleanup', section='default'):`
			`msg['body'] = self.cleanup_url(msg['body'])`
			`if self.config.get('redirect', section='default'):`
			`msg['body'] = self.redirect_url(msg['body'])`

			`if self.config.get('display_corrections', section='default') and \`
			`msg['body'] != orig:`
plugins/untrackme: move logging to a less noisy place Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-09-09 15:17:10 +00:00			`log.debug(`
			`'UntrackMe in tab \'%s\':\nOriginal: %s\nModified: %s',`
			`tab.name, orig, msg['body'],`
			`)`

plugins/untrackme: new plugin, based on remove_get_trackers Thanks Perdu for the original work! Signed-off-by: Maxime “pep” Buquet <pep@bouah.net> 2020-08-13 23:05:25 +00:00			`self.api.information(`
			`'UntrackMe in tab \'{}\':\nOriginal: {}\nModified: {}'.format(`
			`tab.name, orig, msg['body']`
			`),`
			`'Info',`
			`)`

			`def cleanup_url(self, txt: str) -> str:`
			`# fbclid: used globally (Facebook)`
			`# utm_*: used globally https://en.wikipedia.org/wiki/UTM_parameters`
			`# ncid: DoubleClick (Google)`
			`# ref_src, ref_url: twitter`
			`# Others exist but are excluded because they are not common.`
			`# See https://en.wikipedia.org/wiki/UTM_parameters`
			`return re.sub('(https?://[^ ]+)&?(fbclid\|dclid\|ncid\|utm_source\|utm_medium\|utm_campaign\|utm_term\|utm_content\|ref_src\|ref_url)=[^ &#]*',`
			`r'\1',`
			`txt)`

			`def redirect_url(self, txt: str) -> str:`
			`return RE_URL.sub(self.map_services, txt)`