Merge branch 'clean_url' into cfg_file

# Conflicts:
#	twoot.py
This commit is contained in:
jeancf 2022-11-23 11:00:06 +01:00
commit 0b58df16e2
4 changed files with 216 additions and 31 deletions


@@ -1,3 +1,9 @@
**XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to
remove tracking parameters from URLs included in tweets. A tracking URL
is a normal URL with extra parameters appended to it. Marketing companies
use these parameters to identify the source of a click and measure the
effectiveness of a communication campaign.
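As an illustration of what removing such parameters involves (a minimal sketch, not code from this commit; `strip_utm` is a hypothetical helper name), Python's standard library is enough:

```python
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse

def strip_utm(url):
    """Drop utm_* tracking parameters from a URL's query string (illustration only)."""
    parts = urlparse(url)
    # Keep only the query parameters that are not utm_* trackers
    kept = [(k, v) for k, v in parse_qsl(parts.query, keep_blank_values=True)
            if not k.startswith('utm_')]
    return urlunparse(parts._replace(query=urlencode(kept)))

print(strip_utm('https://example.com/page?utm_source=Twitter&utm_medium=video&id=42'))
# → https://example.com/page?id=42
```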
**15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to
skip retweets. With this option, retweets will be ignored and not posted
on Mastodon.


@@ -1,11 +1,11 @@
# Twoot

Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account.
It is simple to set up on a local machine, configurable and feature-rich.

**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection
from links included in tweets. Obfuscated links are replaced by the URL that the resource
is directly downloaded from.

> Previous updates can be found in CHANGELOG.
@@ -23,15 +23,15 @@ on Mastodon.
* Optionally ignore retweets
* Allows rate-limiting posts to Mastodon instance

## Usage

```
twoot.py [-h] -t <twitter account> -i <mastodon instance> -m <mastodon account>
         -p <mastodon password> [-r] [-s] [-u] [-v] [-a <max age (in days)>]
         [-d <min delay (in mins)>] [-c <max # of toots to post>]
```

## Arguments

Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account
is @superduperbot@botsin.space
@@ -40,15 +40,24 @@ is @superduperbot@botsin.space
|-------|--------------------------------------------------|--------------------|-----|
| -t    | twitter account name without '@'                 | `SuperDuper`       | Yes |
| -i    | Mastodon instance domain name                    | `botsin.space`     | Yes |
| -m    | Mastodon username                                | `sd@example.com`   | Yes |
| -p    | Mastodon password                                | `my_Sup3r-S4f3*pw` | Yes |
| -v    | Upload videos to Mastodon                        | *N/A*              | No  |
| -r    | Post reply-to tweets (ignored by default)        | *N/A*              | No  |
| -s    | Skip retweets (posted by default)                | *N/A*              | No  |
| -l    | Remove link redirection                          | *N/A*              | No  |
| -u    | Remove trackers from URLs                        | *N/A*              | No  |
| -a    | Max. age of tweet to post (in days)              | `5`                | No  |
| -d    | Min. delay before posting new tweet (in minutes) | `15`               | No  |
| -c    | Max. number of toots allowed to post (cap)       | `1`                | No  |
## Notes

`-l` will follow every link included in the tweet and replace it with the URL that the
resource is directly downloaded from (if applicable), e.g. bit.ly/xxyyyzz -> example.com.
Every link visit can take up to 5 seconds (the timeout), so this option slows down
tweet processing.
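The behaviour described above can be sketched roughly as follows (a minimal illustration under stated assumptions, not the script's actual implementation; `resolve_redirect` is a hypothetical name):

```python
import requests

def resolve_redirect(url, timeout=5):
    """Follow any redirects and return the final URL; keep the original on error."""
    try:
        # requests follows redirects by default; .url is the final address
        return requests.get(url, timeout=timeout).url
    except requests.exceptions.RequestException:
        # Network error, bad host, or timeout: leave the link untouched
        return url
```

The per-link timeout is what bounds the extra processing time mentioned above.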
When using the `-v` switch consider:

* whether the copyright of the content that you want to cross-post allows it
@@ -61,7 +70,8 @@ Default min delay is 0 minutes.
No limitation is applied to the number of toots uploaded if `-c` is not specified.
## Installation

Make sure python3 is installed.
@@ -104,5 +114,5 @@ Twoot is known to be used for the following feeds (older first):
## Background

I started twoot when [tootbot](https://github.com/cquest/tootbot)
stopped working. Tootbot relied on RSS feeds from https://twitrss.me
that broke when Twitter refreshed their web UI in July 2019.

test.py Executable file

@@ -0,0 +1,99 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse

import requests


def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from
    :param url: url to be de-redirected
    :return: direct url
    """
    try:
        # Download the page
        ret = requests.get(url, timeout=5)
    except requests.exceptions.RequestException:
        # If anything goes wrong keep the URL intact
        return url

    # Return the URL that the page was downloaded from
    return ret.url


def _remove_tracker_params(query_str):
    """
    private function
    Given a query string from a URL, strip out the known trackers
    :param query_str: query to be cleaned
    :return: cleaned query
    """
    # Available URL tracking parameters:
    # UTM tags by Google Ads, M$ Ads, ...
    # tag by TikTok
    # tags by Snapchat
    # tags by Facebook
    params_to_remove = [
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
        "mkt_tok",
        "campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
        "media", "interest_group_name",
        "xtor"
    ]
    query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
    query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
    return urlencode(query_cleaned, safe='#', doseq=True)


def _remove_trackers_fragment(fragment_str):
    """
    private function
    Given a fragment string from a URL, strip out the known trackers
    :param fragment_str: fragment to be cleaned
    :return: cleaned fragment
    """
    # Not implemented
    # Unclear what, if anything, can be done
    # Need better understanding of fragment-based tracking
    # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/
    return fragment_str


def clean_url(dirty_url):
    """
    Given a URL, return it with the UTM parameters removed from query and fragment
    :param dirty_url: url to be cleaned
    :return: cleaned url

    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
    """
    url_parsed = urlparse(dirty_url, allow_fragments=False)

    cleaned_url = urlunparse([
        url_parsed.scheme,
        url_parsed.netloc,
        url_parsed.path,
        url_parsed.params,
        _remove_tracker_params(url_parsed.query),
        _remove_trackers_fragment(url_parsed.fragment)
    ])

    return cleaned_url


def main():
    # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok'
    # url = "https://docs.helix-editor.com/keymap.html#movement"
    # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title"
    # url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim"
    url = "https://shorturl.at/qwP38"
    print('Orig: ' + url)
    direct_url = deredir_url(url)
    print('dir : ' + direct_url)
    print('to  : ' + clean_url(direct_url))


if __name__ == "__main__":
    main()

twoot.py

@@ -46,13 +46,13 @@ LOGGING_LEVEL = logging.DEBUG
HTTPS_REQ_TIMEOUT = 10

NITTER_URLS = [
    'https://nitter.lacontrevoie.fr',
    'https://nitter.pussthecat.org',
    'https://nitter.fdn.fr',
    'https://nitter.eu',
    'https://nitter.namazso.eu',
    'https://n.l5.ca',
    'https://nitter.bus-hit.me',
]
# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
@@ -67,7 +67,39 @@ USER_AGENTS = [
]


def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from
    :param url: url to be de-redirected
    :return: direct url
    """
    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()

    # Update default headers with randomly selected user agent
    headers.update(
        {
            'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)],
        }
    )

    try:
        # Download the page
        ret = requests.get(url, headers=headers, timeout=5)
    except requests.exceptions.RequestException:
        # If anything goes wrong keep the URL intact
        return url

    if ret.url != url:
        logging.debug("Removed redirection from: " + url + " to: " + ret.url)

    # Return the URL that the page was downloaded from
    return ret.url
def _remove_trackers_query(query_str):
    """
    private function
    Given a query string from a URL, strip out the known trackers
@@ -79,25 +111,49 @@ def _remove_tracker_params(query_str):
    # tag by TikTok
    # tags by Snapchat
    # tags by Facebook
    params_to_remove = {
        "gclid", "_ga", "gclsrc", "dclid",
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
        "utm_cid", "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type",
        "mkt_tok",
        "campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
        "fbclid", "media", "interest_group_name",
        "igshid",
        "cvid", "oicd", "msclkid",
        "soc_src", "soc_trk",
        "_openstat", "yclid",
        "xtor", "xtref", "adid",
    }
    query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
    query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
    return urlencode(query_cleaned, doseq=True)


def _remove_trackers_fragment(fragment_str):
    """
    private function
    Given a fragment string from a URL, strip out the known trackers
    :param fragment_str: fragment to be cleaned
    :return: cleaned fragment
    """
    params_to_remove = {
        "Echobox",
    }

    if '=' in fragment_str:
        fragment_str = fragment_str.split('&')
        query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove]
        fragment_str = '&'.join(query_cleaned)
    return fragment_str


def clean_url(dirty_url):
    """
    Given a URL, return it with the UTM parameters removed from query and fragment
    :param dirty_url: url to be cleaned
    :return: cleaned url

    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
    """
    url_parsed = urlparse(dirty_url)
@@ -107,18 +163,23 @@ def clean_url(dirty_url):
        url_parsed.netloc,
        url_parsed.path,
        url_parsed.params,
        _remove_trackers_query(url_parsed.query),
        _remove_trackers_fragment(url_parsed.fragment)
    ])

    if cleaned_url != dirty_url:
        logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url)

    return cleaned_url


def process_media_body(tt_iter, remove_redir, remove_trackers):
    """
    Receives an iterator over all the elements contained in the tweet-text container.
    Processes them to make them suitable for posting on Mastodon
    :param tt_iter: iterator over the HTML elements in the text of the tweet
    :param remove_redir: bool to indicate if redirections should be removed
    :param remove_trackers: bool to indicate if trackers should be removed
    :return: cleaned up text of the tweet
    """
    tweet_text = ''
@@ -138,8 +199,16 @@ def process_media_body(tt_iter):
                # Only keep hashtag text
                tweet_text += tag_text
            else:
                # This is a real link
                if remove_redir:
                    url = deredir_url(tag.get('href'))
                else:
                    url = tag.get('href')

                if remove_trackers:
                    tweet_text += clean_url(url)
                else:
                    tweet_text += url
        else:
            logging.warning("No handler for tag in twitter text: " + tag.prettify())
@@ -319,7 +388,8 @@ def main(argv):
    parser.add_argument('-m', metavar='<mastodon account>', action='store')
    parser.add_argument('-p', metavar='<mastodon password>', action='store')
    parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
    parser.add_argument('-s', action='store_true', help='Suppress retweets')
    parser.add_argument('-l', action='store_true', help='Remove link redirection')
    parser.add_argument('-u', action='store_true', help='Remove trackers from URLs')
    parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance')
    parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float)
@@ -463,7 +533,7 @@ def main(argv):
    if toml['options']['post_reply_to']:
        url += '/with_replies'

    # Download twitter page of user
    try:
        twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
@@ -576,7 +646,7 @@ def main(argv):
        tt_iter = status.find('div', class_='tweet-content media-body').children

        # Process text of tweet
        tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers)

        # Process quote: append link to tweet_text
        quote_div = status.find('a', class_='quote-link')