Merge branch 'clean_url'

2025-05-30 04:20:16 +00:00 · 2022-11-23 21:54:46 +01:00 · 2022-11-23 21:54:46 +01:00 · 8dbce09530
commit 8dbce09530
parent 65c3f73bb2 91ffbde963
3 changed files with 96 additions and 31 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,8 +1,14 @@
+**23 NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove
+redirection from links included in tweets. Obfuscated links are replaced
+by the URL that the resource is directly downloaded from. Also improved
+tracker removal by cleaning URL fragments as well (contrib: mathdatech,
+thanks!).
+
 **22 NOV 2022** VERSION 2.4 Added command-line option (`-u`) to
 remove tracking parameters from URLs included in tweets. A tracking URL
 is a normal URL with parameters attached to it. These parameters are used
 by marketing companies to identify the source of a click and the effectiveness
-of a communication campaign.
+of a communication campaign (contrib: mathdatech, thanks!).

 **15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to
 skip retweets. With this option, retweets will be ignored and not posted
--- a/README.md
+++ b/README.md
@ -1,13 +1,12 @@
 # Twoot

-Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account.
-It is simple to set-up on a local machine, configurable and feature-rich.
+**Twoot is a Python script that mirrors tweets from a Twitter account to a Mastodon account.
+It is simple to set up on a local machine, configurable and feature-rich.**

-**UPDATE 22 NOV 2022** VERSION 2.4 Added command-line option (`-u`) to
-remove tracking parameters from URLs included in tweets. A tracking URL is a
-normal URL with additional parameters attached to it. These parameters are used
-by marketing companies to identify the source of a click and the effectiveness
-of a communication campaign.
+**UPDATE 23 NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection
+from links included in tweets. Obfuscated links are replaced by the URL that the resource
+is directly downloaded from. Also improved tracker removal by cleaning URL fragments as well
+(contrib: mathdatech, thanks!).

 > Previous updates can be found in CHANGELOG.

@ -25,7 +24,7 @@ of a communication campaign.
 * Optionally ignore retweets
 * Allows rate-limiting posts to Mastodon instance

-## usage
+## Usage

 ```
 twoot.py [-h] -t <twitter account> -i <mastodon instance> -m <mastodon account>
@ -33,7 +32,7 @@ twoot.py [-h] -t <twitter account> -i <mastodon instance> -m <mastodon account>
                [-d <min delay (in mins)>] [-c <max # of toots to post>]
 ```

-## arguments
+## Arguments

 Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account
 is @superduperbot@botsin.space
@ -42,16 +41,27 @@ is @superduperbot@botsin.space
 |-------|--------------------------------------------------|--------------------|-----|
 | -t    | twitter account name without '@'                 | `SuperDuper`       | Yes |
 | -i    | Mastodon instance domain name                    | `botsin.space`     | Yes |
-| -m    | Mastodon username                                | `superduperbot`    | Yes |
+| -m    | Mastodon username                                | `sd@example.com`   | Yes |
 | -p    | Mastodon password                                | `my_Sup3r-S4f3*pw` | Yes |
 | -v    | upload videos to Mastodon                        | *N/A*              | No  |
 | -r    | Post reply-to tweets (ignored by default)        | *N/A*              | No  |
 | -s    | Skip retweets (posted by default)                | *N/A*              | No  |
+| -l    | Remove link redirection                          | *N/A*              | No  |
 | -u    | Remove trackers from URLs                        | *N/A*              | No  |
 | -a    | Max. age of tweet to post (in days)              | `5`                | No  |
 | -d    | Min. age before posting new tweet (in minutes)   | `15`               | No  |
 | -c    | Max number of toots allowed to post (cap)        | `1`                | No  |

+## Notes
+
+`-l` will follow every link included in the tweet and replace it with the URL that the
+resource is directly downloaded from (if applicable), e.g. bit.ly/xxyyyzz -> example.com.
+Every link visit can take up to 5 seconds (timeout); therefore, this option will slow down
+tweet processing.
+
+If you are interested in tracker removal (`-u`), you should also select redirection removal (`-l`),
+as trackers are often hidden behind the redirection of a short URL.
+
 When using the `-v` switch consider:

 * whether the copyright of the content that you want to cross-post allows it
@ -64,7 +74,7 @@ Default min delay is 0 minutes.

 No limitation is applied to the number of toots uploaded if `-c` is not specified.

-## installation
+## Installation

 Make sure python3 is installed.

--- a/twoot.py
+++ b/twoot.py
@ -40,7 +40,7 @@ MAX_REC_COUNT = 50

 # Set the desired verbosity of logging
 # One of logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL
-LOGGING_LEVEL = logging.WARNING
+LOGGING_LEVEL = logging.INFO

 # How many seconds to wait before giving up on a download (except video download)
 HTTPS_REQ_TIMEOUT = 10
@ -67,6 +67,38 @@ USER_AGENTS = [
 ]


+def deredir_url(url):
+    """
+    Given a URL, return the URL that the page really downloads from
+    :param url: url to be de-redirected
+    :return: direct url
+    """
+
+    # Get a copy of the default headers that requests would use
+    headers = requests.utils.default_headers()
+
+    # Update default headers with randomly selected user agent
+    headers.update(
+        {
+            'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)],
+        }
+    )
+
+    ret = None
+    try:
+        # Download the page
+        ret = requests.get(url, headers=headers, timeout=5)
+    except:
+        # If anything goes wrong keep the URL intact
+        return url
+
+    if ret.url != url:
+        logging.debug("Removed redirection from: " + url + " to: " + ret.url)
+
+    # Return the URL that the page was downloaded from
+    return ret.url
+
+
 def _remove_trackers_query(query_str):
    """
    private function
@ -79,15 +111,20 @@ def _remove_trackers_query(query_str):
    # tag by TikTok
    # tags by Snapchat
    # tags by Facebook
-    params_to_remove = [
-        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
+    params_to_remove = {
+        "gclid", "_ga", "gclsrc", "dclid",
+        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid", "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type",
        "mkt_tok",
        "campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
-        "media", "interest_group_name",
-        "xtor"
-    ]
+        "fbclid", "campaign_name", "ad_set_name", "ad_set_id", "media", "interest_group_name", "ad_set_id"
+        "igshid",
+        "cvid", "oicd", "msclkid",
+        "soc_src", "soc_trk",
+        "_openstat", "yclid",
+        "xtor", "xtref", "adid",
+    }
    query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
-    query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove]
+    query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
    return urlencode(query_cleaned, doseq=True)


@ -98,12 +135,15 @@ def _remove_trackers_fragment(fragment_str):
    :param query_str: fragment to be cleaned
    :return: cleaned fragment
    """
- 
-    # Not implemented
-    # Unclear what, if anything, can be done
-    # Need better understanding of fragment-based tracking
-    # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/

+    params_to_remove = {
+        "Echobox",
+    }
+
+    if '=' in fragment_str:
+        fragment_str = fragment_str.split('&')
+        query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove]
+        fragment_str = '&'.join(query_cleaned)
    return fragment_str


@ -133,11 +173,12 @@ def clean_url(dirty_url):
    return cleaned_url


-def process_media_body(tt_iter, remove_trackers):
+def process_media_body(tt_iter, remove_redir, remove_trackers):
    """
    Receives an iterator over all the elements contained in the tweet-text container.
    Processes them to make them suitable for posting on Mastodon
    :param tt_iter: iterator over the HTML elements in the text of the tweet
+    :param remove_redir: bool to indicate if redirections should be removed
    :param remove_trackers: bool to indicate if trackers should be removed
    :return:        cleaned up text of the tweet
    """
@ -158,11 +199,16 @@ def process_media_body(tt_iter, remove_trackers):
                # Only keep hashtag text
                tweet_text += tag_text
            else:
-                # This is a real link, keep url
-                if remove_trackers:
-                    tweet_text += clean_url(tag.get('href'))
+                # This is a real link
+                if remove_redir:
+                    url = deredir_url(tag.get('href'))
                else:
-                    tweet_text += tag.get('href')
+                    url = tag.get('href')
+
+                if remove_trackers:
+                    tweet_text += clean_url(url)
+                else:
+                    tweet_text += url
        else:
            logging.warning("No handler for tag in twitter text: " + tag.prettify())

@ -342,6 +388,7 @@ def main(argv):
    parser.add_argument('-p', metavar='<mastodon password>', action='store', required=True)
    parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
    parser.add_argument('-s', action='store_true', help='Suppress retweets')
+    parser.add_argument('-l', action='store_true', help='Remove link redirection')
    parser.add_argument('-u', action='store_true', help='Remove trackers from URLs')
    parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance')
    parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float, default=1)
@ -357,6 +404,7 @@ def main(argv):
    mast_password = args['p']
    tweets_and_replies = args['r']
    suppress_retweets = args['s']
+    remove_redir = args['l']
    remove_trackers = args['u']
    get_vids = args['v']
    max_age = float(args['a'])
@ -383,6 +431,7 @@ def main(argv):
    logging.info('    -m ' + mast_account)
    logging.info('    -r ' + str(tweets_and_replies))
    logging.info('    -s ' + str(suppress_retweets))
+    logging.info('    -l ' + str(remove_redir))
    logging.info('    -u ' + str(remove_trackers))
    logging.info('    -v ' + str(get_vids))
    logging.info('    -a ' + str(max_age))
@ -426,7 +475,7 @@ def main(argv):
    if tweets_and_replies:
        url += '/with_replies'

-    # Download twitter page of user.
+    # Download twitter page of user
    try:
        twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
@ -539,7 +588,7 @@ def main(argv):
        tt_iter = status.find('div', class_='tweet-content media-body').children

        # Process text of tweet
-        tweet_text += process_media_body(tt_iter, remove_trackers)
+        tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers)

        # Process quote: append link to tweet_text
        quote_div = status.find('a', class_='quote-link')