Updated config file

This commit is contained in:
jeancf 2022-11-23 11:17:53 +01:00
parent 0b58df16e2
commit 85c5c2ef48
4 changed files with 19 additions and 105 deletions

View File

@ -45,7 +45,7 @@ is @superduperbot@botsin.space
| -v | upload videos to Mastodon | *N/A* | No |
| -r | Post reply-to tweets (ignored by default) | *N/A* | No |
| -s | Skip retweets (posted by default) | *N/A* | No |
| -l | Remove link redirection | *N/A* | No |
| -l | Remove link redirections | *N/A* | No |
| -u | Remove trackers from URLs | *N/A* | No |
| -a | Max. age of tweet to post (in days) | `5` | No |
| -d | Min. age before posting new tweet (in minutes) | `15` | No |

View File

@ -21,6 +21,10 @@ post_reply_to = false
# Default is false
skip_retweets = false
# Replace redirected links in tweets with direct URLs
# Default is false
remove_link_redirections = false
# Clean up URLs in tweets to remove trackers
# Default is false
remove_trackers_from_urls = false

99
test.py
View File

@ -1,99 +0,0 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
import requests
def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from
    (i.e. follow any HTTP redirections).

    :param url: url to be de-redirected
    :return: the final URL after redirections, or the original URL
             unchanged if the request failed
    """
    try:
        # Download the page; requests follows redirects by default.
        # The timeout keeps a dead host from stalling the caller.
        ret = requests.get(url, timeout=5)
    except requests.RequestException:
        # Narrowed from a bare `except:` (which also swallowed
        # SystemExit/KeyboardInterrupt). On any request failure,
        # keep the URL intact — deliberate best-effort behaviour.
        return url
    # Return the URL that the page was actually downloaded from
    return ret.url
def _remove_tracker_params(query_str):
"""
private function
Given a query string from a URL, strip out the known trackers
:param query_str: query to be cleaned
:return: query cleaned
"""
# Avalaible URL tracking parameters :
# UTM tags by Google Ads, M$ Ads, ...
# tag by TikTok
# tags by Snapchat
# tags by Facebook
params_to_remove = [
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
"mkt_tok",
"campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
"media", "interest_group_name",
"xtor"
]
query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove]
return urlencode(query_cleaned, safe='#', doseq=True)
def _remove_trackers_fragment(fragment_str):
"""
private function
Given a fragment string from a URL, strip out the known trackers
:param query_str: fragment to be cleaned
:return: cleaned fragment
"""
# Not implemented
# Unclear what, if anything, can be done
# Need better understanding of fragment-based tracking
# https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/
return fragment_str
def clean_url(dirty_url):
    """
    Given a URL, return it with the UTM parameters removed from query and fragment

    :param dirty_url: url to be cleaned
    :return: url cleaned
    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
    """
    # allow_fragments=False keeps any '#...' part attached to the query
    # component, so the tracker-stripping pass sees it too.
    parts = urlparse(dirty_url, allow_fragments=False)
    cleaned_query = _remove_tracker_params(parts.query)
    cleaned_fragment = _remove_trackers_fragment(parts.fragment)
    # Reassemble the URL from its components, with trackers removed
    return urlunparse((
        parts.scheme,
        parts.netloc,
        parts.path,
        parts.params,
        cleaned_query,
        cleaned_fragment,
    ))
def main():
    """Manual smoke test: resolve a shortened URL, then strip its trackers."""
    # Other sample URLs exercised during development, kept for reference:
    # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok'
    # url = "https://docs.helix-editor.com/keymap.html#movement"
    # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title"
    # url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim"
    url = "https://shorturl.at/qwP38"
    print('Orig: ' + url)
    # Step 1: follow redirections to the real target
    resolved = deredir_url(url)
    print('dir : ' + resolved)
    # Step 2: strip tracking parameters from the resolved URL
    print('to : ' + clean_url(resolved))


if __name__ == "__main__":
    main()

View File

@ -409,6 +409,7 @@ def main(argv):
'upload_videos': False,
'post_reply_to': False,
'skip_retweets': False,
'remove_link_redirections': False,
'remove_trackers_from_urls': False,
'tweet_max_age': float(1),
'tweet_delay': float(0),
@ -416,8 +417,7 @@ def main(argv):
}
# Default empty toml
# toml = {'config': {}, 'options': options}
toml = {}
toml = {'config': {}, 'options': options}
# Load config file if it was provided
toml_file = args['f']
@ -446,6 +446,8 @@ def main(argv):
toml['options']['post_reply_to'] = args['r']
if args['s'] is True:
toml['options']['skip_retweets'] = args['s']
if args['l'] is True:
toml['options']['remove_link_redirections'] = args['l']
if args['u'] is True:
toml['options']['remove_trackers_from_urls'] = args['u']
if args['a'] is not None:
@ -646,7 +648,10 @@ def main(argv):
tt_iter = status.find('div', class_='tweet-content media-body').children
# Process text of tweet
tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers)
tweet_text += process_media_body(tt_iter,
toml['options']['remove_link_redirections'],
toml['options']['remove_trackers_from_urls']
)
# Process quote: append link to tweet_text
quote_div = status.find('a', class_='quote-link')
@ -661,8 +666,12 @@ def main(argv):
# Process attachment: capture image or .mp4 url or download twitter video
attachments_class = status.find('div', class_='attachments')
if attachments_class is not None:
pics, vid_in_tweet = process_attachments(nitter_url, attachments_class, toml['options']['upload_videos'], toml['config']['twitter_account'], status_id,
author_account)
pics, vid_in_tweet = process_attachments(nitter_url,
attachments_class,
toml['options']['upload_videos'],
toml['config']['twitter_account'],
status_id, author_account
)
photos.extend(pics)
if vid_in_tweet:
tweet_text += '\n\n[Video embedded in original tweet]'