From 85c5c2ef48a63aed62066cdfa9365a818b0e0047 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 11:17:53 +0100 Subject: [PATCH] Updated config file --- README.md | 2 +- default.toml | 4 +++ test.py | 99 ---------------------------------------------------- twoot.py | 19 +++++++--- 4 files changed, 19 insertions(+), 105 deletions(-) delete mode 100755 test.py diff --git a/README.md b/README.md index 304f428..42dfa55 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ is @superduperbot@botsin.space | -v | upload videos to Mastodon | *N/A* | No | | -r | Post reply-to tweets (ignored by default) | *N/A* | No | | -s | Skip retweets (posted by default) | *N/A* | No | -| -l | Remove link redirection | *N/A* | No | +| -l | Remove link redirections | *N/A* | No | | -u | Remove trackers from URLs | *N/A* | No | | -a | Max. age of tweet to post (in days) | `5` | No | | -d | Min. age before posting new tweet (in minutes) | `15` | No | diff --git a/default.toml b/default.toml index b5c2953..b0be8ea 100644 --- a/default.toml +++ b/default.toml @@ -21,6 +21,10 @@ post_reply_to = false # Default is false skip_retweets = false +# Replace redirected links in tweets with direct URLs +# Default is false +remove_link_redirections = false + # Clean up URLs in tweets to remove trackers # Default is false remove_trackers_from_urls = false diff --git a/test.py b/test.py deleted file mode 100755 index 5b4a630..0000000 --- a/test.py +++ /dev/null @@ -1,99 +0,0 @@ -#! /usr/bin/env python3 -# -*- coding: utf-8 -*- - -from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse -import requests - -def deredir_url(url): - """ - Given a URL, return the URL that the page really downloads from - :param url: url to be de-redirected - :return: direct url - """ - - ret = None - try: - # Download the page - ret = requests.get(url, timeout=5) - except: - # If anything goes wrong keep the URL intact - return url - - # Return the URL that the page was downloaded from - return ret.url - -def _remove_tracker_params(query_str): - """ - private function - Given a query string from a URL, strip out the known trackers - :param query_str: query to be cleaned - :return: query cleaned - """ - # Avalaible URL tracking parameters : - # UTM tags by Google Ads, M$ Ads, ... - # tag by TikTok - # tags by Snapchat - # tags by Facebook - params_to_remove = [ - "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", - "mkt_tok", - "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", - "media", "interest_group_name", - "xtor" - ] - query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) - query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] - return urlencode(query_cleaned, safe='#', doseq=True) - - -def _remove_trackers_fragment(fragment_str): - """ - private function - Given a fragment string from a URL, strip out the known trackers - :param query_str: fragment to be cleaned - :return: cleaned fragment - """ - - # Not implemented - # Unclear what, if anything, can be done - # Need better understanding of fragment-based tracking - # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ - - return fragment_str - - -def clean_url(dirty_url): - """ - Given a URL, return it with the UTM parameters removed from query and fragment - :param dirty_url: url to be cleaned - :return: url cleaned - >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') - 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' - """ - - url_parsed = urlparse(dirty_url, allow_fragments=False) - - cleaned_url = urlunparse([ - url_parsed.scheme, - url_parsed.netloc, - url_parsed.path, - url_parsed.params, - _remove_tracker_params(url_parsed.query), - _remove_trackers_fragment(url_parsed.fragment) - ]) - - return cleaned_url - -def main(): - # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok' - # url = "https://docs.helix-editor.com/keymap.html#movement" - # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title" - # url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim" - url = "https://shorturl.at/qwP38" - print('Orig: ' + url) - direct_url = deredir_url(url) - print('dir : ' + direct_url) - print('to : ' + clean_url(direct_url)) - -if __name__=="__main__": - main() diff --git a/twoot.py b/twoot.py index 1afbc39..c082ce5 100755 --- a/twoot.py +++ b/twoot.py @@ -409,6 +409,7 @@ def main(argv): 'upload_videos': False, 'post_reply_to': False, 'skip_retweets': False, + 'remove_link_redirections': False, 'remove_trackers_from_urls': False, 'tweet_max_age': float(1), 'tweet_delay': float(0), @@ -416,8 +417,7 @@ def main(argv): } # Default empty toml - # toml = {'config': {}, 'options': options} - toml = {} + toml = {'config': {}, 'options': options} # Load config file if it was provided toml_file = args['f'] @@ -446,6 +446,8 @@ def main(argv): toml['options']['post_reply_to'] = args['r'] if args['s'] is True: toml['options']['skip_retweets'] = args['s'] + if args['l'] is True: + toml['options']['remove_link_redirections'] = args['l'] if args['u'] is True: toml['options']['remove_trackers_from_urls'] = args['u'] if args['a'] is not None: @@ -646,7 +648,10 @@ def main(argv): tt_iter = status.find('div', class_='tweet-content media-body').children # Process text of tweet - tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers) + tweet_text += process_media_body(tt_iter, + toml['options']['remove_link_redirections'], + toml['options']['remove_trackers_from_urls'] + ) # Process quote: append link to tweet_text quote_div = status.find('a', class_='quote-link') @@ -661,8 +666,12 @@ def main(argv): # Process attachment: capture image or .mp4 url or download twitter video attachments_class = status.find('div', class_='attachments') if attachments_class is not None: - pics, vid_in_tweet = process_attachments(nitter_url, attachments_class, toml['options']['upload_videos'], toml['config']['twitter_account'], status_id, - author_account) + pics, vid_in_tweet = process_attachments(nitter_url, + attachments_class, + toml['options']['upload_videos'], + toml['config']['twitter_account'], + status_id, author_account + ) photos.extend(pics) if vid_in_tweet: tweet_text += '\n\n[Video embedded in original tweet]'