From 40d14c4d5db49deb08a70376cb7bc01d159b4776 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:05:16 +0100 Subject: [PATCH 01/13] Added de-redirection of URL in tweet --- twoot.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/twoot.py b/twoot.py index f01e710..cd15e73 100755 --- a/twoot.py +++ b/twoot.py @@ -66,6 +66,33 @@ USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Vivaldi/5.4.2753.51', ] +def deredir_url(url): + """ + Given a URL, return the URL that the page really downloads from + :param url: url to be de-redirected + :return: direct url + """ + + # Get a copy of the default headers that requests would use + headers = requests.utils.default_headers() + + # Update default headers with randomly selected user agent + headers.update( + { + 'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)], + } + ) + + ret = None + try: + # Download the page + ret = requests.get(url, headers, timeout=5) + except: + # If anything goes wrong keep the URL intact + return url + + # Return the URL that the page was downloaded from + return ret.url def _remove_trackers_query(query_str): """ @@ -158,11 +185,12 @@ def process_media_body(tt_iter, remove_trackers): # Only keep hashtag text tweet_text += tag_text else: - # This is a real link, keep url + # This is a real link + url = deredir_url(tag.get('href')) if remove_trackers: - tweet_text += clean_url(tag.get('href')) + tweet_text += clean_url(url) else: - tweet_text += tag.get('href') + tweet_text += url else: logging.warning("No handler for tag in twitter text: " + tag.prettify()) @@ -426,7 +454,7 @@ def main(argv): if tweets_and_replies: url += '/with_replies' - # Download twitter page of user. 
+ # Download twitter page of user try: twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT) except requests.exceptions.ConnectionError: From 68e4918b02804e5d2a782d0e2b17d6ac0742a650 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:08:29 +0100 Subject: [PATCH 02/13] Added debug message --- twoot.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/twoot.py b/twoot.py index cd15e73..43bc435 100755 --- a/twoot.py +++ b/twoot.py @@ -91,6 +91,9 @@ def deredir_url(url): # If anything goes wrong keep the URL intact return url + if ret.url != url: + logging.debug("Removed redirection from: " + url + " to: " + ret.url) + # Return the URL that the page was downloaded from return ret.url From e11102f4a6018b8bda94d12f62332d8d87525ee7 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:33:45 +0100 Subject: [PATCH 03/13] User agent removed --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 43bc435..b987679 100755 --- a/twoot.py +++ b/twoot.py @@ -86,7 +86,7 @@ def deredir_url(url): ret = None try: # Download the page - ret = requests.get(url, headers, timeout=5) + ret = requests.get(url, timeout=5) except: # If anything goes wrong keep the URL intact return url From 9625c2128bc866cc450347fc892310731e68fef0 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:38:49 +0100 Subject: [PATCH 04/13] modified get request in deredir_url() --- twoot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index b987679..8fb3d5e 100755 --- a/twoot.py +++ b/twoot.py @@ -86,7 +86,7 @@ def deredir_url(url): ret = None try: # Download the page - ret = requests.get(url, timeout=5) + ret = requests.get(url, headers=headers, timeout=5) except: # If anything goes wrong keep the URL intact return url @@ -97,6 +97,7 @@ def deredir_url(url): # Return the URL that the page was downloaded from return ret.url + def _remove_trackers_query(query_str): 
""" private function From 9b5a76db60aeb8bb9af67f33e94439896ceca8df Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 12:50:34 +0100 Subject: [PATCH 05/13] updated README.md --- README.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d117fda..17700d3 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,7 @@ Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. It is simple to set-up on a local machine, configurable and feature-rich. -**UPDATE 22 NOV 2022** VERSION 2.4 Added command-line option (`-u`) to -remove tracking parameters from URLs included in tweets. A tracking URL is a -normal URL with additional parameters attached to it. These parameters are used -by marketing companies to identify the source of a click and the effectiveness -of a communication campaign. +**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) > Previous updates can be found in CHANGELOG. @@ -25,7 +21,7 @@ of a communication campaign. 
* Optionally ignore retweets * Allows rate-limiting posts to Mastodon instance -## usage +## Usage ``` twoot.py [-h] -t -i -m @@ -33,7 +29,7 @@ twoot.py [-h] -t -i -m [-d ] [-c ] ``` -## arguments +## Arguments Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account is @superduperbot@botsin.space @@ -42,16 +38,24 @@ is @superduperbot@botsin.space |-------|--------------------------------------------------|--------------------|-----| | -t | twitter account name without '@' | `SuperDuper` | Yes | | -i | Mastodon instance domain name | `botsin.space` | Yes | -| -m | Mastodon username | `superduperbot` | Yes | +| -m | Mastodon username | `sd@example.com` | Yes | | -p | Mastodon password | `my_Sup3r-S4f3*pw` | Yes | | -v | upload videos to Mastodon | *N/A* | No | | -r | Post reply-to tweets (ignored by default) | *N/A* | No | | -s | Skip retweets (posted by default) | *N/A* | No | +| -l | Remove link redirection | *N/A* | No | | -u | Remove trackers from URLs | *N/A* | No | | -a | Max. age of tweet to post (in days) | `5` | No | | -d | Min. age before posting new tweet (in minutes) | `15` | No | | -c | Max number of toots allowed to post (cap) | `1` | No | +## Notes + +`-l` will follow every link included in the tweet and replace them with the url that the +resource is directly downloaded from (if applicable). e.g. bit.ly/xxyyyzz -> example.com +Every link visit can take up to 5 sec (timeout) therefore this option will slow down +tweet processing. + When using the `-v` switch consider: * whether the copyright of the content that you want to cross-post allows it @@ -64,7 +68,8 @@ Default min delay is 0 minutes. No limitation is applied to the number of toots uploaded if `-c` is not specified. -## installation + +## Installation Make sure python3 is installed. 
From 0d1be42dcc473ac6d6f3cac6c96829bdcbd50270 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 22:01:27 +0100 Subject: [PATCH 06/13] Added code to remove trackers from fragments --- twoot.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/twoot.py b/twoot.py index 8fb3d5e..15b72ee 100755 --- a/twoot.py +++ b/twoot.py @@ -110,13 +110,18 @@ def _remove_trackers_query(query_str): # tag by TikTok # tags by Snapchat # tags by Facebook - params_to_remove = [ - "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + params_to_remove = { + "gclid", "_ga", "gclsrc", "dclid", + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid", "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type", "mkt_tok", "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", - "media", "interest_group_name", - "xtor" - ] + "fbclid", "campaign_name", "ad_set_name", "ad_set_id", "media", "interest_group_name", "ad_set_id" + "igshid", + "cvid", "oicd", "msclkid", + "soc_src", "soc_trk", + "_openstat", "yclid", + "xtor", "xtref", "adid", + } query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] return urlencode(query_cleaned, doseq=True) @@ -129,12 +134,15 @@ def _remove_trackers_fragment(fragment_str): :param query_str: fragment to be cleaned :return: cleaned fragment """ - - # Not implemented - # Unclear what, if anything, can be done - # Need better understanding of fragment-based tracking - # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ + params_to_remove = { + "Echobox", + } + + if '=' in fragment_str: + fragment_str = fragment_str.split('&') + query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove] + fragment_str = '&'.join(query_cleaned) return fragment_str From 7e7fa4620ffcfd89bdca727595711a6029e90a34 Mon Sep 17 
00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 09:59:06 +0100 Subject: [PATCH 07/13] Implemented -l command-line option --- twoot.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/twoot.py b/twoot.py index 15b72ee..f98d9cb 100755 --- a/twoot.py +++ b/twoot.py @@ -66,6 +66,7 @@ USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Vivaldi/5.4.2753.51', ] + def deredir_url(url): """ Given a URL, return the URL that the page really downloads from @@ -123,7 +124,7 @@ def _remove_trackers_query(query_str): "xtor", "xtref", "adid", } query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) - query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] + query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove] return urlencode(query_cleaned, doseq=True) @@ -138,7 +139,7 @@ def _remove_trackers_fragment(fragment_str): params_to_remove = { "Echobox", } - + if '=' in fragment_str: fragment_str = fragment_str.split('&') query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove] @@ -172,7 +173,7 @@ def clean_url(dirty_url): return cleaned_url -def process_media_body(tt_iter, remove_trackers): +def process_media_body(tt_iter, remove_redir, remove_trackers): """ Receives an iterator over all the elements contained in the tweet-text container. 
Processes them to make them suitable for posting on Mastodon @@ -198,7 +199,11 @@ def process_media_body(tt_iter, remove_trackers): tweet_text += tag_text else: # This is a real link - url = deredir_url(tag.get('href')) + if remove_redir: + url = deredir_url(tag.get('href')) + else: + url = tag.get('href') + if remove_trackers: tweet_text += clean_url(url) else: @@ -382,6 +387,7 @@ def main(argv): parser.add_argument('-p', metavar='', action='store', required=True) parser.add_argument('-r', action='store_true', help='Also post replies to other tweets') parser.add_argument('-s', action='store_true', help='Suppress retweets') + parser.add_argument('-l', action='store_true', help='Remove link redirection') parser.add_argument('-u', action='store_true', help='Remove trackers from URLs') parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance') parser.add_argument('-a', metavar='', action='store', type=float, default=1) @@ -397,6 +403,7 @@ def main(argv): mast_password = args['p'] tweets_and_replies = args['r'] suppress_retweets = args['s'] + remove_redir = args['l'] remove_trackers = args['u'] get_vids = args['v'] max_age = float(args['a']) @@ -423,6 +430,7 @@ def main(argv): logging.info(' -m ' + mast_account) logging.info(' -r ' + str(tweets_and_replies)) logging.info(' -s ' + str(suppress_retweets)) + logging.info(' -l ' + str(remove_redir)) logging.info(' -u ' + str(remove_trackers)) logging.info(' -v ' + str(get_vids)) logging.info(' -a ' + str(max_age)) @@ -579,7 +587,7 @@ def main(argv): tt_iter = status.find('div', class_='tweet-content media-body').children # Process text of tweet - tweet_text += process_media_body(tt_iter, remove_trackers) + tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers) # Process quote: append link to tweet_text quote_div = status.find('a', class_='quote-link') From 3930acc93ffca633893bc8a9b3cf260ef51e7be7 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 
09:59:45 +0100 Subject: [PATCH 08/13] Updated README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 17700d3..304f428 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. It is simple to set-up on a local machine, configurable and feature-rich. -**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) +**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection +from links included in tweets. Obfuscated links are replaced by the URL that the resource +is directly downloaded from. > Previous updates can be found in CHANGELOG. From f0b5ee98d2d1b1316d7a3d76bc7429fb07229497 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 10:50:41 +0100 Subject: [PATCH 09/13] Added missing parameter in docstring --- test.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ twoot.py | 1 + 2 files changed, 100 insertions(+) create mode 100755 test.py diff --git a/test.py b/test.py new file mode 100755 index 0000000..5b4a630 --- /dev/null +++ b/test.py @@ -0,0 +1,99 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + +from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse +import requests + +def deredir_url(url): + """ + Given a URL, return the URL that the page really downloads from + :param url: url to be de-redirected + :return: direct url + """ + + ret = None + try: + # Download the page + ret = requests.get(url, timeout=5) + except: + # If anything goes wrong keep the URL intact + return url + + # Return the URL that the page was downloaded from + return ret.url + +def _remove_tracker_params(query_str): + """ + private function + Given a query string from a URL, strip out the known trackers + :param query_str: query to be cleaned + :return: query cleaned + """ + # Avalaible URL tracking parameters : + # UTM tags by Google Ads, M$ Ads, ... 
+ # tag by TikTok + # tags by Snapchat + # tags by Facebook + params_to_remove = [ + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "mkt_tok", + "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", + "media", "interest_group_name", + "xtor" + ] + query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) + query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] + return urlencode(query_cleaned, safe='#', doseq=True) + + +def _remove_trackers_fragment(fragment_str): + """ + private function + Given a fragment string from a URL, strip out the known trackers + :param query_str: fragment to be cleaned + :return: cleaned fragment + """ + + # Not implemented + # Unclear what, if anything, can be done + # Need better understanding of fragment-based tracking + # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ + + return fragment_str + + +def clean_url(dirty_url): + """ + Given a URL, return it with the UTM parameters removed from query and fragment + :param dirty_url: url to be cleaned + :return: url cleaned + >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') + 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' + """ + + url_parsed = urlparse(dirty_url, allow_fragments=False) + + cleaned_url = urlunparse([ + url_parsed.scheme, + url_parsed.netloc, + url_parsed.path, + url_parsed.params, + _remove_tracker_params(url_parsed.query), + _remove_trackers_fragment(url_parsed.fragment) + ]) + + return cleaned_url + +def main(): + # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok' + # url = "https://docs.helix-editor.com/keymap.html#movement" + # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title" + # 
url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim" + url = "https://shorturl.at/qwP38" + print('Orig: ' + url) + direct_url = deredir_url(url) + print('dir : ' + direct_url) + print('to : ' + clean_url(direct_url)) + +if __name__=="__main__": + main() diff --git a/twoot.py b/twoot.py index f98d9cb..03892a5 100755 --- a/twoot.py +++ b/twoot.py @@ -178,6 +178,7 @@ def process_media_body(tt_iter, remove_redir, remove_trackers): Receives an iterator over all the elements contained in the tweet-text container. Processes them to make them suitable for posting on Mastodon :param tt_iter: iterator over the HTML elements in the text of the tweet + :param remove_redir: bool to indicate if redirections should be removed :param remove_trackers: bool to indicate if trackers should be removed :return: cleaned up text of the tweet """ From 89dc01a97e04e4f61599cc3a83056b8df983f3f0 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 11:50:19 +0100 Subject: [PATCH 10/13] Removed wrong gile --- test.py | 99 --------------------------------------------------------- 1 file changed, 99 deletions(-) delete mode 100755 test.py diff --git a/test.py b/test.py deleted file mode 100755 index 5b4a630..0000000 --- a/test.py +++ /dev/null @@ -1,99 +0,0 @@ -#! 
/usr/bin/env python3 -# -*- coding: utf-8 -*- - -from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse -import requests - -def deredir_url(url): - """ - Given a URL, return the URL that the page really downloads from - :param url: url to be de-redirected - :return: direct url - """ - - ret = None - try: - # Download the page - ret = requests.get(url, timeout=5) - except: - # If anything goes wrong keep the URL intact - return url - - # Return the URL that the page was downloaded from - return ret.url - -def _remove_tracker_params(query_str): - """ - private function - Given a query string from a URL, strip out the known trackers - :param query_str: query to be cleaned - :return: query cleaned - """ - # Avalaible URL tracking parameters : - # UTM tags by Google Ads, M$ Ads, ... - # tag by TikTok - # tags by Snapchat - # tags by Facebook - params_to_remove = [ - "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", - "mkt_tok", - "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", - "media", "interest_group_name", - "xtor" - ] - query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) - query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] - return urlencode(query_cleaned, safe='#', doseq=True) - - -def _remove_trackers_fragment(fragment_str): - """ - private function - Given a fragment string from a URL, strip out the known trackers - :param query_str: fragment to be cleaned - :return: cleaned fragment - """ - - # Not implemented - # Unclear what, if anything, can be done - # Need better understanding of fragment-based tracking - # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ - - return fragment_str - - -def clean_url(dirty_url): - """ - Given a URL, return it with the UTM parameters removed from query and fragment - :param dirty_url: url to be cleaned - :return: url cleaned - >>> 
clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') - 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' - """ - - url_parsed = urlparse(dirty_url, allow_fragments=False) - - cleaned_url = urlunparse([ - url_parsed.scheme, - url_parsed.netloc, - url_parsed.path, - url_parsed.params, - _remove_tracker_params(url_parsed.query), - _remove_trackers_fragment(url_parsed.fragment) - ]) - - return cleaned_url - -def main(): - # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok' - # url = "https://docs.helix-editor.com/keymap.html#movement" - # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title" - # url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim" - url = "https://shorturl.at/qwP38" - print('Orig: ' + url) - direct_url = deredir_url(url) - print('dir : ' + direct_url) - print('to : ' + clean_url(direct_url)) - -if __name__=="__main__": - main() From 50e961b70f04befdf8d80783bb94763259974d25 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 11:51:32 +0100 Subject: [PATCH 11/13] Removed wrong file --- default.toml | 37 ------------------------------------- 1 file changed, 37 deletions(-) delete mode 100644 default.toml diff --git a/default.toml b/default.toml deleted file mode 100644 index 2b7d373..0000000 --- a/default.toml +++ /dev/null @@ -1,37 +0,0 @@ -[config] -# twitter account name without '@' -twitter_account = "SuperDuper" - -# Domain name of Mastodon instance -mastodon_instance = "botsin.space" - -# Mastodon username -mastodon_user = "superduperbot" - -[options] -# Download videos from twitter and upload them on Mastodon -upload_videos = false - -# Also post the "reply-to" tweets from twitter account -post_reply_to 
= false - -# Do not post the retweets of other twitter accounts -skip_retweets = false - -# Clean up URLs in tweets to remove trackers (UNIMPLEMENTED) -remove_trackers_from_URL = false - -# Rewrite URLs to use invidious instance instead of youtube (UNIMPLEMENTED) -substitute_invidious = false - -# Rewrite URLs to use nitter instance instead of twitter (UNIMPLEMENTED) -substitute_nitter = false - -# Maximum age of tweet to post (in days, decimal values accepted) -tweet_max_age = 1 - -# Minimum age of tweet before posting (in minutes) -tweet_delay = 15 - -# Maximum number of toots to post in this run -twoot_cap = 1 From 2c4d6bd7e00c621f707b099e471b13f448666128 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 21:46:05 +0100 Subject: [PATCH 12/13] Last updates before release --- CHANGELOG.md | 10 ++++++++-- README.md | 9 +++++---- twoot.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7bf795..0e1c48e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,14 @@ -**XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to +**23 NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove +redirection from links included in tweets. Obfuscated links are replaced +by the URL that the resource is directly downloaded from. Also improved +tracker removal by cleaning URL fragments as well (contrib: mathdatech, +thanks!). + +**22 NOV 2022** VERSION 2.4 Added command-line option (`-u`) to remove tracking parameters from URLs included in tweets. A tracking URL is a normal URL with parameters attached to it. These parameters are used by marketing companies to identify the source of a click and the effectiveness -of a communication campaign. +of a communication campaign (contrib: mathdatech, thanks!). **15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to skip retweets. 
With this option, retweets will be ignored and not posted diff --git a/README.md b/README.md index 304f428..6c5a027 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ # Twoot -Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. -It is simple to set-up on a local machine, configurable and feature-rich. +**Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. +It is simple to set-up on a local machine, configurable and feature-rich.** -**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection +**UPDATE 23 NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection from links included in tweets. Obfuscated links are replaced by the URL that the resource -is directly downloaded from. +is directly downloaded from. Also improved tracker removal by cleaning URL fragments as well +(contrib: mathdatech, thanks!). > Previous updates can be found in CHANGELOG. diff --git a/twoot.py b/twoot.py index 03892a5..618af8e 100755 --- a/twoot.py +++ b/twoot.py @@ -40,7 +40,7 @@ MAX_REC_COUNT = 50 # Set the desired verbosity of logging # One of logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL -LOGGING_LEVEL = logging.DEBUG +LOGGING_LEVEL = logging.INFO # How many seconds to wait before giving up on a download (except video download) HTTPS_REQ_TIMEOUT = 10 From 91ffbde963f795d32413d04ca59fd8c0a48e1976 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 21:51:57 +0100 Subject: [PATCH 13/13] Last minute thought --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 6c5a027..6844689 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,9 @@ resource is directly dowmnloaded from (if applicable). e.g. bit.ly/xxyyyzz -> ex Every link visit can take up to 5 sec (timeout) therefore this option will slow down tweet processing. 
+If you are interested in tracker removal (`-u`) you should also select redirection removal (`-l`) +as trackers are often hidden behind the redirection of a short URL. + When using the `-v` switch consider: * whether the copyright of the content that you want to cross-post allows it