From 26b06198809e1336c9c0b7f1b07b75580fcef521 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 11:55:06 +0100 Subject: [PATCH 01/23] added command-line option --- twoot.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/twoot.py b/twoot.py index 1284dc4..67784bf 100644 --- a/twoot.py +++ b/twoot.py @@ -96,8 +96,8 @@ def clean_url(dirty_url): Given a URL, return it with the UTM parameters removed from query and fragment :param dirty_url: url to be cleaned :return: url cleaned - >>> clean_url('https://exemple.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') - 'https://exemple.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' + >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') + 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' """ url_parsed = urlparse(dirty_url) @@ -114,11 +114,12 @@ def clean_url(dirty_url): return cleaned_url -def process_media_body(tt_iter): +def process_media_body(tt_iter, remove_trackers): """ Receives an iterator over all the elements contained in the tweet-text container. 
Processes them to make them suitable for posting on Mastodon :param tt_iter: iterator over the HTML elements in the text of the tweet + :param remove_trackers: bool to indicate if trackers should be removed :return: cleaned up text of the tweet """ tweet_text = '' @@ -139,7 +140,10 @@ def process_media_body(tt_iter): tweet_text += tag_text else: # This is a real link, keep url - tweet_text += clean_url(tag.get('href')) + if remove_trackers: + tweet_text += clean_url(tag.get('href')) + else: + tweet_text += tag.get('href') else: logging.warning("No handler for tag in twitter text: " + tag.prettify()) @@ -319,6 +323,7 @@ def main(argv): parser.add_argument('-p', metavar='', action='store', required=True) parser.add_argument('-r', action='store_true', help='Also post replies to other tweets') parser.add_argument('-s', action='store_true', help='Suppress retweets') + parser.add_argument('-u', action='store_true', help='Remove trackers from URLs') parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance') parser.add_argument('-a', metavar='', action='store', type=float, default=1) parser.add_argument('-d', metavar='', action='store', type=float, default=0) @@ -333,6 +338,7 @@ def main(argv): mast_password = args['p'] tweets_and_replies = args['r'] suppress_retweets = args['s'] + remove_trackers = args['u'] get_vids = args['v'] max_age = float(args['a']) min_delay = float(args['d']) @@ -358,6 +364,7 @@ def main(argv): logging.info(' -m ' + mast_account) logging.info(' -r ' + str(tweets_and_replies)) logging.info(' -s ' + str(suppress_retweets)) + logging.info(' -u ' + str(remove_trackers)) logging.info(' -v ' + str(get_vids)) logging.info(' -a ' + str(max_age)) logging.info(' -d ' + str(min_delay)) From e2eff0445c890ca21b97e7d6ab9ceb2b9217b3b6 Mon Sep 17 00:00:00 2001 From: BuildTools Date: Fri, 18 Nov 2022 12:07:02 +0100 Subject: [PATCH 02/23] Changed mode of twoot.py --- twoot.py | 0 1 file changed, 0 insertions(+), 0 
deletions(-) mode change 100644 => 100755 twoot.py diff --git a/twoot.py b/twoot.py old mode 100644 new mode 100755 From 2a736de0c79d007db8b3c2e9b127001aa7a4988c Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 12:17:29 +0100 Subject: [PATCH 03/23] Replaced poor performing nitter instances --- twoot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/twoot.py b/twoot.py index 67784bf..2aabe8c 100755 --- a/twoot.py +++ b/twoot.py @@ -51,8 +51,8 @@ NITTER_URLS = [ 'https://nitter.fdn.fr', 'https://nitter.eu', 'https://nitter.namazso.eu', - 'https://nitter.moomoo.me', - 'https://n.ramle.be', + 'https://n.l5.ca', + 'https://nitter.nl', ] # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/ From 203e90dcd425d9aee1ad5a825dcb3e670e739f84 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 12:57:44 +0100 Subject: [PATCH 04/23] Added debug messager to clean_url() --- twoot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/twoot.py b/twoot.py index 2aabe8c..aef7954 100755 --- a/twoot.py +++ b/twoot.py @@ -111,6 +111,8 @@ def clean_url(dirty_url): _remove_tracker_params(url_parsed.fragment) ]) + logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url) + return cleaned_url From 9b1f4c9ceeedc7017cfdf36c189ecc9cf5a7e176 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 13:04:30 +0100 Subject: [PATCH 05/23] Swapped another nitter instance --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index aef7954..1b57be7 100755 --- a/twoot.py +++ b/twoot.py @@ -46,7 +46,7 @@ LOGGING_LEVEL = logging.DEBUG HTTPS_REQ_TIMEOUT = 10 NITTER_URLS = [ - 'https://nitter.42l.fr', + 'https://nitter.lacontrevoie.fr', 'https://nitter.pussthecat.org', 'https://nitter.fdn.fr', 'https://nitter.eu', From 37a4419ea68b6d12517b4c798511d6c556a24f80 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 13:32:16 +0100 Subject: [PATCH 06/23] Added missing parameter 
to process_media_body() --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 1b57be7..bb74357 100755 --- a/twoot.py +++ b/twoot.py @@ -522,7 +522,7 @@ def main(argv): tt_iter = status.find('div', class_='tweet-content media-body').children # Process text of tweet - tweet_text += process_media_body(tt_iter) + tweet_text += process_media_body(tt_iter, remove_trackers) # Process quote: append link to tweet_text quote_div = status.find('a', class_='quote-link') From 6308fdc34899b1e20752b34aa3fd2a1bc1e3cfbd Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 13:56:22 +0100 Subject: [PATCH 07/23] Reduced debug logging to essential in clean_url() --- twoot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index bb74357..e88d2ec 100755 --- a/twoot.py +++ b/twoot.py @@ -111,7 +111,8 @@ def clean_url(dirty_url): _remove_tracker_params(url_parsed.fragment) ]) - logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url) + if cleaned_url != dirty_url: + logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url) return cleaned_url From e6e6a77d3ed5e8626f4af0e12770708e1904b968 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 13:59:34 +0100 Subject: [PATCH 08/23] Looking for better nitter instances --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index e88d2ec..61d7daf 100755 --- a/twoot.py +++ b/twoot.py @@ -52,7 +52,7 @@ NITTER_URLS = [ 'https://nitter.eu', 'https://nitter.namazso.eu', 'https://n.l5.ca', - 'https://nitter.nl', + 'https://nitter.spaceint.fr', ] # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/ From 2d0d1bc688e48edc650b766e970f60eb9a233e37 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 14:10:19 +0100 Subject: [PATCH 09/23] Updated README and CHANGELOG --- CHANGELOG.md | 6 ++++++ README.md | 8 +++++--- 2 files changed, 11 insertions(+), 3 
deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e3acee..c7bf795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +**XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to +remove tracking parameters from URLs included in tweets. A tracking URL +is a normal URL with parameters attached to it. These parameters are used +by marketing companies to identify the source of a click and the effectiveness +of a communication campaign. + **15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to skip retweets. With this option, retweets will be ignored and not posted on Mastodon. diff --git a/README.md b/README.md index a5aa8b4..22043c2 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,11 @@ Twoot is a python script that extracts tweets from a twitter feed and reposts them as toots on a Mastodon account. -**UPDATE 15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to -skip retweets. With this option, retweets will be ignored and not posted -on Mastodon. +**UPDATE XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to +remove tracking parameters from URLs included in tweets. A tracking URL +is a normal URL with parameters attached to it. These parameters are used +by marketing companies to identify the source of a click and the effectiveness +of a communication campaign. > Previous updates can be found in CHANGELOG. From 94294c67929d58722e2330af349a8eb144460d66 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Nov 2022 14:16:04 +0100 Subject: [PATCH 10/23] Updated command-line description --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 22043c2..762cc67 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Twoot -Twoot is a python script that extracts tweets from a twitter feed and -reposts them as toots on a Mastodon account. +Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. 
+It is simple to set-up on a local machine, configurable and feature-rich. **UPDATE XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to remove tracking parameters from URLs included in tweets. A tracking URL @@ -29,7 +29,7 @@ of a communication campaign. ``` twoot.py [-h] -t -i -m - -p [-r] [-s] [-v] [-a ] + -p [-r] [-s] [-u] [-v] [-a ] [-d ] [-c ] ``` @@ -47,6 +47,7 @@ is @superduperbot@botsin.space | -v | upload videos to Mastodon | *N/A* | No | | -r | Post reply-to tweets (ignored by default) | *N/A* | No | | -s | Skip retweets (posted by default) | *N/A* | No | +| -u | Remove trackers from URLs | *N/A* | No | | -a | Max. age of tweet to post (in days) | `5` | No | | -d | Min. age before posting new tweet (in minutes) | `15` | No | | -c | Max number of toots allowed to post (cap) | `1` | No | @@ -106,5 +107,5 @@ Twoot is known to be used for the following feeds (older first): ## Background I started twoot when [tootbot](https://github.com/cquest/tootbot) -stopped working. Tootbot relies on rss feeds from https://twitrss.me +stopped working. Tootbot relied on RSS feeds from https://twitrss.me that broke when Twitter refreshed their web UI in July 2019. 
From f88414bb35ea68712fd253c7f9a1eb72f16fce9c Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Nov 2022 13:12:41 +0100 Subject: [PATCH 11/23] Added _remove_tracker_fragment() --- twoot.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/twoot.py b/twoot.py index 61d7daf..b4c9c6f 100755 --- a/twoot.py +++ b/twoot.py @@ -67,7 +67,7 @@ USER_AGENTS = [ ] -def _remove_tracker_params(query_str): +def _remove_trackers_query(query_str): """ private function Given a query string from a URL, strip out the known trackers @@ -91,6 +91,22 @@ def _remove_tracker_params(query_str): return urlencode(query_cleaned, doseq=True) +def _remove_trackers_fragment(fragment_str): + """ + private function + Given a fragment string from a URL, strip out the known trackers + :param query_str: fragment to be cleaned + :return: cleaned fragment + """ + + # Not implemented + # Unclear what, if anything, can be done + # Need better understanding of fragment-based tracking + # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ + + return fragment_str + + def clean_url(dirty_url): """ Given a URL, return it with the UTM parameters removed from query and fragment @@ -107,8 +123,8 @@ def clean_url(dirty_url): url_parsed.netloc, url_parsed.path, url_parsed.params, - _remove_tracker_params(url_parsed.query), - _remove_tracker_params(url_parsed.fragment) + _remove_trackers_query(url_parsed.query), + _remove_trackers_fragment(url_parsed.fragment) ]) if cleaned_url != dirty_url: From 19eae4f2103876b3f74365b3b6a949ff11dec7f1 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 09:56:56 +0100 Subject: [PATCH 12/23] Removed unreliable nitter instance --- twoot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/twoot.py b/twoot.py index b4c9c6f..679bfa9 100755 --- a/twoot.py +++ b/twoot.py @@ -52,7 +52,6 @@ NITTER_URLS = [ 'https://nitter.eu', 'https://nitter.namazso.eu', 'https://n.l5.ca', - 'https://nitter.spaceint.fr', ] # Update from 
https://www.whatismybrowser.com/guides/the-latest-user-agent/ From 6860c53b11c6378483571943f847f6688d228807 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 10:11:02 +0100 Subject: [PATCH 13/23] Trying additional instance --- twoot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/twoot.py b/twoot.py index 679bfa9..f01e710 100755 --- a/twoot.py +++ b/twoot.py @@ -52,6 +52,7 @@ NITTER_URLS = [ 'https://nitter.eu', 'https://nitter.namazso.eu', 'https://n.l5.ca', + 'https://nitter.bus-hit.me', ] # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/ From 8930d5329f76eb97f0a3b35b4f4b06063cda7b7a Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 10:14:42 +0100 Subject: [PATCH 14/23] Updated README for release --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 762cc67..d117fda 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. It is simple to set-up on a local machine, configurable and feature-rich. -**UPDATE XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to -remove tracking parameters from URLs included in tweets. A tracking URL -is a normal URL with parameters attached to it. These parameters are used +**UPDATE 22 NOV 2022** VERSION 2.4 Added command-line option (`-u`) to +remove tracking parameters from URLs included in tweets. A tracking URL is a +normal URL with additional parameters attached to it. These parameters are used by marketing companies to identify the source of a click and the effectiveness of a communication campaign. 
From 40d14c4d5db49deb08a70376cb7bc01d159b4776 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:05:16 +0100 Subject: [PATCH 15/23] Added de-redirection of URL in tweet --- twoot.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/twoot.py b/twoot.py index f01e710..cd15e73 100755 --- a/twoot.py +++ b/twoot.py @@ -66,6 +66,33 @@ USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Vivaldi/5.4.2753.51', ] +def deredir_url(url): + """ + Given a URL, return the URL that the page really downloads from + :param url: url to be de-redirected + :return: direct url + """ + + # Get a copy of the default headers that requests would use + headers = requests.utils.default_headers() + + # Update default headers with randomly selected user agent + headers.update( + { + 'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)], + } + ) + + ret = None + try: + # Download the page + ret = requests.get(url, headers, timeout=5) + except: + # If anything goes wrong keep the URL intact + return url + + # Return the URL that the page was downloaded from + return ret.url def _remove_trackers_query(query_str): """ @@ -158,11 +185,12 @@ def process_media_body(tt_iter, remove_trackers): # Only keep hashtag text tweet_text += tag_text else: - # This is a real link, keep url + # This is a real link + url = deredir_url(tag.get('href')) if remove_trackers: - tweet_text += clean_url(tag.get('href')) + tweet_text += clean_url(url) else: - tweet_text += tag.get('href') + tweet_text += url else: logging.warning("No handler for tag in twitter text: " + tag.prettify()) @@ -426,7 +454,7 @@ def main(argv): if tweets_and_replies: url += '/with_replies' - # Download twitter page of user. 
+ # Download twitter page of user try: twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT) except requests.exceptions.ConnectionError: From 68e4918b02804e5d2a782d0e2b17d6ac0742a650 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:08:29 +0100 Subject: [PATCH 16/23] Added debug message --- twoot.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/twoot.py b/twoot.py index cd15e73..43bc435 100755 --- a/twoot.py +++ b/twoot.py @@ -91,6 +91,9 @@ def deredir_url(url): # If anything goes wrong keep the URL intact return url + if ret.url != url: + logging.debug("Removed redirection from: " + url + " to: " + ret.url) + # Return the URL that the page was downloaded from return ret.url From e11102f4a6018b8bda94d12f62332d8d87525ee7 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:33:45 +0100 Subject: [PATCH 17/23] User agent removed --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 43bc435..b987679 100755 --- a/twoot.py +++ b/twoot.py @@ -86,7 +86,7 @@ def deredir_url(url): ret = None try: # Download the page - ret = requests.get(url, headers, timeout=5) + ret = requests.get(url, timeout=5) except: # If anything goes wrong keep the URL intact return url From 9625c2128bc866cc450347fc892310731e68fef0 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 11:38:49 +0100 Subject: [PATCH 18/23] modified get request in deredir_url() --- twoot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index b987679..8fb3d5e 100755 --- a/twoot.py +++ b/twoot.py @@ -86,7 +86,7 @@ def deredir_url(url): ret = None try: # Download the page - ret = requests.get(url, timeout=5) + ret = requests.get(url, headers=headers, timeout=5) except: # If anything goes wrong keep the URL intact return url @@ -97,6 +97,7 @@ def deredir_url(url): # Return the URL that the page was downloaded from return ret.url + def _remove_trackers_query(query_str): 
""" private function From 9b5a76db60aeb8bb9af67f33e94439896ceca8df Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 12:50:34 +0100 Subject: [PATCH 19/23] updated README.md --- README.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d117fda..17700d3 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,7 @@ Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. It is simple to set-up on a local machine, configurable and feature-rich. -**UPDATE 22 NOV 2022** VERSION 2.4 Added command-line option (`-u`) to -remove tracking parameters from URLs included in tweets. A tracking URL is a -normal URL with additional parameters attached to it. These parameters are used -by marketing companies to identify the source of a click and the effectiveness -of a communication campaign. +**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) > Previous updates can be found in CHANGELOG. @@ -25,7 +21,7 @@ of a communication campaign. 
* Optionally ignore retweets * Allows rate-limiting posts to Mastodon instance -## usage +## Usage ``` twoot.py [-h] -t -i -m @@ -33,7 +29,7 @@ twoot.py [-h] -t -i -m [-d ] [-c ] ``` -## arguments +## Arguments Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account is @superduperbot@botsin.space @@ -42,16 +38,24 @@ is @superduperbot@botsin.space |-------|--------------------------------------------------|--------------------|-----| | -t | twitter account name without '@' | `SuperDuper` | Yes | | -i | Mastodon instance domain name | `botsin.space` | Yes | -| -m | Mastodon username | `superduperbot` | Yes | +| -m | Mastodon username | `sd@example.com` | Yes | | -p | Mastodon password | `my_Sup3r-S4f3*pw` | Yes | | -v | upload videos to Mastodon | *N/A* | No | | -r | Post reply-to tweets (ignored by default) | *N/A* | No | | -s | Skip retweets (posted by default) | *N/A* | No | +| -l | Remove link redirection | *N/A* | No | | -u | Remove trackers from URLs | *N/A* | No | | -a | Max. age of tweet to post (in days) | `5` | No | | -d | Min. age before posting new tweet (in minutes) | `15` | No | | -c | Max number of toots allowed to post (cap) | `1` | No | +## Notes + +`-l` will follow every link included in the tweet and replace them with the url that the +resource is directly downloaded from (if applicable). e.g. bit.ly/xxyyyzz -> example.com +Every link visit can take up to 5 sec (timeout) therefore this option will slow down +tweet processing. + When using the `-v` switch consider: * whether the copyright of the content that you want to cross-post allows it @@ -64,7 +68,8 @@ Default min delay is 0 minutes. No limitation is applied to the number of toots uploaded if `-c` is not specified. -## installation + +## Installation Make sure python3 is installed. 
From 0d1be42dcc473ac6d6f3cac6c96829bdcbd50270 Mon Sep 17 00:00:00 2001 From: jeancf Date: Tue, 22 Nov 2022 22:01:27 +0100 Subject: [PATCH 20/23] Added code to remove trackers from fragments --- twoot.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/twoot.py b/twoot.py index 8fb3d5e..15b72ee 100755 --- a/twoot.py +++ b/twoot.py @@ -110,13 +110,18 @@ def _remove_trackers_query(query_str): # tag by TikTok # tags by Snapchat # tags by Facebook - params_to_remove = [ - "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + params_to_remove = { + "gclid", "_ga", "gclsrc", "dclid", + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid", "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type", "mkt_tok", "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", - "media", "interest_group_name", - "xtor" - ] + "fbclid", "campaign_name", "ad_set_name", "ad_set_id", "media", "interest_group_name", "ad_set_id" + "igshid", + "cvid", "oicd", "msclkid", + "soc_src", "soc_trk", + "_openstat", "yclid", + "xtor", "xtref", "adid", + } query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] return urlencode(query_cleaned, doseq=True) @@ -129,12 +134,15 @@ def _remove_trackers_fragment(fragment_str): :param query_str: fragment to be cleaned :return: cleaned fragment """ - - # Not implemented - # Unclear what, if anything, can be done - # Need better understanding of fragment-based tracking - # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ + params_to_remove = { + "Echobox", + } + + if '=' in fragment_str: + fragment_str = fragment_str.split('&') + query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove] + fragment_str = '&'.join(query_cleaned) return fragment_str From 7e7fa4620ffcfd89bdca727595711a6029e90a34 Mon Sep 17 
00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 09:59:06 +0100 Subject: [PATCH 21/23] Implemented -l command-line option --- twoot.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/twoot.py b/twoot.py index 15b72ee..f98d9cb 100755 --- a/twoot.py +++ b/twoot.py @@ -66,6 +66,7 @@ USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Vivaldi/5.4.2753.51', ] + def deredir_url(url): """ Given a URL, return the URL that the page really downloads from @@ -123,7 +124,7 @@ def _remove_trackers_query(query_str): "xtor", "xtref", "adid", } query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) - query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] + query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove] return urlencode(query_cleaned, doseq=True) @@ -138,7 +139,7 @@ def _remove_trackers_fragment(fragment_str): params_to_remove = { "Echobox", } - + if '=' in fragment_str: fragment_str = fragment_str.split('&') query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove] @@ -172,7 +173,7 @@ def clean_url(dirty_url): return cleaned_url -def process_media_body(tt_iter, remove_trackers): +def process_media_body(tt_iter, remove_redir, remove_trackers): """ Receives an iterator over all the elements contained in the tweet-text container. 
Processes them to make them suitable for posting on Mastodon @@ -198,7 +199,11 @@ def process_media_body(tt_iter, remove_trackers): tweet_text += tag_text else: # This is a real link - url = deredir_url(tag.get('href')) + if remove_redir: + url = deredir_url(tag.get('href')) + else: + url = tag.get('href') + if remove_trackers: tweet_text += clean_url(url) else: @@ -382,6 +387,7 @@ def main(argv): parser.add_argument('-p', metavar='', action='store', required=True) parser.add_argument('-r', action='store_true', help='Also post replies to other tweets') parser.add_argument('-s', action='store_true', help='Suppress retweets') + parser.add_argument('-l', action='store_true', help='Remove link redirection') parser.add_argument('-u', action='store_true', help='Remove trackers from URLs') parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance') parser.add_argument('-a', metavar='', action='store', type=float, default=1) @@ -397,6 +403,7 @@ def main(argv): mast_password = args['p'] tweets_and_replies = args['r'] suppress_retweets = args['s'] + remove_redir = args['l'] remove_trackers = args['u'] get_vids = args['v'] max_age = float(args['a']) @@ -423,6 +430,7 @@ def main(argv): logging.info(' -m ' + mast_account) logging.info(' -r ' + str(tweets_and_replies)) logging.info(' -s ' + str(suppress_retweets)) + logging.info(' -l ' + str(remove_redir)) logging.info(' -u ' + str(remove_trackers)) logging.info(' -v ' + str(get_vids)) logging.info(' -a ' + str(max_age)) @@ -579,7 +587,7 @@ def main(argv): tt_iter = status.find('div', class_='tweet-content media-body').children # Process text of tweet - tweet_text += process_media_body(tt_iter, remove_trackers) + tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers) # Process quote: append link to tweet_text quote_div = status.find('a', class_='quote-link') From 3930acc93ffca633893bc8a9b3cf260ef51e7be7 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 
09:59:45 +0100 Subject: [PATCH 22/23] Updated README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 17700d3..304f428 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. It is simple to set-up on a local machine, configurable and feature-rich. -**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) +**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection +from links included in tweets. Obfuscated links are replaced by the URL that the resource +is directly downloaded from. > Previous updates can be found in CHANGELOG. From f0b5ee98d2d1b1316d7a3d76bc7429fb07229497 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 23 Nov 2022 10:50:41 +0100 Subject: [PATCH 23/23] Added missing parameter in docstring --- test.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ twoot.py | 1 + 2 files changed, 100 insertions(+) create mode 100755 test.py diff --git a/test.py b/test.py new file mode 100755 index 0000000..5b4a630 --- /dev/null +++ b/test.py @@ -0,0 +1,99 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + +from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse +import requests + +def deredir_url(url): + """ + Given a URL, return the URL that the page really downloads from + :param url: url to be de-redirected + :return: direct url + """ + + ret = None + try: + # Download the page + ret = requests.get(url, timeout=5) + except: + # If anything goes wrong keep the URL intact + return url + + # Return the URL that the page was downloaded from + return ret.url + +def _remove_tracker_params(query_str): + """ + private function + Given a query string from a URL, strip out the known trackers + :param query_str: query to be cleaned + :return: query cleaned + """ + # Avalaible URL tracking parameters : + # UTM tags by Google Ads, M$ Ads, ... 
+ # tag by TikTok + # tags by Snapchat + # tags by Facebook + params_to_remove = [ + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "mkt_tok", + "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", + "media", "interest_group_name", + "xtor" + ] + query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) + query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] + return urlencode(query_cleaned, safe='#', doseq=True) + + +def _remove_trackers_fragment(fragment_str): + """ + private function + Given a fragment string from a URL, strip out the known trackers + :param query_str: fragment to be cleaned + :return: cleaned fragment + """ + + # Not implemented + # Unclear what, if anything, can be done + # Need better understanding of fragment-based tracking + # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ + + return fragment_str + + +def clean_url(dirty_url): + """ + Given a URL, return it with the UTM parameters removed from query and fragment + :param dirty_url: url to be cleaned + :return: url cleaned + >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') + 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' + """ + + url_parsed = urlparse(dirty_url, allow_fragments=False) + + cleaned_url = urlunparse([ + url_parsed.scheme, + url_parsed.netloc, + url_parsed.path, + url_parsed.params, + _remove_tracker_params(url_parsed.query), + _remove_trackers_fragment(url_parsed.fragment) + ]) + + return cleaned_url + +def main(): + # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok' + # url = "https://docs.helix-editor.com/keymap.html#movement" + # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title" + # 
url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim" + url = "https://shorturl.at/qwP38" + print('Orig: ' + url) + direct_url = deredir_url(url) + print('dir : ' + direct_url) + print('to : ' + clean_url(direct_url)) + +if __name__=="__main__": + main() diff --git a/twoot.py b/twoot.py index f98d9cb..03892a5 100755 --- a/twoot.py +++ b/twoot.py @@ -178,6 +178,7 @@ def process_media_body(tt_iter, remove_redir, remove_trackers): Receives an iterator over all the elements contained in the tweet-text container. Processes them to make them suitable for posting on Mastodon :param tt_iter: iterator over the HTML elements in the text of the tweet + :param remove_redir: bool to indicate if redirections should be removed :param remove_trackers: bool to indicate if trackers should be removed :return: cleaned up text of the tweet """