diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e3acee..c7bf795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +**XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to +remove tracking parameters from URLs included in tweets. A tracking URL +is a normal URL with parameters attached to it. These parameters are used +by marketing companies to identify the source of a click and the effectiveness +of a communication campaign. + **15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to skip retweets. With this option, retweets will be ignored and not posted on Mastodon. diff --git a/README.md b/README.md index a5aa8b4..304f428 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # Twoot -Twoot is a python script that extracts tweets from a twitter feed and -reposts them as toots on a Mastodon account. +Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account. +It is simple to set-up on a local machine, configurable and feature-rich. -**UPDATE 15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to -skip retweets. With this option, retweets will be ignored and not posted -on Mastodon. +**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection +from links included in tweets. Obfuscated links are replaced by the URL that the resource +is directly downloaded from. > Previous updates can be found in CHANGELOG. @@ -23,15 +23,15 @@ on Mastodon. 
* Optionally ignore retweets * Allows rate-limiting posts to Mastodon instance -## usage +## Usage ``` twoot.py [-h] -t -i -m - -p [-r] [-s] [-v] [-a ] + -p [-r] [-s] [-u] [-v] [-a ] [-d ] [-c ] ``` -## arguments +## Arguments Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account is @superduperbot@botsin.space @@ -40,15 +40,24 @@ is @superduperbot@botsin.space |-------|--------------------------------------------------|--------------------|-----| | -t | twitter account name without '@' | `SuperDuper` | Yes | | -i | Mastodon instance domain name | `botsin.space` | Yes | -| -m | Mastodon username | `superduperbot` | Yes | +| -m | Mastodon username | `sd@example.com` | Yes | | -p | Mastodon password | `my_Sup3r-S4f3*pw` | Yes | | -v | upload videos to Mastodon | *N/A* | No | | -r | Post reply-to tweets (ignored by default) | *N/A* | No | | -s | Skip retweets (posted by default) | *N/A* | No | +| -l | Remove link redirection | *N/A* | No | +| -u | Remove trackers from URLs | *N/A* | No | | -a | Max. age of tweet to post (in days) | `5` | No | | -d | Min. age before posting new tweet (in minutes) | `15` | No | | -c | Max number of toots allowed to post (cap) | `1` | No | +## Notes + +`-l` will follow every link included in the tweet and replace them with the url that the +resource is directly downloaded from (if applicable). e.g. bit.ly/xxyyyzz -> example.com +Every link visit can take up to 5 sec (timeout) therefore this option will slow down +tweet processing. + When using the `-v` switch consider: * whether the copyright of the content that you want to cross-post allows it @@ -61,7 +70,8 @@ Default min delay is 0 minutes. No limitation is applied to the number of toots uploaded if `-c` is not specified. -## installation + +## Installation Make sure python3 is installed. 
@@ -104,5 +114,5 @@ Twoot is known to be used for the following feeds (older first): ## Background I started twoot when [tootbot](https://github.com/cquest/tootbot) -stopped working. Tootbot relies on rss feeds from https://twitrss.me +stopped working. Tootbot relied on RSS feeds from https://twitrss.me that broke when Twitter refreshed their web UI in July 2019. diff --git a/test.py b/test.py new file mode 100755 index 0000000..5b4a630 --- /dev/null +++ b/test.py @@ -0,0 +1,99 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + +from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse +import requests + +def deredir_url(url): + """ + Given a URL, return the URL that the page really downloads from + :param url: url to be de-redirected + :return: direct url + """ + + ret = None + try: + # Download the page + ret = requests.get(url, timeout=5) + except: + # If anything goes wrong keep the URL intact + return url + + # Return the URL that the page was downloaded from + return ret.url + +def _remove_tracker_params(query_str): + """ + private function + Given a query string from a URL, strip out the known trackers + :param query_str: query to be cleaned + :return: query cleaned + """ + # Available URL tracking parameters : + # UTM tags by Google Ads, M$ Ads, ... 
+ # tag by TikTok + # tags by Snapchat + # tags by Facebook + params_to_remove = [ + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "mkt_tok", + "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", + "media", "interest_group_name", + "xtor" + ] + query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) + query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] + return urlencode(query_cleaned, safe='#', doseq=True) + + +def _remove_trackers_fragment(fragment_str): + """ + private function + Given a fragment string from a URL, strip out the known trackers + :param query_str: fragment to be cleaned + :return: cleaned fragment + """ + + # Not implemented + # Unclear what, if anything, can be done + # Need better understanding of fragment-based tracking + # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ + + return fragment_str + + +def clean_url(dirty_url): + """ + Given a URL, return it with the UTM parameters removed from query and fragment + :param dirty_url: url to be cleaned + :return: url cleaned + >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') + 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' + """ + + url_parsed = urlparse(dirty_url, allow_fragments=False) + + cleaned_url = urlunparse([ + url_parsed.scheme, + url_parsed.netloc, + url_parsed.path, + url_parsed.params, + _remove_tracker_params(url_parsed.query), + _remove_trackers_fragment(url_parsed.fragment) + ]) + + return cleaned_url + +def main(): + # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok' + # url = "https://docs.helix-editor.com/keymap.html#movement" + # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title" + # 
url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim" + url = "https://shorturl.at/qwP38" + print('Orig: ' + url) + direct_url = deredir_url(url) + print('dir : ' + direct_url) + print('to : ' + clean_url(direct_url)) + +if __name__=="__main__": + main() diff --git a/twoot.py b/twoot.py index e124d72..1afbc39 100755 --- a/twoot.py +++ b/twoot.py @@ -46,13 +46,13 @@ LOGGING_LEVEL = logging.DEBUG HTTPS_REQ_TIMEOUT = 10 NITTER_URLS = [ - 'https://nitter.42l.fr', + 'https://nitter.lacontrevoie.fr', 'https://nitter.pussthecat.org', 'https://nitter.fdn.fr', 'https://nitter.eu', 'https://nitter.namazso.eu', - 'https://nitter.moomoo.me', - 'https://n.ramle.be', + 'https://n.l5.ca', + 'https://nitter.bus-hit.me', ] # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/ @@ -67,7 +67,39 @@ USER_AGENTS = [ ] -def _remove_tracker_params(query_str): +def deredir_url(url): + """ + Given a URL, return the URL that the page really downloads from + :param url: url to be de-redirected + :return: direct url + """ + + # Get a copy of the default headers that requests would use + headers = requests.utils.default_headers() + + # Update default headers with randomly selected user agent + headers.update( + { + 'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)], + } + ) + + ret = None + try: + # Download the page + ret = requests.get(url, headers=headers, timeout=5) + except: + # If anything goes wrong keep the URL intact + return url + + if ret.url != url: + logging.debug("Removed redirection from: " + url + " to: " + ret.url) + + # Return the URL that the page was downloaded from + return ret.url + + +def _remove_trackers_query(query_str): """ private function Given a query string from a URL, strip out the known trackers @@ -79,25 +111,49 @@ def _remove_tracker_params(query_str): # tag by TikTok # tags by Snapchat # tags by Facebook - params_to_remove = [ - "utm_source", "utm_medium", 
"utm_campaign", "utm_term", "utm_content", + params_to_remove = { + "gclid", "_ga", "gclsrc", "dclid", + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid", "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type", + "mkt_tok", + "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", + "fbclid", "campaign_name", "ad_set_name", "ad_set_id", "media", "interest_group_name", "ad_set_id", + "igshid", + "cvid", "oicd", "msclkid", + "soc_src", "soc_trk", + "_openstat", "yclid", + "xtor", "xtref", "adid", + } query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) - query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] + query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove] return urlencode(query_cleaned, doseq=True) +def _remove_trackers_fragment(fragment_str): + """ + private function + Given a fragment string from a URL, strip out the known trackers + :param query_str: fragment to be cleaned + :return: cleaned fragment + """ + + params_to_remove = { + "Echobox", + } + + if '=' in fragment_str: + fragment_str = fragment_str.split('&') + query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove] + fragment_str = '&'.join(query_cleaned) + return fragment_str + + def clean_url(dirty_url): + """ + Given a URL, return it with the UTM parameters removed from query and fragment + :param dirty_url: url to be cleaned + :return: url cleaned - >>> clean_url('https://exemple.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') - 'https://exemple.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' + >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') + 
'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' """ url_parsed = urlparse(dirty_url) @@ -107,18 +163,23 @@ def clean_url(dirty_url): url_parsed.netloc, url_parsed.path, url_parsed.params, - _remove_tracker_params(url_parsed.query), - _remove_tracker_params(url_parsed.fragment) + _remove_trackers_query(url_parsed.query), + _remove_trackers_fragment(url_parsed.fragment) ]) + if cleaned_url != dirty_url: + logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url) + return cleaned_url -def process_media_body(tt_iter): +def process_media_body(tt_iter, remove_redir, remove_trackers): """ Receives an iterator over all the elements contained in the tweet-text container. Processes them to make them suitable for posting on Mastodon :param tt_iter: iterator over the HTML elements in the text of the tweet + :param remove_redir: bool to indicate if redirections should be removed + :param remove_trackers: bool to indicate if trackers should be removed :return: cleaned up text of the tweet """ tweet_text = '' @@ -138,8 +199,16 @@ def process_media_body(tt_iter): # Only keep hashtag text tweet_text += tag_text else: - # This is a real link, keep url - tweet_text += clean_url(tag.get('href')) + # This is a real link + if remove_redir: + url = deredir_url(tag.get('href')) + else: + url = tag.get('href') + + if remove_trackers: + tweet_text += clean_url(url) + else: + tweet_text += url else: logging.warning("No handler for tag in twitter text: " + tag.prettify()) @@ -319,7 +388,8 @@ def main(argv): parser.add_argument('-m', metavar='', action='store') parser.add_argument('-p', metavar='', action='store') parser.add_argument('-r', action='store_true', help='Also post replies to other tweets') - parser.add_argument('-s', action='store_true', help='Skip retweets') + parser.add_argument('-s', action='store_true', help='Suppress retweets') + parser.add_argument('-l', action='store_true', help='Remove link redirection') parser.add_argument('-u', 
action='store_true', help='Remove trackers from URLs') parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance') parser.add_argument('-a', metavar='', action='store', type=float) @@ -463,7 +533,7 @@ def main(argv): if toml['options']['post_reply_to']: url += '/with_replies' - # Download twitter page of user. + # Download twitter page of user try: twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT) except requests.exceptions.ConnectionError: @@ -576,7 +646,7 @@ def main(argv): tt_iter = status.find('div', class_='tweet-content media-body').children # Process text of tweet - tweet_text += process_media_body(tt_iter) + tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers) # Process quote: append link to tweet_text quote_div = status.find('a', class_='quote-link')