diff --git a/test.py b/test.py new file mode 100755 index 0000000..5b4a630 --- /dev/null +++ b/test.py @@ -0,0 +1,99 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + +from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse +import requests + +def deredir_url(url): + """ + Given a URL, return the URL that the page really downloads from + :param url: url to be de-redirected + :return: direct url + """ + + ret = None + try: + # Download the page + ret = requests.get(url, timeout=5) + except: + # If anything goes wrong keep the URL intact + return url + + # Return the URL that the page was downloaded from + return ret.url + +def _remove_tracker_params(query_str): + """ + private function + Given a query string from a URL, strip out the known trackers + :param query_str: query to be cleaned + :return: query cleaned + """ + # Avalaible URL tracking parameters : + # UTM tags by Google Ads, M$ Ads, ... + # tag by TikTok + # tags by Snapchat + # tags by Facebook + params_to_remove = [ + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "mkt_tok", + "campaign_name", "ad_set_name", "campaign_id", "ad_set_id", + "media", "interest_group_name", + "xtor" + ] + query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True)) + query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove] + return urlencode(query_cleaned, safe='#', doseq=True) + + +def _remove_trackers_fragment(fragment_str): + """ + private function + Given a fragment string from a URL, strip out the known trackers + :param query_str: fragment to be cleaned + :return: cleaned fragment + """ + + # Not implemented + # Unclear what, if anything, can be done + # Need better understanding of fragment-based tracking + # https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/ + + return fragment_str + + +def clean_url(dirty_url): + """ + Given a URL, return it with the UTM parameters removed from query and fragment + :param dirty_url: url to be cleaned + :return: url cleaned + >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok') + 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' + """ + + url_parsed = urlparse(dirty_url, allow_fragments=False) + + cleaned_url = urlunparse([ + url_parsed.scheme, + url_parsed.netloc, + url_parsed.path, + url_parsed.params, + _remove_tracker_params(url_parsed.query), + _remove_trackers_fragment(url_parsed.fragment) + ]) + + return cleaned_url + +def main(): + # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok' + # url = "https://docs.helix-editor.com/keymap.html#movement" + # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title" + # url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim" + url = "https://shorturl.at/qwP38" + print('Orig: ' + url) + direct_url = deredir_url(url) + print('dir : ' + direct_url) + print('to : ' + clean_url(direct_url)) + +if __name__=="__main__": + main() diff --git a/twoot.py b/twoot.py index f98d9cb..03892a5 100755 --- a/twoot.py +++ b/twoot.py @@ -178,6 +178,7 @@ def process_media_body(tt_iter, remove_redir, remove_trackers): Receives an iterator over all the elements contained in the tweet-text container. Processes them to make them suitable for posting on Mastodon :param tt_iter: iterator over the HTML elements in the text of the tweet + :param remove_redir: bool to indicate if redirections should be removed :param remove_trackers: bool to indicate if trackers should be removed :return: cleaned up text of the tweet """