mirror of
https://gitlab.com/jeancf/twoot.git
synced 2025-02-23 08:38:30 +00:00
Compare commits
No commits in common. "50e961b70f04befdf8d80783bb94763259974d25" and "f0b5ee98d2d1b1316d7a3d76bc7429fb07229497" have entirely different histories.
50e961b70f
...
f0b5ee98d2
37
default.toml
Normal file
37
default.toml
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
[config]
|
||||||
|
# Twitter account name, without the leading '@'
|
||||||
|
twitter_account = "SuperDuper"
|
||||||
|
|
||||||
|
# Domain name of Mastodon instance
|
||||||
|
mastodon_instance = "botsin.space"
|
||||||
|
|
||||||
|
# Mastodon username
|
||||||
|
mastodon_user = "superduperbot"
|
||||||
|
|
||||||
|
[options]
|
||||||
|
# Download videos from twitter and upload them on Mastodon
|
||||||
|
upload_videos = false
|
||||||
|
|
||||||
|
# Also post the "reply-to" tweets from the twitter account
|
||||||
|
post_reply_to = false
|
||||||
|
|
||||||
|
# Do not post the retweets of other twitter accounts
|
||||||
|
skip_retweets = false
|
||||||
|
|
||||||
|
# Clean up URLs in tweets to remove trackers (UNIMPLEMENTED)
|
||||||
|
remove_trackers_from_URL = false
|
||||||
|
|
||||||
|
# Rewrite URLs to use invidious instance instead of youtube (UNIMPLEMENTED)
|
||||||
|
substitute_invidious = false
|
||||||
|
|
||||||
|
# Rewrite URLs to use nitter instance instead of twitter (UNIMPLEMENTED)
|
||||||
|
substitute_nitter = false
|
||||||
|
|
||||||
|
# Maximum age of tweet to post (in days, decimal values accepted)
|
||||||
|
tweet_max_age = 1
|
||||||
|
|
||||||
|
# Minimum age of tweet before posting (in minutes)
|
||||||
|
tweet_delay = 15
|
||||||
|
|
||||||
|
# Maximum number of toots to post in this run
|
||||||
|
twoot_cap = 1
|
99
test.py
Executable file
99
test.py
Executable file
|
@ -0,0 +1,99 @@
|
||||||
|
#! /usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
|
||||||
|
import requests
|
||||||
|
|
||||||
|
def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from

    The page is fetched with requests.get() using a 5 second timeout and
    the URL reported by the resulting response object is returned.

    :param url: url to be de-redirected
    :return: direct url, or the original url unchanged if the download failed
    """
    try:
        # Download the page
        response = requests.get(url, timeout=5)
    except Exception:
        # Best effort: if anything goes wrong keep the URL intact.
        # Catching Exception instead of using a bare "except:" lets
        # KeyboardInterrupt and SystemExit propagate, so the script can
        # still be interrupted while a download is pending.
        return url

    # Return the URL that the page was downloaded from
    return response.url
|
||||||
|
|
||||||
|
def _remove_tracker_params(query_str):
|
||||||
|
"""
|
||||||
|
private function
|
||||||
|
Given a query string from a URL, strip out the known trackers
|
||||||
|
:param query_str: query to be cleaned
|
||||||
|
:return: query cleaned
|
||||||
|
"""
|
||||||
|
# Avalaible URL tracking parameters :
|
||||||
|
# UTM tags by Google Ads, M$ Ads, ...
|
||||||
|
# tag by TikTok
|
||||||
|
# tags by Snapchat
|
||||||
|
# tags by Facebook
|
||||||
|
params_to_remove = [
|
||||||
|
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
|
||||||
|
"mkt_tok",
|
||||||
|
"campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
|
||||||
|
"media", "interest_group_name",
|
||||||
|
"xtor"
|
||||||
|
]
|
||||||
|
query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
|
||||||
|
query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove]
|
||||||
|
return urlencode(query_cleaned, safe='#', doseq=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_trackers_fragment(fragment_str):
|
||||||
|
"""
|
||||||
|
private function
|
||||||
|
Given a fragment string from a URL, strip out the known trackers
|
||||||
|
:param query_str: fragment to be cleaned
|
||||||
|
:return: cleaned fragment
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Not implemented
|
||||||
|
# Unclear what, if anything, can be done
|
||||||
|
# Need better understanding of fragment-based tracking
|
||||||
|
# https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/
|
||||||
|
|
||||||
|
return fragment_str
|
||||||
|
|
||||||
|
|
||||||
|
def clean_url(dirty_url):
    """
    Given a URL, return it with the known tracking parameters removed

    The URL is split into its components; the query goes through
    _remove_tracker_params() and the fragment goes through
    _remove_trackers_fragment(). All other components are kept as they are.

    :param dirty_url: url to be cleaned
    :return: url cleaned

    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tok=tik&mkt_tik=tok'
    """

    # Parse with fragments enabled (the default). With allow_fragments=False
    # the "#..." part is folded into the query, where re-encoding the query
    # percent-escapes characters such as '=' inside the fragment and the
    # fragment cleaner only ever receives an empty string.
    url_parsed = urlparse(dirty_url)

    cleaned_url = urlunparse([
        url_parsed.scheme,
        url_parsed.netloc,
        url_parsed.path,
        url_parsed.params,
        _remove_tracker_params(url_parsed.query),
        _remove_trackers_fragment(url_parsed.fragment),
    ])

    return cleaned_url
|
||||||
|
|
||||||
|
def main():
    """
    Manual smoke test: de-redirect one sample URL, clean it, and print
    the original, the direct and the cleaned URL.
    """
    # Other sample URLs that can be swapped in for manual testing:
    #   'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok'
    #   "https://docs.helix-editor.com/keymap.html#movement"
    #   "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title"
    #   "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim"
    url = "https://shorturl.at/qwP38"
    print(f'Orig: {url}')

    # Resolve the URL first, then strip the trackers from the result
    direct_url = deredir_url(url)
    print(f'dir : {direct_url}')

    cleaned = clean_url(direct_url)
    print(f'to : {cleaned}')
|
||||||
|
|
||||||
|
# Run the manual smoke test only when executed as a script, not on import
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user