mirror of
https://gitlab.com/jeancf/twoot.git
synced 2025-02-26 09:58:43 +00:00
Merge branch 'clean_url' into cfg_file
# Conflicts: # twoot.py
This commit is contained in:
commit
0b58df16e2
@ -1,3 +1,9 @@
|
|||||||
|
**XX NOV 2022** VERSION 2.4 Added command-line option (`-u`) to
|
||||||
|
remove tracking parameters from URLs included in tweets. A tracking URL
|
||||||
|
is a normal URL with parameters attached to it. These parameters are used
|
||||||
|
by marketing companies to identify the source of a click and the effectiveness
|
||||||
|
of a communication campaign.
|
||||||
|
|
||||||
**15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to
|
**15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to
|
||||||
skip retweets. With this option, retweets will be ignored and not posted
|
skip retweets. With this option, retweets will be ignored and not posted
|
||||||
on Mastodon.
|
on Mastodon.
|
||||||
|
32
README.md
32
README.md
@ -1,11 +1,11 @@
|
|||||||
# Twoot
|
# Twoot
|
||||||
|
|
||||||
Twoot is a python script that extracts tweets from a twitter feed and
|
Twoot is a python script that mirrors tweets from a twitter account to a Mastodon account.
|
||||||
reposts them as toots on a Mastodon account.
|
It is simple to set-up on a local machine, configurable and feature-rich.
|
||||||
|
|
||||||
**UPDATE 15 NOV 2022** VERSION 2.3 Added command-line option (`-s`) to
|
**UPDATE XX NOV 2022** VERSION 2.5 Added command-line option (`-l`) to remove redirection
|
||||||
skip retweets. With this option, retweets will be ignored and not posted
|
from links included in tweets. Obfuscated links are replaced by the URL that the resource
|
||||||
on Mastodon.
|
is directly downloaded from.
|
||||||
|
|
||||||
> Previous updates can be found in CHANGELOG.
|
> Previous updates can be found in CHANGELOG.
|
||||||
|
|
||||||
@ -23,15 +23,15 @@ on Mastodon.
|
|||||||
* Optionally ignore retweets
|
* Optionally ignore retweets
|
||||||
* Allows rate-limiting posts to Mastodon instance
|
* Allows rate-limiting posts to Mastodon instance
|
||||||
|
|
||||||
## usage
|
## Usage
|
||||||
|
|
||||||
```
|
```
|
||||||
twoot.py [-h] -t <twitter account> -i <mastodon instance> -m <mastodon account>
|
twoot.py [-h] -t <twitter account> -i <mastodon instance> -m <mastodon account>
|
||||||
-p <mastodon password> [-r] [-s] [-v] [-a <max age in days)>]
|
-p <mastodon password> [-r] [-s] [-u] [-v] [-a <max age in days)>]
|
||||||
[-d <min delay (in mins)>] [-c <max # of toots to post>]
|
[-d <min delay (in mins)>] [-c <max # of toots to post>]
|
||||||
```
|
```
|
||||||
|
|
||||||
## arguments
|
## Arguments
|
||||||
|
|
||||||
Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account
|
Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account
|
||||||
is @superduperbot@botsin.space
|
is @superduperbot@botsin.space
|
||||||
@ -40,15 +40,24 @@ is @superduperbot@botsin.space
|
|||||||
|-------|--------------------------------------------------|--------------------|-----|
|
|-------|--------------------------------------------------|--------------------|-----|
|
||||||
| -t | twitter account name without '@' | `SuperDuper` | Yes |
|
| -t | twitter account name without '@' | `SuperDuper` | Yes |
|
||||||
| -i | Mastodon instance domain name | `botsin.space` | Yes |
|
| -i | Mastodon instance domain name | `botsin.space` | Yes |
|
||||||
| -m | Mastodon username | `superduperbot` | Yes |
|
| -m | Mastodon username | `sd@example.com` | Yes |
|
||||||
| -p | Mastodon password | `my_Sup3r-S4f3*pw` | Yes |
|
| -p | Mastodon password | `my_Sup3r-S4f3*pw` | Yes |
|
||||||
| -v | upload videos to Mastodon | *N/A* | No |
|
| -v | upload videos to Mastodon | *N/A* | No |
|
||||||
| -r | Post reply-to tweets (ignored by default) | *N/A* | No |
|
| -r | Post reply-to tweets (ignored by default) | *N/A* | No |
|
||||||
| -s | Skip retweets (posted by default) | *N/A* | No |
|
| -s | Skip retweets (posted by default) | *N/A* | No |
|
||||||
|
| -l | Remove link redirection | *N/A* | No |
|
||||||
|
| -u | Remove trackers from URLs | *N/A* | No |
|
||||||
| -a | Max. age of tweet to post (in days) | `5` | No |
|
| -a | Max. age of tweet to post (in days) | `5` | No |
|
||||||
| -d | Min. age before posting new tweet (in minutes) | `15` | No |
|
| -d | Min. age before posting new tweet (in minutes) | `15` | No |
|
||||||
| -c | Max number of toots allowed to post (cap) | `1` | No |
|
| -c | Max number of toots allowed to post (cap) | `1` | No |
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
`-l` will follow every link included in the tweet and replace them with the url that the
|
||||||
|
resource is directly downloaded from (if applicable). e.g. bit.ly/xxyyyzz -> example.com
|
||||||
|
Every link visit can take up to 5 sec (timeout) therefore this option will slow down
|
||||||
|
tweet processing.
|
||||||
|
|
||||||
When using the `-v` switch consider:
|
When using the `-v` switch consider:
|
||||||
|
|
||||||
* whether the copyright of the content that you want to cross-post allows it
|
* whether the copyright of the content that you want to cross-post allows it
|
||||||
@ -61,7 +70,8 @@ Default min delay is 0 minutes.
|
|||||||
|
|
||||||
No limitation is applied to the number of toots uploaded if `-c` is not specified.
|
No limitation is applied to the number of toots uploaded if `-c` is not specified.
|
||||||
|
|
||||||
## installation
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
Make sure python3 is installed.
|
Make sure python3 is installed.
|
||||||
|
|
||||||
@ -104,5 +114,5 @@ Twoot is known to be used for the following feeds (older first):
|
|||||||
## Background
|
## Background
|
||||||
|
|
||||||
I started twoot when [tootbot](https://github.com/cquest/tootbot)
|
I started twoot when [tootbot](https://github.com/cquest/tootbot)
|
||||||
stopped working. Tootbot relies on rss feeds from https://twitrss.me
|
stopped working. Tootbot relied on RSS feeds from https://twitrss.me
|
||||||
that broke when Twitter refreshed their web UI in July 2019.
|
that broke when Twitter refreshed their web UI in July 2019.
|
||||||
|
99
test.py
Executable file
99
test.py
Executable file
@ -0,0 +1,99 @@
|
|||||||
|
#! /usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
|
||||||
|
import requests
|
||||||
|
|
||||||
|
def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from.

    Fetches the page and lets requests follow any HTTP redirection, then
    reports the final URL.

    :param url: url to be de-redirected
    :return: direct url, or the original url unchanged if the request fails
    """
    try:
        # Download the page (redirections are followed automatically)
        ret = requests.get(url, timeout=5)
    except requests.exceptions.RequestException:
        # If anything goes wrong with the request keep the URL intact
        return url

    # Return the URL that the page was actually downloaded from
    return ret.url
|
||||||
|
|
||||||
|
def _remove_tracker_params(query_str):
|
||||||
|
"""
|
||||||
|
private function
|
||||||
|
Given a query string from a URL, strip out the known trackers
|
||||||
|
:param query_str: query to be cleaned
|
||||||
|
:return: query cleaned
|
||||||
|
"""
|
||||||
|
# Avalaible URL tracking parameters :
|
||||||
|
# UTM tags by Google Ads, M$ Ads, ...
|
||||||
|
# tag by TikTok
|
||||||
|
# tags by Snapchat
|
||||||
|
# tags by Facebook
|
||||||
|
params_to_remove = [
|
||||||
|
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
|
||||||
|
"mkt_tok",
|
||||||
|
"campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
|
||||||
|
"media", "interest_group_name",
|
||||||
|
"xtor"
|
||||||
|
]
|
||||||
|
query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
|
||||||
|
query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove]
|
||||||
|
return urlencode(query_cleaned, safe='#', doseq=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_trackers_fragment(fragment_str):
|
||||||
|
"""
|
||||||
|
private function
|
||||||
|
Given a fragment string from a URL, strip out the known trackers
|
||||||
|
:param query_str: fragment to be cleaned
|
||||||
|
:return: cleaned fragment
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Not implemented
|
||||||
|
# Unclear what, if anything, can be done
|
||||||
|
# Need better understanding of fragment-based tracking
|
||||||
|
# https://builtvisible.com/one-weird-trick-to-avoid-utm-parameters/
|
||||||
|
|
||||||
|
return fragment_str
|
||||||
|
|
||||||
|
|
||||||
|
def clean_url(dirty_url):
    """
    Given a URL, return it with the UTM parameters removed from query and fragment.

    :param dirty_url: url to be cleaned
    :return: url cleaned

    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tok=tik&mkt_tik=tok'
    """
    # Parse WITH fragments so the fragment is cleaned separately from the
    # query. (With allow_fragments=False the '#...' part would stay inside
    # the query string and never reach _remove_trackers_fragment.)
    url_parsed = urlparse(dirty_url)

    cleaned_url = urlunparse([
        url_parsed.scheme,
        url_parsed.netloc,
        url_parsed.path,
        url_parsed.params,
        _remove_tracker_params(url_parsed.query),
        _remove_trackers_fragment(url_parsed.fragment),
    ])

    return cleaned_url
|
||||||
|
|
||||||
|
def main():
    """Manual smoke test: de-redirect a sample URL, then strip its trackers."""
    # Other sample URLs used during development:
    # url = 'https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok'
    # url = "https://docs.helix-editor.com/keymap.html#movement"
    # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7071508/#sec1-nutrients-12-00530title"
    # url = "https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title42-section12208&num=0&edition=prelim"
    url = "https://shorturl.at/qwP38"
    print('Orig: ' + url)

    # Resolve the redirection first, then clean the resulting direct URL
    resolved = deredir_url(url)
    print('dir : ' + resolved)
    print('to : ' + clean_url(resolved))


if __name__ == "__main__":
    main()
|
110
twoot.py
110
twoot.py
@ -46,13 +46,13 @@ LOGGING_LEVEL = logging.DEBUG
|
|||||||
HTTPS_REQ_TIMEOUT = 10
|
HTTPS_REQ_TIMEOUT = 10
|
||||||
|
|
||||||
# Nitter mirror instances used to scrape the twitter account page
# (instances come and go; update this list when one stops responding)
NITTER_URLS = [
    'https://nitter.lacontrevoie.fr',
    'https://nitter.pussthecat.org',
    'https://nitter.fdn.fr',
    'https://nitter.eu',
    'https://nitter.namazso.eu',
    'https://n.l5.ca',
    'https://nitter.bus-hit.me',
]
|
||||||
|
|
||||||
# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
|
# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
|
||||||
@ -67,7 +67,39 @@ USER_AGENTS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def _remove_tracker_params(query_str):
|
def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from.

    Fetches the page with a randomized User-Agent and lets requests follow
    any HTTP redirection.

    :param url: url to be de-redirected
    :return: direct url, or the original url unchanged if the request fails
    """
    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()

    # Update default headers with randomly selected user agent
    headers.update(
        {
            'User-Agent': random.choice(USER_AGENTS),
        }
    )

    try:
        # Download the page (redirections are followed automatically)
        ret = requests.get(url, headers=headers, timeout=5)
    except requests.exceptions.RequestException:
        # If anything goes wrong with the request keep the URL intact
        return url

    if ret.url != url:
        logging.debug("Removed redirection from: " + url + " to: " + ret.url)

    # Return the URL that the page was actually downloaded from
    return ret.url
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_trackers_query(query_str):
|
||||||
"""
|
"""
|
||||||
private function
|
private function
|
||||||
Given a query string from a URL, strip out the known trackers
|
Given a query string from a URL, strip out the known trackers
|
||||||
@ -79,25 +111,49 @@ def _remove_tracker_params(query_str):
|
|||||||
# tag by TikTok
|
# tag by TikTok
|
||||||
# tags by Snapchat
|
# tags by Snapchat
|
||||||
# tags by Facebook
|
# tags by Facebook
|
||||||
params_to_remove = [
|
params_to_remove = {
|
||||||
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
|
"gclid", "_ga", "gclsrc", "dclid",
|
||||||
|
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid", "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type",
|
||||||
"mkt_tok",
|
"mkt_tok",
|
||||||
"campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
|
"campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
|
||||||
"media", "interest_group_name",
|
"fbclid", "campaign_name", "ad_set_name", "ad_set_id", "media", "interest_group_name", "ad_set_id"
|
||||||
"xtor"
|
"igshid",
|
||||||
]
|
"cvid", "oicd", "msclkid",
|
||||||
|
"soc_src", "soc_trk",
|
||||||
|
"_openstat", "yclid",
|
||||||
|
"xtor", "xtref", "adid",
|
||||||
|
}
|
||||||
query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
|
query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
|
||||||
query_cleaned = [(k, v) for k, v in query_to_clean.items() if not k in params_to_remove]
|
query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
|
||||||
return urlencode(query_cleaned, doseq=True)
|
return urlencode(query_cleaned, doseq=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_trackers_fragment(fragment_str):
|
||||||
|
"""
|
||||||
|
private function
|
||||||
|
Given a fragment string from a URL, strip out the known trackers
|
||||||
|
:param query_str: fragment to be cleaned
|
||||||
|
:return: cleaned fragment
|
||||||
|
"""
|
||||||
|
|
||||||
|
params_to_remove = {
|
||||||
|
"Echobox",
|
||||||
|
}
|
||||||
|
|
||||||
|
if '=' in fragment_str:
|
||||||
|
fragment_str = fragment_str.split('&')
|
||||||
|
query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove]
|
||||||
|
fragment_str = '&'.join(query_cleaned)
|
||||||
|
return fragment_str
|
||||||
|
|
||||||
|
|
||||||
def clean_url(dirty_url):
|
def clean_url(dirty_url):
|
||||||
"""
|
"""
|
||||||
Given a URL, return it with the UTM parameters removed from query and fragment
|
Given a URL, return it with the UTM parameters removed from query and fragment
|
||||||
:param dirty_url: url to be cleaned
|
:param dirty_url: url to be cleaned
|
||||||
:return: url cleaned
|
:return: url cleaned
|
||||||
>>> clean_url('https://exemple.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
|
>>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
|
||||||
'https://exemple.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
|
'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
url_parsed = urlparse(dirty_url)
|
url_parsed = urlparse(dirty_url)
|
||||||
@ -107,18 +163,23 @@ def clean_url(dirty_url):
|
|||||||
url_parsed.netloc,
|
url_parsed.netloc,
|
||||||
url_parsed.path,
|
url_parsed.path,
|
||||||
url_parsed.params,
|
url_parsed.params,
|
||||||
_remove_tracker_params(url_parsed.query),
|
_remove_trackers_query(url_parsed.query),
|
||||||
_remove_tracker_params(url_parsed.fragment)
|
_remove_trackers_fragment(url_parsed.fragment)
|
||||||
])
|
])
|
||||||
|
|
||||||
|
if cleaned_url != dirty_url:
|
||||||
|
logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url)
|
||||||
|
|
||||||
return cleaned_url
|
return cleaned_url
|
||||||
|
|
||||||
|
|
||||||
def process_media_body(tt_iter):
|
def process_media_body(tt_iter, remove_redir, remove_trackers):
|
||||||
"""
|
"""
|
||||||
Receives an iterator over all the elements contained in the tweet-text container.
|
Receives an iterator over all the elements contained in the tweet-text container.
|
||||||
Processes them to make them suitable for posting on Mastodon
|
Processes them to make them suitable for posting on Mastodon
|
||||||
:param tt_iter: iterator over the HTML elements in the text of the tweet
|
:param tt_iter: iterator over the HTML elements in the text of the tweet
|
||||||
|
:param remove_redir: bool to indicate if redirections should be removed
|
||||||
|
:param remove_trackers: bool to indicate if trackers should be removed
|
||||||
:return: cleaned up text of the tweet
|
:return: cleaned up text of the tweet
|
||||||
"""
|
"""
|
||||||
tweet_text = ''
|
tweet_text = ''
|
||||||
@ -138,8 +199,16 @@ def process_media_body(tt_iter):
|
|||||||
# Only keep hashtag text
|
# Only keep hashtag text
|
||||||
tweet_text += tag_text
|
tweet_text += tag_text
|
||||||
else:
|
else:
|
||||||
# This is a real link, keep url
|
# This is a real link
|
||||||
tweet_text += clean_url(tag.get('href'))
|
if remove_redir:
|
||||||
|
url = deredir_url(tag.get('href'))
|
||||||
|
else:
|
||||||
|
url = tag.get('href')
|
||||||
|
|
||||||
|
if remove_trackers:
|
||||||
|
tweet_text += clean_url(url)
|
||||||
|
else:
|
||||||
|
tweet_text += url
|
||||||
else:
|
else:
|
||||||
logging.warning("No handler for tag in twitter text: " + tag.prettify())
|
logging.warning("No handler for tag in twitter text: " + tag.prettify())
|
||||||
|
|
||||||
@ -319,7 +388,8 @@ def main(argv):
|
|||||||
parser.add_argument('-m', metavar='<mastodon account>', action='store')
|
parser.add_argument('-m', metavar='<mastodon account>', action='store')
|
||||||
parser.add_argument('-p', metavar='<mastodon password>', action='store')
|
parser.add_argument('-p', metavar='<mastodon password>', action='store')
|
||||||
parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
|
parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
|
||||||
parser.add_argument('-s', action='store_true', help='Skip retweets')
|
parser.add_argument('-s', action='store_true', help='Suppress retweets')
|
||||||
|
parser.add_argument('-l', action='store_true', help='Remove link redirection')
|
||||||
parser.add_argument('-u', action='store_true', help='Remove trackers from URLs')
|
parser.add_argument('-u', action='store_true', help='Remove trackers from URLs')
|
||||||
parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance')
|
parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance')
|
||||||
parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float)
|
parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float)
|
||||||
@ -463,7 +533,7 @@ def main(argv):
|
|||||||
if toml['options']['post_reply_to']:
|
if toml['options']['post_reply_to']:
|
||||||
url += '/with_replies'
|
url += '/with_replies'
|
||||||
|
|
||||||
# Download twitter page of user.
|
# Download twitter page of user
|
||||||
try:
|
try:
|
||||||
twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
|
twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
|
||||||
except requests.exceptions.ConnectionError:
|
except requests.exceptions.ConnectionError:
|
||||||
@ -576,7 +646,7 @@ def main(argv):
|
|||||||
tt_iter = status.find('div', class_='tweet-content media-body').children
|
tt_iter = status.find('div', class_='tweet-content media-body').children
|
||||||
|
|
||||||
# Process text of tweet
|
# Process text of tweet
|
||||||
tweet_text += process_media_body(tt_iter)
|
tweet_text += process_media_body(tt_iter, remove_redir, remove_trackers)
|
||||||
|
|
||||||
# Process quote: append link to tweet_text
|
# Process quote: append link to tweet_text
|
||||||
quote_div = status.find('a', class_='quote-link')
|
quote_div = status.find('a', class_='quote-link')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user