#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Copyright (C) 2019-2022 Jean-Christophe Francois

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import argparse
from datetime import datetime, timedelta
import logging
import os
import shutil
import random
import re
import sqlite3
import sys
import time
from pathlib import Path
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, urljoin, unquote

import requests
from bs4 import BeautifulSoup, element

from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError

# Number of records to keep in db table for each twitter account
MAX_REC_COUNT = 50

# How many seconds to wait before giving up on a download (except video download)
HTTPS_REQ_TIMEOUT = 10

NITTER_URLS = [
    'https://nitter.lacontrevoie.fr',
    'https://n.l5.ca',
    'https://nitter.cutelab.space',    # USA, added 16/02/2023
    'https://nitter.weiler.rocks',     # added 15/06/2023
    'https://nitter.fly.dev',          # anycast, added 06/02/2023
    'https://notabird.site',           # anycast, added 06/02/2023
    'https://nitter.nl',               # added 16/06/2023
    # 'https://nitter.sethforprivacy.com',  # too slow, removed 16/06/2023
    # 'https://nitter.it',                  # different pic naming scheme
    # 'https://twitter.femboy.hu',          # 404 on 06/05/2023
    # 'https://nitter.grimneko.de',         # 404 on 01/06/2023
    # 'https://nitter.namazso.eu',          # lots of 403 27/02/2023
    # 'https://twitter.beparanoid.de',      # moved 27/02/2023
    # 'https://nitter.fdn.fr',              # not updated, rate limited, removed 06/02/2023
    # 'https://nitter.hu',
    # 'https://nitter.privacydev.net',      # USA, added 06/02/2023, removed 15/02/2023 too slow
]

# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 OPR/99.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Vivaldi/6.1.3035.84',
]


def build_config(args):
    """
    Receives the arguments passed on the command line
    and populates the TOML global dict with default values for all 'options' keys.
    If a config file is provided, load the keys from the config file;
    if no config file is provided, use the command-line args.
    Verify that a valid config is available (all keys in 'config' present).
    :param args: list of command line arguments
    """
    # Create global struct containing configuration
    global TOML
    # Default options
    options = {
        'upload_videos': False,
        'post_reply_to': False,
        'skip_retweets': False,
        'remove_link_redirections': False,
        'remove_trackers_from_urls': False,
        'footer': '',
        'remove_original_tweet_ref': False,
        'tweet_max_age': float(1),
        'tweet_delay': float(0),
        'toot_cap': int(0),
        'subst_twitter': [],
        'subst_youtube': [],
        'subst_reddit': [],
        'update_profile': False,
        'log_level': "WARNING",
        'log_days': 3,
    }
    # Create default config object
    TOML = {'config': {}, 'options': options}

    # Load config file if it was provided
    toml_file = args['f']
    if toml_file is not None:
        try:  # Included in python from version 3.11
            import tomllib
        except ModuleNotFoundError:
            # for python < 3.11, the tomli module must be installed
            import tomli as tomllib

        loaded_toml = None
        # Load toml file
        try:
            with open(toml_file, 'rb') as config_file:
                loaded_toml = tomllib.load(config_file)
        except FileNotFoundError:
            print('config file not found')
            shutdown(-1)
        except tomllib.TOMLDecodeError:
            print('Malformed config file')
            shutdown(-1)

        TOML['config'] = loaded_toml['config']
        for k in TOML['options'].keys():
            try:  # Go through all valid keys
                TOML['options'][k] = loaded_toml['options'][k]
            except KeyError:  # Key was not found in file
                pass
    else:
        # Override config parameters with command-line values provided
        if args['t'] is not None:
            TOML['config']['twitter_account'] = args['t']
        if args['i'] is not None:
            TOML['config']['mastodon_instance'] = args['i']
        if args['m'] is not None:
            TOML['config']['mastodon_user'] = args['m']
        if args['v'] is True:
            TOML['options']['upload_videos'] = args['v']
        if args['r'] is True:
            TOML['options']['post_reply_to'] = args['r']
        if args['s'] is True:
            TOML['options']['skip_retweets'] = args['s']
        if args['l'] is True:
            TOML['options']['remove_link_redirections'] = args['l']
        if args['u'] is True:
            TOML['options']['remove_trackers_from_urls'] = args['u']
        if args['o'] is True:
            TOML['options']['remove_original_tweet_ref'] = args['o']
        if args['a'] is not None:
            TOML['options']['tweet_max_age'] = float(args['a'])
        if args['d'] is not None:
            TOML['options']['tweet_delay'] = float(args['d'])
        if args['c'] is not None:
            TOML['options']['toot_cap'] = int(args['c'])
        if args['q'] is True:
            TOML['options']['update_profile'] = args['q']

    # Verify that we have a minimum config to run
    if 'twitter_account' not in TOML['config'].keys() or TOML['config']['twitter_account'] == "":
        print('CRITICAL: Missing Twitter account')
        exit(-1)
    if 'mastodon_instance' not in TOML['config'].keys() or TOML['config']['mastodon_instance'] == "":
        print('CRITICAL: Missing Mastodon instance')
        exit(-1)
    if 'mastodon_user' not in TOML['config'].keys() or TOML['config']['mastodon_user'] == "":
        print('CRITICAL: Missing Mastodon user')
        exit(-1)


def update_profile(nitter_url, soup, sql, mast_password):
    """
    Update the profile on Mastodon.
    Check if the avatar or banner pictures have changed since the last run.
    If they have, download them and upload them on the Mastodon account profile.
    :param nitter_url: url of the nitter mirror in use
    :param soup: BeautifulSoup object containing the page
    :param sql: database connection
    :param mast_password: password of the Mastodon account
    :return: mastodon object if we had to log in to update, None otherwise
    """
    # Check if TOML option to update profile is set
    if TOML['options']['update_profile'] is False:
        return None
    else:
        logging.debug("Checking twitter profile for changes")

    db = sql.cursor()

    # Extract avatar picture address
    try:
        new_avatar_url = soup.find('div', class_='profile-card-info').findChild('a').findChild('img').get('src')
    except AttributeError:
        new_avatar_url = None

    # Extract banner picture address
    try:
        new_banner_url = soup.find('div', class_='profile-banner').findChild('a').findChild('img').get('src')
    except AttributeError:
        new_banner_url = None

    # Get the original urls of the avatar and banner pictures on the account profile
    db.execute("SELECT avatar_url, banner_url FROM profiles WHERE mastodon_instance=? AND mastodon_account=?", (TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'],))
    profile_in_db = db.fetchone()

    changed = False
    if profile_in_db is not None:
        cur_avatar_url = profile_in_db[0]
        cur_banner_url = profile_in_db[1]

        # Check if urls have changed
        if new_avatar_url != cur_avatar_url:
            changed = True
            logging.info('avatar image changed on twitter profile')
        if new_banner_url != cur_banner_url:
            changed = True
            logging.info('banner image changed on twitter profile')
    else:
        # Mastodon user not found in database. Add new record
        db.execute("INSERT INTO profiles (mastodon_instance, mastodon_account, avatar_url, banner_url) VALUES (?, ?, ?, ?)", (TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], None, None))
        sql.commit()
        changed = True
        logging.debug("added new profile to database")

    mastodon = None
    # Update if necessary
    if changed:
        logging.info('updating profile on Mastodon')

        new_avatar_img = None
        new_avatar_mime = None
        new_banner_img = None
        new_banner_mime = None

        # Download images
        new_avatar = requests.get(nitter_url + new_avatar_url, timeout=HTTPS_REQ_TIMEOUT) if new_avatar_url is not None else None
        if new_avatar is not None:
            new_avatar_img = new_avatar.content if new_avatar.status_code == 200 else None
            new_avatar_mime = new_avatar.headers['content-type'] if new_avatar.status_code == 200 else None
            if new_avatar.status_code != 200:
                logging.error("Could not download avatar image from " + nitter_url + new_avatar_url)
            else:
                logging.debug("Avatar image downloaded")

        new_banner = requests.get(nitter_url + new_banner_url, timeout=HTTPS_REQ_TIMEOUT) if new_banner_url is not None else None
        if new_banner is not None:
            new_banner_img = new_banner.content if new_banner.status_code == 200 else None
            new_banner_mime = new_banner.headers['content-type'] if new_banner.status_code == 200 else None
            if new_banner.status_code != 200:
                logging.error("Could not download banner image from " + nitter_url + new_banner_url)
            else:
                logging.debug("Banner image downloaded")

        mastodon = login(mast_password)

        # Update profile on Mastodon
        try:
            mastodon.account_update_credentials(avatar=new_avatar_img, avatar_mime_type=new_avatar_mime, header=new_banner_img, header_mime_type=new_banner_mime)
        except Exception as e:
            logging.error("Could not update profile")
            logging.error(e)
        else:
            logging.info("Profile updated on Mastodon")
            # Add urls to database
            db.execute("UPDATE profiles SET avatar_url=?, banner_url=? WHERE mastodon_instance=? AND mastodon_account=?", (new_avatar_url, new_banner_url, TOML['config']['mastodon_instance'], TOML['config']['mastodon_user']))
            sql.commit()
            logging.debug("Profile updated on database")
    else:
        logging.debug("No changes to profile found")

    return mastodon


def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from
    :param url: url to be de-redirected
    :return: direct url
    """
    # Check if we need to do anything
    if TOML['options']['remove_link_redirections'] is False:
        return url

    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()
    # Update default headers with randomly selected user agent
    headers.update(
        {
            'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)],
        }
    )

    ret = None
    try:
        # Download the page
        ret = requests.head(url, headers=headers, allow_redirects=True, timeout=5)
    except:
        # If anything goes wrong keep the URL intact
        return url

    if ret.url != url:
        logging.debug("Removed redirection from: " + url + " to: " + ret.url)

    # Return the URL that the page was downloaded from
    return ret.url


def _remove_trackers_query(query_str):
    """
    private function
    Given a query string from a URL, strip out the known trackers
    :param query_str: query to be cleaned
    :return: cleaned query
    """
    # Available URL tracking parameters:
    # UTM tags by Google Ads, M$ Ads, ...
    # tag by TikTok
    # tags by Snapchat
    # tags by Facebook
    params_to_remove = {
        "gclid", "_ga", "gclsrc", "dclid",
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid",
        "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type", "utm_brand",
        "mkt_tok",
        "campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
        "fbclid", "media", "interest_group_name",
        "igshid",
        "cvid", "oicd", "msclkid",
        "soc_src", "soc_trk",
        "_openstat", "yclid",
        "xtor", "xtref", "adid",
    }

    query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
    query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
    return urlencode(query_cleaned, doseq=True)


def _remove_trackers_fragment(fragment_str):
    """
    private function
    Given a fragment string from a URL, strip out the known trackers
    :param fragment_str: fragment to be cleaned
    :return: cleaned fragment
    """
    params_to_remove = {
        "Echobox",
        "mkt_tok",  # also expected to be stripped from fragments (see clean_url doctest)
    }

    if '=' in fragment_str:
        fragment_str = fragment_str.split('&')
        query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove]
        fragment_str = '&'.join(query_cleaned)

    return fragment_str


def substitute_source(orig_url):
    """
    Replace known domains with the substitutes configured for them
    :param orig_url: url to check for substitutes
    :return: url with replaced domains
    """
    parsed_url = urlparse(orig_url)
    domain = parsed_url.netloc
    logging.debug("Checking domain %s for substitution ", domain)

    # Handle twitter
    twitter_subst = TOML["options"]["subst_twitter"]
    # Do not substitute if subdomain is present (e.g. i.twitter.com)
    if (domain == 'twitter.com' or domain == 'www.twitter.com') and twitter_subst != []:
        domain = twitter_subst[random.randint(0, len(twitter_subst) - 1)]
        logging.debug("Replaced twitter.com by " + domain)

    # Handle youtube
    youtube_subst = TOML["options"]["subst_youtube"]
    # Do not substitute if subdomain is present (e.g. i.youtube.com)
    if (domain == 'youtube.com' or domain == 'www.youtube.com') and youtube_subst != []:
        domain = youtube_subst[random.randint(0, len(youtube_subst) - 1)]
        logging.debug("Replaced youtube.com by " + domain)

    # Handle reddit
    reddit_subst = TOML["options"]["subst_reddit"]
    # Do not substitute if subdomain is present (e.g. i.reddit.com)
    if (domain == 'reddit.com' or domain == 'www.reddit.com') and reddit_subst != []:
        domain = reddit_subst[random.randint(0, len(reddit_subst) - 1)]
        logging.debug("Replaced reddit.com by " + domain)

    dest_url = urlunparse([
        parsed_url.scheme,
        domain,
        parsed_url.path,
        parsed_url.params,
        parsed_url.query,
        parsed_url.fragment
    ])
    return dest_url


def clean_url(orig_url):
    """
    Given a URL, return it with the UTM parameters removed from query and fragment
    :param orig_url: url to be cleaned
    :return: url cleaned
    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
    """
    # Check if we have to do anything
    if TOML['options']['remove_trackers_from_urls'] is False:
        return orig_url

    # Parse a URL into 6 components:
    # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    url_parsed = urlparse(orig_url)

    # Reassemble URL after removal of trackers
    dest_url = urlunparse([
        url_parsed.scheme,
        url_parsed.netloc,
        url_parsed.path,
        url_parsed.params,
        _remove_trackers_query(url_parsed.query),
        _remove_trackers_fragment(url_parsed.fragment)
    ])

    if dest_url != orig_url:
        logging.debug('Cleaned URL from: ' + orig_url + ' to: ' + dest_url)

    return dest_url


def process_media_body(tt_iter):
    """
    Receives an iterator over all the elements contained in the tweet-text container.
    Processes them to make them suitable for posting on Mastodon
    :param tt_iter: iterator over the HTML elements in the text of the tweet
    :return: cleaned up text of the tweet
    """
    tweet_text = ''
    # Iterate elements
    for tag in tt_iter:
        # If element is plain text, copy it verbatim
        if isinstance(tag, element.NavigableString):
            tweet_text += tag.string
        # If it is an 'a' html tag
        elif tag.name == 'a':
            tag_text = tag.get_text()
            if tag_text.startswith('@'):
                # Only keep user name
                tweet_text += tag_text
            elif tag_text.startswith('#'):
                # Only keep hashtag text
                tweet_text += tag_text
            else:
                # This is a real link
                url = deredir_url(tag.get('href'))
                url = substitute_source(url)
                url = clean_url(url)
                tweet_text += url
        else:
            logging.warning("No handler for tag in twitter text: " + tag.prettify())

    return tweet_text


def process_card(nitter_url, card_container):
    """
    Extract image from card in case mastodon does not do it
    :param nitter_url: url of nitter mirror
    :param card_container: soup of 'a' tag containing card markup
    :return: list with url of image
    """
    image_list = []

    img = card_container.div.div.img
    if img is not None:
        image_url = nitter_url + img.get('src')
        image_list.append(image_url)
        logging.debug('Extracted image from card')

    return image_list


def process_attachments(nitter_url, attachments_container, status_id, author_account):
    """
    Extract images or video from attachments. Videos are downloaded on the file system.
    :param nitter_url: url of nitter mirror
    :param attachments_container: soup of 'div' tag containing attachments markup
    :param status_id: id of tweet being processed
    :param author_account: author of tweet with video attachment
    :return: list with url of images
    """
    # Collect url of images
    pics = []
    images = attachments_container.find_all('a', class_='still-image')
    for image in images:
        pics.append(nitter_url + image.get('href'))

    logging.debug('collected ' + str(len(pics)) + ' image(s) from attachments')

    # Download nitter video (converted animated GIF)
    gif_class = attachments_container.find('video', class_='gif')
    if gif_class is not None:
        gif_video_file = nitter_url + gif_class.source.get('src')

        video_path = os.path.join('output', TOML['config']['twitter_account'], status_id, author_account, status_id)
        os.makedirs(video_path, exist_ok=True)

        # Open directory for writing file
        orig_dir = os.getcwd()
        os.chdir(video_path)
        with requests.get(gif_video_file, stream=True, timeout=HTTPS_REQ_TIMEOUT) as r:
            try:
                # Raise exception if response code is not 200
                r.raise_for_status()
                # Download chunks and write them to file
                with open('gif_video.mp4', 'wb') as f:
                    for chunk in r.iter_content(chunk_size=16 * 1024):
                        f.write(chunk)
                logging.debug('Downloaded video of GIF animation from attachments')
            except:  # Don't do anything if video can't be found or downloaded
                logging.debug('Could not download video of GIF animation from attachments')
                pass

        # Go back to the original directory
        os.chdir(orig_dir)

    # Download twitter video
    vid_in_tweet = False
    vid_class = attachments_container.find('div', class_='video-container')
    if vid_class is not None:
        if TOML['options']['upload_videos']:
            import youtube_dl

            video_path = f"{author_account}/status/{status_id}"
            video_file = urljoin('https://twitter.com', video_path)
            ydl_opts = {
                'outtmpl': "output/" + TOML['config']['twitter_account'] + "/" + status_id + "/%(id)s.%(ext)s",
                'format': "best[width<=500]",
                'socket_timeout': 60,
                'quiet': True,
            }
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl.download([video_file])
                except Exception as e:
                    logging.warning('Error downloading twitter video: ' + str(e))
                    vid_in_tweet = True
                else:
                    logging.debug('downloaded twitter video from attachments')

    return pics, vid_in_tweet


def contains_class(body_classes, some_class):
    """
    :param body_classes: list of classes to search
    :param some_class: class that we are interested in
    :return: True if found, false otherwise
    """
    found = False
    for body_class in body_classes:
        if body_class == some_class:
            found = True

    return found


def is_time_valid(timestamp):
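    """
    Check that the tweet's timestamp falls within the configured posting window
    :param timestamp: POSIX timestamp of the tweet
    :return: True if the tweet is older than tweet_delay and younger than tweet_max_age
    """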
    ret = True
    # Check that the tweet is not too young (might be deleted) or too old
    age_in_hours = (time.time() - float(timestamp)) / 3600.0
    min_delay_in_hours = TOML['options']['tweet_delay'] / 60.0
    max_age_in_hours = TOML['options']['tweet_max_age'] * 24.0
    if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours:
        ret = False

    return ret


def login(password):
    """
    Login to Mastodon account and return mastodon object used to post content
    :param password: Password associated with the account. None if not provided
    :return: mastodon object
    """
    # Create Mastodon application if it does not exist yet
    if not os.path.isfile(TOML['config']['mastodon_instance'] + '.secret'):
        try:
            Mastodon.create_app(
                'feedtoot',
                api_base_url='https://' + TOML['config']['mastodon_instance'],
                to_file=TOML['config']['mastodon_instance'] + '.secret'
            )
        except MastodonError as me:
            logging.fatal('failed to create app on ' + TOML['config']['mastodon_instance'])
            logging.fatal(me)
            shutdown(-1)

    mastodon = None
    # Log in to Mastodon instance with password
    if password is not None:
        try:
            mastodon = Mastodon(
                client_id=TOML['config']['mastodon_instance'] + '.secret',
                api_base_url='https://' + TOML['config']['mastodon_instance']
            )
            mastodon.log_in(
                username=TOML['config']['mastodon_user'],
                password=password,
                to_file=TOML['config']['mastodon_user'] + ".secret"
            )
            logging.info('Logging in to ' + TOML['config']['mastodon_instance'])
        except MastodonError as me:
            logging.fatal('Login to ' + TOML['config']['mastodon_instance'] + ' Failed\n')
            logging.fatal(me)
            shutdown(-1)

        if os.path.isfile(TOML['config']['mastodon_user'] + '.secret'):
            logging.warning('''You successfully logged in using a password and an access token
                               has been saved. The password can therefore be omitted from the
                               command-line in future invocations''')
    else:  # No password provided, login with token
        # Using token in existing .secret file
        if os.path.isfile(TOML['config']['mastodon_user'] + '.secret'):
            try:
                mastodon = Mastodon(
                    access_token=TOML['config']['mastodon_user'] + '.secret',
                    api_base_url='https://' + TOML['config']['mastodon_instance'])
            except MastodonError as me:
                logging.fatal('Login to ' + TOML['config']['mastodon_instance'] + ' Failed\n')
                logging.fatal(me)
                shutdown(-1)
        else:
            logging.fatal('No .secret file found. Password required to log in')
            shutdown(-1)

    return mastodon


def shutdown(exit_code):
    """
    Cleanly stop execution with a message on execution duration
    Remove log messages older than the duration specified in config from log file
    :param exit_code: return value to pass to shell when exiting
    """
    logging.info('Run time : {t:2.1f} seconds.'.format(t=time.time() - START_TIME))
    logging.info('_____________________________________________________________________________________')
    # Close logger and log file
    logging.shutdown()

    # Remove older log messages
    # Max allowed age of log message
    max_delta = timedelta(TOML['options']['log_days'])
    # Open log file
    log_file_name = TOML['config']['twitter_account'].lower() + '.log'
    new_log_file_name = TOML['config']['twitter_account'].lower() + '.log.new'
    try:
        log_file = open(log_file_name, 'r')
    except FileNotFoundError:
        # Nothing to do if there is no log file
        exit(exit_code)

    # Check each line
    pos = log_file.tell()
    while True:
        line = log_file.readline()
        # Check if we reached the end of the file
        if not line:
            exit(exit_code)
        try:
            # Extract date on log line
            date = datetime.strptime(line[:10], '%Y-%m-%d')
        except ValueError:
            # date was not found on this line, try next one
            continue

        # Time difference between log message and now
        log_delta = datetime.now() - date
        # Only keep the number of days of the difference
        log_delta = timedelta(days=log_delta.days)
        if log_delta < max_delta:
            logging.debug("Truncating log file")
            # Reset file pointer to position before reading last line
            log_file.seek(pos)
            remainder = log_file.read()
            output_file = open(new_log_file_name, 'w')
            output_file.write(remainder)
            output_file.close()
            # replace log file by new one
            shutil.move(new_log_file_name, log_file_name)
            break  # Exit while loop
        # Update read pointer position
        pos = log_file.tell()

    exit(exit_code)


def main(argv):
    # Start stopwatch
    global START_TIME
    START_TIME = time.time()

    # Build parser for command line arguments
    parser = argparse.ArgumentParser(description='toot tweets.')
    parser.add_argument('-f', metavar='<.toml config file>', action='store')
    parser.add_argument('-t', metavar='<twitter account>', action='store')
    parser.add_argument('-i', metavar='<mastodon instance>', action='store')
    parser.add_argument('-m', metavar='<mastodon account>', action='store')
    parser.add_argument('-p', metavar='<mastodon password>', action='store')
    parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
    parser.add_argument('-s', action='store_true', help='Suppress retweets')
    parser.add_argument('-l', action='store_true', help='Remove link redirection')
    parser.add_argument('-u', action='store_true', help='Remove trackers from URLs')
    parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance')
    parser.add_argument('-o', action='store_true', help='Do not add reference to Original tweet')
    parser.add_argument('-q', action='store_true', help='Update profile if changed')
    parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float)
    parser.add_argument('-d', metavar='<min delay (in mins)>', action='store', type=float)
    parser.add_argument('-c', metavar='<max # of toots to post>', action='store', type=int)
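
    # Example invocation (illustrative names; assumes the script is saved as twoot.py):
    #   python3 twoot.py -t SomeAccount -i mastodon.example -m some_user -p 'secret' -v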

    # Parse command line
    args = vars(parser.parse_args())

    build_config(args)

    mast_password = args['p']

    # Setup logging to file
    logging.basicConfig(
        filename=TOML['config']['twitter_account'].lower() + '.log',
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    # Set default level of logging
    log_level = logging.WARNING
    # log level as an uppercase string from config
    ll_str = TOML['options']['log_level'].upper()
    if ll_str == "DEBUG":
        log_level = logging.DEBUG
    elif ll_str == "INFO":
        log_level = logging.INFO
    elif ll_str == "WARNING":
        log_level = logging.WARNING
    elif ll_str == "ERROR":
        log_level = logging.ERROR
    elif ll_str == "CRITICAL":
        log_level = logging.CRITICAL
    elif ll_str == "OFF":
        # Disable all logging
        logging.disable(logging.CRITICAL)
    else:
        logging.error('Invalid log_level %s in config file. Using WARNING.', str(TOML['options']['log_level']))

    # Set desired level of logging
    logger = logging.getLogger()
    logger.setLevel(log_level)

    logging.info('Running with the following configuration:')
    logging.info('    Config File              : ' + str(args['f']))
    logging.info('    twitter_account          : ' + TOML['config']['twitter_account'])
    logging.info('    mastodon_instance        : ' + TOML['config']['mastodon_instance'])
    logging.info('    mastodon_user            : ' + TOML['config']['mastodon_user'])
    logging.info('    upload_videos            : ' + str(TOML['options']['upload_videos']))
    logging.info('    post_reply_to            : ' + str(TOML['options']['post_reply_to']))
    logging.info('    skip_retweets            : ' + str(TOML['options']['skip_retweets']))
    logging.info('    remove_link_redirections : ' + str(TOML['options']['remove_link_redirections']))
    logging.info('    remove_trackers_from_urls: ' + str(TOML['options']['remove_trackers_from_urls']))
    logging.info('    footer                   : ' + TOML['options']['footer'])
    logging.info('    remove_original_tweet_ref: ' + str(TOML['options']['remove_original_tweet_ref']))
    logging.info('    update_profile           : ' + str(TOML['options']['update_profile']))
    logging.info('    tweet_max_age            : ' + str(TOML['options']['tweet_max_age']))
    logging.info('    tweet_delay              : ' + str(TOML['options']['tweet_delay']))
    logging.info('    toot_cap                 : ' + str(TOML['options']['toot_cap']))
    logging.info('    subst_twitter            : ' + str(TOML['options']['subst_twitter']))
    logging.info('    subst_youtube            : ' + str(TOML['options']['subst_youtube']))
    logging.info('    subst_reddit             : ' + str(TOML['options']['subst_reddit']))
    logging.info('    log_level                : ' + str(TOML['options']['log_level']))
    logging.info('    log_days                 : ' + str(TOML['options']['log_days']))

    # Try to open database. If it does not exist, create it
    sql = sqlite3.connect('twoot.db')
    db = sql.cursor()
    db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
               mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
    db.execute('''CREATE INDEX IF NOT EXISTS main_index ON toots (twitter_account,
               mastodon_instance, mastodon_account, tweet_id)''')
    db.execute('''CREATE TABLE IF NOT EXISTS profiles (mastodon_instance TEXT, mastodon_account TEXT, avatar_url TEXT, banner_url TEXT)''')
    db.execute('''CREATE INDEX IF NOT EXISTS profile_index ON profiles (mastodon_instance, mastodon_account)''')

    # Select random nitter instance to fetch updates from
    nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS) - 1)]

    # **********************************************************
    # Load twitter page of user. Process all tweets and generate
    # list of dictionaries ready to be posted on Mastodon
    # **********************************************************
    # To store content of all tweets from this user
    tweets = []

    # Initiate session
    session = requests.Session()

    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()
    # Update default headers with randomly selected user agent
    headers.update(
        {
            'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)],
            'Cookie': 'replaceTwitter=; replaceYouTube=; hlsPlayback=on; proxyVideos=',
        }
    )

    url = nitter_url + '/' + TOML['config']['twitter_account']
    # Use different page if we need to handle replies
    if TOML['options']['post_reply_to']:
        url += '/with_replies'

    # Download twitter page of user
    try:
        twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(nitter_url + ' took too long to respond')
        shutdown(-1)

    # Verify that download worked
    if twit_account_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(
            twit_account_page.status_code) + '). Aborting')
        shutdown(-1)

    logging.debug('Nitter page downloaded successfully from ' + url)

    # DEBUG: Save page to file
    # of = open(TOML['config']['twitter_account'] + '.html', 'w')
    # of.write(twit_account_page.text)
    # of.close()

    # Make soup
    soup = BeautifulSoup(twit_account_page.text, 'html.parser')

    # Extract twitter timeline
    timeline = soup.find_all('div', class_='timeline-item')

    logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')

    # **********************************************************
    # Process each tweet and generate dictionary
    # with data ready to be posted on Mastodon
    # **********************************************************
    out_date_cnt = 0
    in_db_cnt = 0
    for status in timeline:
        # Extract tweet ID and status ID
        tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
        status_id = tweet_id.split('/')[3]

        logging.debug('processing tweet %s', tweet_id)

        # Extract time stamp
        time_string = status.find('span', class_='tweet-date').a.get('title')
        try:
            timestamp = datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp()
        except:
            # Dec 21, 2021 · 12:00 PM UTC
            timestamp = datetime.strptime(time_string, '%b %d, %Y · %I:%M %p %Z').timestamp()

        # Check if time is within acceptable range
        if not is_time_valid(timestamp):
            out_date_cnt += 1
            logging.debug("Tweet outside valid time range, skipping")
            continue

        # Check if retweets must be skipped
        if TOML['options']['skip_retweets']:
            # Check if this tweet is a retweet
            if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
                logging.debug("Retweet ignored per command-line configuration")
                continue

        # Check in database if tweet has already been posted
        db.execute(
            "SELECT * FROM toots WHERE twitter_account=? AND mastodon_instance=? AND mastodon_account=? AND tweet_id=?",
            (TOML['config']['twitter_account'], TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], tweet_id))
        tweet_in_db = db.fetchone()
        if tweet_in_db is not None:
            in_db_cnt += 1
            logging.debug("Tweet %s already in database", tweet_id)
            # Skip to next tweet
            continue
        else:
            logging.debug('Tweet %s not found in database', tweet_id)

        # Extract author
        author = status.find('a', class_='fullname').get('title')

        # Extract user name
        author_account = status.find('a', class_='username').get('title').lstrip('@')

        # Extract URL of full status page (for video download)
        full_status_url = 'https://twitter.com' + tweet_id

        # Initialize containers
        tweet_text = ''
        photos = []

        # Add prefix if the tweet is a reply-to
        # Only consider item of class 'replying-to' that is a direct child
        # of class 'tweet-body' in status. Others can be in a quoted tweet.
        replying_to_class = status.select("div.tweet-body > div.replying-to")
        if len(replying_to_class) != 0:
            tweet_text += 'Replying to ' + replying_to_class[0].a.get_text() + '\n\n'

        # Check if the tweet is a retweet from somebody else
        if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
            tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n'

        # Extract iterator over tweet text contents
        tt_iter = status.find('div', class_='tweet-content media-body').children

        # Process text of tweet
        tweet_text += process_media_body(tt_iter)

        # Process quote: append link to tweet_text
        quote_div = status.find('a', class_='quote-link')
        if quote_div is not None:
            tweet_text += substitute_source('\n\nhttps://twitter.com' + quote_div.get('href').strip('#m'))

        # Process card: extract image if necessary
        card_class = status.find('a', class_='card-container')
        if card_class is not None:
            photos.extend(process_card(nitter_url, card_class))

        # Process attachment: capture image or .mp4 url or download twitter video
        attachments_class = status.find('div', class_='attachments')
        if attachments_class is not None:
            pics, vid_in_tweet = process_attachments(nitter_url,
                                                     attachments_class,
                                                     status_id, author_account)
            photos.extend(pics)
            if vid_in_tweet:
                tweet_text += '\n\n[Video embedded in original tweet]'

        # Add custom footer from config file
        if TOML['options']['footer'] != '':
            tweet_text += '\n\n' + TOML['options']['footer']

        # Add footer with link to original tweet
        if TOML['options']['remove_original_tweet_ref'] is False:
            if TOML['options']['footer'] != '':
                tweet_text += '\nOriginal tweet : ' + substitute_source(full_status_url)
            else:
                tweet_text += '\n\nOriginal tweet : ' + substitute_source(full_status_url)

        # If no media was specifically added in the tweet, try to get the first picture
        # with "twitter:image" meta tag in first linked page in tweet text
        if not photos:
            m = re.search(r"http[^ \n\xa0]*", tweet_text)
            if m is not None:
                link_url = m.group(0)
                if link_url.endswith(".html"):  # Only process a web page
                    try:
                        r = requests.get(link_url, timeout=HTTPS_REQ_TIMEOUT)
                        if r.status_code == 200:
                            # Matches the first instance of either twitter:image or twitter:image:src meta tag
                            match = re.search(r'<meta name="twitter:image(?:|:src)" content="(.+?)".*?>', r.text)
                            if match is not None:
                                url = match.group(1).replace('&amp;', '&')  # Remove HTML-safe encoding from URL if any
                                photos.append(url)
                    # Give up if anything goes wrong
                    except (requests.exceptions.ConnectionError,
                            requests.exceptions.Timeout,
                            requests.exceptions.ContentDecodingError,
                            requests.exceptions.TooManyRedirects,
                            requests.exceptions.MissingSchema):
                        pass
                    else:
                        logging.debug("downloaded twitter:image from linked page")

        # Check if video was downloaded
        video_file = None

        video_path = Path('./output') / TOML['config']['twitter_account'] / status_id
        if video_path.exists():
            # list video files
            video_file_list = list(video_path.glob('*.mp4'))
            if len(video_file_list) != 0:
                # Extract posix path of first video file in list
                video_file = video_file_list[0].absolute().as_posix()

        # Add dictionary with content of tweet to list
        tweet = {
            "author": author,
            "author_account": author_account,
            "timestamp": timestamp,
            "tweet_id": tweet_id,
            "tweet_text": tweet_text,
            "video": video_file,
            "photos": photos,
        }
        tweets.append(tweet)

        logging.debug('Tweet %s added to list of toots to upload', tweet_id)

    # Log summary stats
    logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
    logging.info(str(in_db_cnt) + ' tweets already in database')

    # Initialise Mastodon object
    mastodon = None

    # Update profile if it has changed
    mastodon = update_profile(nitter_url, soup, sql, mast_password)

    # Login to account on mastodon instance
    if len(tweets) != 0 and mastodon is None:
        mastodon = login(mast_password)

    # **********************************************************
    # Iterate tweets in list.
    # Post each on Mastodon and record it in database
    # **********************************************************
    posted_cnt = 0
    for tweet in reversed(tweets):
        # Check if we have reached the cap on the number of toots to post
        if TOML['options']['toot_cap'] != 0 and posted_cnt >= TOML['options']['toot_cap']:
            logging.info('%d toots not posted due to configured cap', len(tweets) - TOML['options']['toot_cap'])
            break

        logging.debug('Uploading Tweet %s', tweet["tweet_id"])

        media_ids = []

        # Upload video if there is one
        if tweet['video'] is not None:
            try:
                logging.debug("Uploading video to Mastodon")
                media_posted = mastodon.media_post(tweet['video'])
                media_ids.append(media_posted['id'])
            except (MastodonAPIError, MastodonIllegalArgumentError,
                    TypeError):  # Media cannot be uploaded (invalid format, dead link, etc.)
                logging.debug("Uploading video failed")
                pass

        else:  # Only upload pics if no video was uploaded
            # Upload photos
            for photo in tweet['photos']:
                media = False
                # Download picture
                try:
                    logging.debug('downloading picture')
                    media = requests.get(photo, timeout=HTTPS_REQ_TIMEOUT)
                except:  # Picture cannot be downloaded for any reason
                    pass

                # Upload picture to Mastodon instance
                if media:
                    try:
                        logging.debug('uploading picture to Mastodon')
                        media_posted = mastodon.media_post(media.content, mime_type=media.headers['content-type'])
                        media_ids.append(media_posted['id'])
                    except (MastodonAPIError, MastodonIllegalArgumentError,
                            TypeError):  # Media cannot be uploaded (invalid format, dead link, etc.)
                        pass

        # Post toot
        toot = {}
        try:
            if len(media_ids) == 0:
                toot = mastodon.status_post(tweet['tweet_text'])
            else:
                toot = mastodon.status_post(tweet['tweet_text'], media_ids=media_ids)

        except MastodonAPIError:
            # Assuming this is an:
            # ERROR ('Mastodon API returned error', 422, 'Unprocessable Entity', 'Cannot attach files that have not finished processing. Try again in a moment!')
            logging.warning('Mastodon API Error 422: Cannot attach files that have not finished processing. Waiting 15 seconds and retrying.')
            # Wait 15 seconds
            time.sleep(15)
            # Retry posting
            try:
                toot = mastodon.status_post(tweet['tweet_text'], media_ids=media_ids)
            except MastodonError as me:
                logging.error('posting ' + tweet['tweet_text'] + ' to ' + TOML['config']['mastodon_instance'] + ' Failed')
                logging.error(me)
            else:
                logging.warning("Retry successful")

        except MastodonError as me:
            logging.error('posting ' + tweet['tweet_text'] + ' to ' + TOML['config']['mastodon_instance'] + ' Failed')
            logging.error(me)

        else:
            posted_cnt += 1
            logging.debug('Tweet %s posted on %s', tweet['tweet_id'], TOML['config']['mastodon_user'])

        # Insert toot id into database
        if 'id' in toot:
            db.execute("INSERT INTO toots VALUES (?, ?, ?, ?, ?)",
                       (TOML['config']['twitter_account'], TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], tweet['tweet_id'], toot['id']))
            sql.commit()

    logging.info(str(posted_cnt) + ' tweets posted to Mastodon')

    # Cleanup downloaded video files
    try:
        shutil.rmtree('./output/' + TOML['config']['twitter_account'])
    except FileNotFoundError:  # The directory does not exist
        pass

    # Evaluate excess records in database
    excess_count = 0

    db.execute('SELECT count(*) FROM toots WHERE twitter_account=?', (TOML['config']['twitter_account'],))
    db_count = db.fetchone()
    if db_count is not None:
        excess_count = db_count[0] - MAX_REC_COUNT

    # Delete excess records
    if excess_count > 0:
        db.execute('''
            WITH excess AS (
                SELECT tweet_id
                FROM toots
                WHERE twitter_account = ?
                ORDER BY toot_id ASC
                LIMIT ?
            )
            DELETE FROM toots
            WHERE tweet_id IN excess''', (TOML['config']['twitter_account'], excess_count))
        sql.commit()
        logging.info('Deleted ' + str(excess_count) + ' old records from database.')

    shutdown(0)


if __name__ == "__main__":
    main(sys.argv)