#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
2023-06-19 18:22:41 +00:00
Copyright ( C ) 2019 - 2023 Jean - Christophe Francois
2019-07-31 20:42:38 +00:00
This program is free software : you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation , either version 3 of the License , or
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with this program . If not , see < http : / / www . gnu . org / licenses / > .
2020-04-05 08:37:54 +00:00
"""

import argparse
from datetime import datetime, timedelta
import logging
import os
import shutil
import random
import re
import sqlite3
import sys
import time
from pathlib import Path
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, urljoin

import requests
from bs4 import BeautifulSoup, element
from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError
import pytz

# Number of records to keep in db table for each twitter account
MAX_REC_COUNT = 50

# How many seconds to wait before giving up on a download (except video download)
HTTPS_REQ_TIMEOUT = 10

# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 OPR/104.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Vivaldi/6.4.3160.34',
]


def main(argv):
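    """
    Main entry point: parse command-line arguments and config file, scrape the
    twitter account's timeline from a Nitter instance, convert the tweets found
    there into toots and post them to the configured Mastodon account.
    """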
    # Start stopwatch
    global START_TIME
    START_TIME = time.time()

    # Build parser for command line arguments
    parser = argparse.ArgumentParser(description='toot tweets.')
    parser.add_argument('-f', metavar='<.toml config file>', action='store')
    parser.add_argument('-t', metavar='<twitter account>', action='store')
    parser.add_argument('-i', metavar='<mastodon instance>', action='store')
    parser.add_argument('-m', metavar='<mastodon account>', action='store')
    parser.add_argument('-p', metavar='<mastodon password>', action='store')
    parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
    parser.add_argument('-s', action='store_true', help='Suppress retweets')
    parser.add_argument('-l', action='store_true', help='Remove link redirection')
    parser.add_argument('-u', action='store_true', help='Remove trackers from URLs')
    parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance')
    parser.add_argument('-o', action='store_true', help='Do not add reference to Original tweet')
    parser.add_argument('-q', action='store_true', help='Update profile if changed')
    parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float)
    parser.add_argument('-d', metavar='<min delay (in mins)>', action='store', type=float)
    parser.add_argument('-c', metavar='<max # of toots to post>', action='store', type=int)

    # Parse command line
    args = vars(parser.parse_args())

    build_config(args)

    mast_password = args['p']

    # Setup logging to file
    logging.basicConfig(
        filename=TOML['config']['twitter_account'].lower() + '.log',
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    # Set default level of logging
    log_level = logging.WARNING
    # log level as an uppercase string from config
    ll_str = TOML['options']['log_level'].upper()

    if ll_str == "DEBUG":
        log_level = logging.DEBUG
    elif ll_str == "INFO":
        log_level = logging.INFO
    elif ll_str == "WARNING":
        log_level = logging.WARNING
    elif ll_str == "ERROR":
        log_level = logging.ERROR
    elif ll_str == "CRITICAL":
        log_level = logging.CRITICAL
    elif ll_str == "OFF":
        # Disable all logging
        logging.disable(logging.CRITICAL)
    else:
        logging.error('Invalid log_level %s in config file. Using WARNING.', str(TOML['options']['log_level']))

    # Set desired level of logging
    logger = logging.getLogger()
    logger.setLevel(log_level)
    logging.info('Running with the following configuration:')
    logging.info('  Config File               : ' + str(args['f']))
    logging.info('  twitter_account           : ' + TOML['config']['twitter_account'])
    logging.info('  mastodon_instance         : ' + TOML['config']['mastodon_instance'])
    logging.info('  mastodon_user             : ' + TOML['config']['mastodon_user'])
    logging.info('  upload_videos             : ' + str(TOML['options']['upload_videos']))
    logging.info('  post_reply_to             : ' + str(TOML['options']['post_reply_to']))
    logging.info('  skip_retweets             : ' + str(TOML['options']['skip_retweets']))
    logging.info('  remove_link_redirections  : ' + str(TOML['options']['remove_link_redirections']))
    logging.info('  remove_trackers_from_urls : ' + str(TOML['options']['remove_trackers_from_urls']))
    logging.info('  footer                    : ' + TOML['options']['footer'])
    logging.info('  tweet_time_format         : ' + TOML['options']['tweet_time_format'])
    logging.info('  tweet_timezone            : ' + TOML['options']['tweet_timezone'])
    logging.info('  remove_original_tweet_ref : ' + str(TOML['options']['remove_original_tweet_ref']))
    logging.info('  update_profile            : ' + str(TOML['options']['update_profile']))
    logging.info('  tweet_max_age             : ' + str(TOML['options']['tweet_max_age']))
    logging.info('  tweet_delay               : ' + str(TOML['options']['tweet_delay']))
    logging.info('  upload_pause              : ' + str(TOML['options']['upload_pause']))
    logging.info('  toot_cap                  : ' + str(TOML['options']['toot_cap']))
    logging.info('  subst_twitter             : ' + str(TOML['options']['subst_twitter']))
    logging.info('  subst_youtube             : ' + str(TOML['options']['subst_youtube']))
    logging.info('  subst_reddit              : ' + str(TOML['options']['subst_reddit']))
    logging.info('  log_level                 : ' + TOML['options']['log_level'])
    logging.info('  log_days                  : ' + str(TOML['options']['log_days']))

    # Try to open database. If it does not exist, create it
    sql = sqlite3.connect('twoot.db')
    db = sql.cursor()
    db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
               mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
    db.execute('''CREATE INDEX IF NOT EXISTS main_index ON toots (twitter_account,
               mastodon_instance, mastodon_account, tweet_id)''')
    db.execute('''CREATE INDEX IF NOT EXISTS tweet_id_index ON toots (tweet_id)''')
    db.execute('''CREATE TABLE IF NOT EXISTS profiles (mastodon_instance TEXT, mastodon_account TEXT, avatar_url TEXT, banner_url TEXT)''')
    db.execute('''CREATE INDEX IF NOT EXISTS profile_index ON profiles (mastodon_instance, mastodon_account)''')
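
    # The 'toots' table maps every posted tweet to its Mastodon toot id. It is
    # used both to skip tweets that have already been posted and to look up the
    # toot to reply to when threading. The 'profiles' table keeps the last
    # avatar/banner urls seen so that update_profile() can detect changes.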

    # Select random nitter instance to fetch updates from
    nitter_url = 'https://' + TOML['options']['nitter_instances'][random.randint(0, len(TOML['options']['nitter_instances']) - 1)]

    # Initiate session
    session = requests.Session()

    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()

    # Update default headers with randomly selected user agent
    headers.update(
        {
            'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)],
            'Cookie': 'replaceTwitter=; replaceYouTube=; hlsPlayback=on; proxyVideos=',
        }
    )
    # Attach the headers to the session so that every request carries them
    session.headers.update(headers)
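
    # The Cookie header above sets Nitter preferences. Assumed semantics:
    # empty replaceTwitter/replaceYouTube values disable Nitter's link
    # rewriting, hlsPlayback=on exposes playable video sources, and an empty
    # proxyVideos serves video urls directly instead of proxying them.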

    # Load twitter page of user
    soup, timeline = get_timeline(session, nitter_url)

    logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')

    # **********************************************************
    # Process each tweet and generate an array of dictionaries
    # with data ready to be posted on Mastodon
    # **********************************************************
    tweets = []
    out_date_cnt = 0
    in_db_cnt = 0
    for replied_to_tweet, status in timeline:
        # Extract tweet ID and status ID
        tweet_link_tag = status.find('a', class_='tweet-link')
        if tweet_link_tag is None:
            logging.debug("Malformed timeline item (no tweet link), skipping")
            continue
        tweet_id = tweet_link_tag.get('href').strip('#m')
        status_id = tweet_id.split('/')[3]

        logging.debug('processing tweet %s', tweet_id)

        # Extract time stamp
        time_string = status.find('span', class_='tweet-date').a.get('title')
        try:
            # e.g. 21/12/2021, 12:00:00
            timestamp = datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S')
        except ValueError:
            # Alternate format used by some instances, e.g. Dec 21, 2021 · 12:00 PM UTC
            timestamp = datetime.strptime(time_string, '%b %d, %Y · %I:%M %p %Z')

        # Check if time is within acceptable range
        if not is_time_valid(timestamp.timestamp()):
            out_date_cnt += 1
            logging.debug("Tweet outside valid time range, skipping")
            continue

        # Check if retweets must be skipped
        if TOML['options']['skip_retweets']:
            # Check if this tweet is a retweet
            if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
                logging.debug("Retweet ignored per command-line configuration")
                continue

        # Check in database if tweet has already been posted
        db.execute(
            "SELECT * FROM toots WHERE twitter_account=? AND mastodon_instance=? AND mastodon_account=? AND tweet_id=?",
            (TOML['config']['twitter_account'], TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], tweet_id))
        tweet_in_db = db.fetchone()

        if tweet_in_db is not None:
            in_db_cnt += 1
            logging.debug("Tweet %s already in database", tweet_id)
            # Skip to next tweet
            continue
        else:
            logging.debug('Tweet %s not found in database', tweet_id)

        # Extract author
        author = status.find('a', class_='fullname').get('title')

        # Extract user name
        author_account = status.find('a', class_='username').get('title').lstrip('@')

        # Extract URL of full status page (for video download)
        full_status_url = 'https://twitter.com' + tweet_id

        # Initialize containers
        tweet_text = ''
        photos = []

        # Add prefix if the tweet is a reply-to
        # Only consider item of class 'replying-to' that is a direct child
        # of class 'tweet-body' in status. Others can be in a quoted tweet.
        replying_to_class = status.select("div.tweet-body > div.replying-to")
        if len(replying_to_class) != 0:
            tweet_text += 'Replying to ' + replying_to_class[0].a.get_text() + '\n\n'

        # Check if the tweet is a retweet from somebody else
        if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
            tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n'

        # Extract iterator over tweet text contents
        tt_iter = status.find('div', class_='tweet-content media-body').children

        # Process text of tweet
        tweet_text += process_media_body(tt_iter)

        # Process quote: append link to tweet_text
        quote_div = status.find('a', class_='quote-link')
        if quote_div is not None:
            tweet_text += '\n\n' + substitute_source('https://twitter.com' + quote_div.get('href').strip('#m'))

        # Process card: extract image if necessary
        card_class = status.find('a', class_='card-container')
        if card_class is not None:
            photos.extend(process_card(nitter_url, card_class))

        # Process attachment: capture image or .mp4 url or download twitter video
        attachments_class = status.find('div', class_='attachments')
        if attachments_class is not None:
            pics, vid_in_tweet = process_attachments(nitter_url,
                                                     attachments_class,
                                                     status_id, author_account)
            photos.extend(pics)
            if vid_in_tweet:
                tweet_text += '\n\n[Video is unavailable]'

        # Add custom footer from config file
        if TOML['options']['footer'] != '':
            tweet_text += '\n\n' + TOML['options']['footer']

        # Add footer with link to original tweet
        if TOML['options']['remove_original_tweet_ref'] is False:
            if TOML['options']['footer'] != '':
                tweet_text += '\nOriginal tweet: ' + substitute_source(full_status_url)
            else:
                tweet_text += '\n\nOriginal tweet: ' + substitute_source(full_status_url)

        # Add timestamp to the "Original tweet" line
        if TOML['options']['tweet_time_format'] != "":
            timestamp_display = timestamp
            # Adjust timezone
            if TOML['options']['tweet_timezone'] != "":
                timezone_display = pytz.timezone(TOML['options']['tweet_timezone'])
            else:  # Use local timezone by default
                timezone_display = datetime.now().astimezone().tzinfo
            logging.debug("Timestamp UTC: " + str(timestamp))
            logging.debug("Timezone to use: " + str(timezone_display))
            timestamp_display = pytz.utc.localize(timestamp).astimezone(timezone_display)
            logging.debug("Timestamp converted: " + str(timestamp_display))
            tweet_text += ' ' + datetime.strftime(timestamp_display, TOML['options']['tweet_time_format'])

        # If no media was specifically added in the tweet, try to get the first picture
        # with "twitter:image" meta tag in first linked page in tweet text
        if not photos:
            m = re.search(r"http[^\n\xa0]*", tweet_text)
            if m is not None:
                link_url = m.group(0)
                if link_url.endswith(".html"):  # Only process a web page
                    try:
                        r = requests.get(link_url, timeout=HTTPS_REQ_TIMEOUT)
                        if r.status_code == 200:
                            # Matches the first instance of either twitter:image or twitter:image:src meta tag
                            match = re.search(r'<meta name="twitter:image(?:|:src)" content="(.+?)".*?>', r.text)
                            if match is not None:
                                url = match.group(1).replace('&amp;', '&')  # Remove HTML-safe encoding from URL if any
                                photos.append(url)
                    # Give up if anything goes wrong
                    except (requests.exceptions.ConnectionError,
                            requests.exceptions.Timeout,
                            requests.exceptions.ContentDecodingError,
                            requests.exceptions.TooManyRedirects,
                            requests.exceptions.MissingSchema):
                        pass
                    else:
                        logging.debug("Downloaded twitter:image from linked page")

        # Check if video was downloaded
        video_file = None
        video_path = Path('./output') / TOML['config']['twitter_account'] / status_id
        if video_path.exists():
            # List video files
            video_file_list = list(video_path.glob('*.mp4'))
            if len(video_file_list) != 0:
                # Extract posix path of first video file in list
                video_file = video_file_list[0].absolute().as_posix()

        # Add dictionary with content of tweet to list
        tweet = {
            "author": author,
            "author_account": author_account,
            "timestamp": timestamp.timestamp(),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text,
            "video": video_file,
            "photos": photos,
            "replied_to_tweet": replied_to_tweet,
        }
        tweets.append(tweet)

        logging.debug('Tweet %s added to list of toots to upload', tweet_id)

    # Log summary stats
    logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
    logging.info(str(in_db_cnt) + ' tweets already in database')

    # Initialise Mastodon object
    mastodon = None

    # Update profile if it has changed
    mastodon = update_profile(session, nitter_url, soup, sql, mast_password)

    # Login to account on mastodon instance
    if len(tweets) != 0 and mastodon is None:
        mastodon = login(mast_password)

    # Check toot character limit on mastodon instance
    if mastodon is not None:
        try:
            max_characters = mastodon.instance().configuration.statuses['max_characters']
            logging.debug('Instance character limit is ' + str(max_characters))
        except Exception:
            # Default value for Mastodon
            max_characters = 500
            logging.debug('Tried to get toot character limit from Mastodon instance but failed. Assuming 500')

    # **********************************************************
    # Iterate over tweets in list.
    # Post each on Mastodon and record it in database
    # **********************************************************
    posted_cnt = 0
    for tweet in reversed(tweets):
        # Check if we have reached the cap on the number of toots to post
        if TOML['options']['toot_cap'] != 0 and posted_cnt >= TOML['options']['toot_cap']:
            logging.info('%d toots not posted due to configured cap', len(tweets) - TOML['options']['toot_cap'])
            break

        logging.debug('Uploading Tweet %s', tweet['tweet_id'])

        media_ids = []

        # Upload video if there is one
        if tweet['video'] is not None:
            try:
                logging.debug("Uploading video to Mastodon")
                media_posted = mastodon.media_post(tweet['video'])
                media_ids.append(media_posted['id'])
            except (MastodonAPIError, MastodonIllegalArgumentError,
                    TypeError):  # Media cannot be uploaded (invalid format, dead link, etc.)
                logging.debug("Uploading video failed")

        else:  # Only upload pics if no video was uploaded
            # Upload photos
            for photo in tweet['photos']:
                media = False
                # Download picture
                try:
                    logging.debug('downloading picture')
                    media = requests.get(photo, timeout=HTTPS_REQ_TIMEOUT)
                except:  # Picture cannot be downloaded for any reason
                    pass

                # Upload picture to Mastodon instance
                if media:
                    try:
                        logging.debug('uploading picture to Mastodon')
                        media_posted = mastodon.media_post(media.content, mime_type=media.headers['content-type'])
                        media_ids.append(media_posted['id'])
                    except (MastodonAPIError, MastodonIllegalArgumentError,
                            TypeError):  # Media cannot be uploaded (invalid format, dead link, etc.)
                        pass

        # Find in database the toot id of the tweet this tweet replies to
        replied_to_toot = None
        if tweet['replied_to_tweet'] is not None:
            logging.debug("Searching db for toot corresponding to replied-to tweet " + tweet['replied_to_tweet'])
            db.execute("SELECT toot_id FROM toots WHERE tweet_id=?", [tweet['replied_to_tweet']])
            replied_to_toot = db.fetchone()
            if replied_to_toot is None:
                logging.warning('Replied-to tweet %s not found in database', tweet['replied_to_tweet'])
            else:
                logging.debug("toot %s found", replied_to_toot)

        # Post toot
        toot = {}
        try:
            if len(media_ids) == 0:
                toot = mastodon.status_post(tweet['tweet_text'], replied_to_toot)
            else:
                toot = mastodon.status_post(tweet['tweet_text'], replied_to_toot, media_ids=media_ids)

        except MastodonAPIError as e:
            _, status_code, _, exception_message = e.args
            if status_code == 500:
                logging.error('Mastodon internal server error')
                logging.error('Posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' failed')
                continue
            elif exception_message.find('Text character limit') != -1:
                # ERROR (('Mastodon API returned error', 422, 'Unprocessable Entity', 'Validation failed: Text character limit of 500 exceeded'))
                logging.error('Toot text too long: %s characters', str(len(tweet['tweet_text'])))
                logging.error('Posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' failed')
                continue
            elif exception_message.find('Try again in a moment') != -1:
                # ERROR ('Mastodon API returned error', 422, 'Unprocessable Entity', 'Cannot attach files that have not finished processing. Try again in a moment!')
                logging.warning('Mastodon API Error 422: Cannot attach files that have not finished processing. Waiting 30 seconds and retrying.')
                # Wait 30 seconds
                time.sleep(30)
                # Retry posting
                try:
                    toot = mastodon.status_post(tweet['tweet_text'], media_ids=media_ids)
                except MastodonError as me:
                    logging.error('Posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' failed')
                    logging.error(me)
                else:
                    logging.warning("Retry successful")

        except MastodonError as me:
            logging.error('Posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' failed')
            logging.error(me)

        else:
            posted_cnt += 1
            logging.debug('Tweet %s posted on %s', tweet['tweet_id'], TOML['config']['mastodon_user'])
            # Test to find out if slowing down successive posting helps with ordering of threads
            time.sleep(TOML['options']['upload_pause'])

        # Insert toot id into database
        if 'id' in toot:
            db.execute("INSERT INTO toots VALUES (?, ?, ?, ?, ?)",
                       (TOML['config']['twitter_account'], TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], tweet['tweet_id'], toot['id']))
            sql.commit()

    logging.info(str(posted_cnt) + ' tweets posted to Mastodon')

    # Cleanup downloaded video files
    try:
        shutil.rmtree('./output/' + TOML['config']['twitter_account'])
    except FileNotFoundError:  # The directory does not exist
        pass

    # Evaluate excess records in database
    excess_count = 0

    db.execute('SELECT count(*) FROM toots WHERE twitter_account=?', (TOML['config']['twitter_account'],))
    db_count = db.fetchone()
    if db_count is not None:
        excess_count = db_count[0] - MAX_REC_COUNT

    # Delete excess records
    if excess_count > 0:
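        # Delete the oldest records first. This assumes toot_id sorts
        # chronologically (Mastodon ids are snowflake-like and increase over
        # time), so ascending order targets the earliest posts.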
        db.execute('''
            WITH excess AS (
                SELECT tweet_id
                FROM toots
                WHERE twitter_account = ?
                ORDER BY toot_id ASC
                LIMIT ?
            )
            DELETE FROM toots
            WHERE tweet_id IN excess''', (TOML['config']['twitter_account'], excess_count))
        sql.commit()

        logging.info('Deleted ' + str(excess_count) + ' old records from database.')

    shutdown(0)


def build_config(args):
    """
    Receive the arguments passed on the command line.
    Populate the TOML global dict with default values for all 'options' keys.
    If a config file is provided, load the keys from the config file.
    If no config file is provided, use command-line args.
    Verify that a valid config is available (all keys in 'config' present).
    :param args: dict of command line arguments
    """
    # Create global struct containing configuration
    global TOML

    # Default options
    options = {
        'nitter_instances': [
            'nitter.poast.org',       # added 25/08/2023
            'nitter.d420.de',         # added 25/08/2023
            'nitter.salastil.com',    # added 25/08/2023
            'nitter.privacydev.net',  # added 25/08/2023
            # 'nitter.cz',  # removed 30/10/2023
            # 'tweet.whateveritworks.org',  # removed 30/10/2023
            # 'nitter.hyperreal.coffee',  # removed 30/10/2023
            # 'bird.habedieeh.re',  # gone 14/09/2023
            # 'nitter.nicfab.eu',  # gone 14/09/2023
            # 'nitter.unixfox.eu',  # rate-limited 13/09/2023
            # 'nt.ggtyler.dev',  # gone 13/09/2023
        ],
        'upload_videos': False,
        'post_reply_to': False,
        'skip_retweets': False,
        'remove_link_redirections': False,
        'remove_trackers_from_urls': False,
        'footer': "",
        'tweet_time_format': "",
        'tweet_timezone': "",
        'remove_original_tweet_ref': False,
        'tweet_max_age': float(1),
        'tweet_delay': float(0),
        'upload_pause': float(0),
        'toot_cap': int(0),
        'subst_twitter': [],
        'subst_youtube': [],
        'subst_reddit': [],
        'update_profile': False,
        'log_level': "WARNING",
        'log_days': 3,
    }

    # Create default config object
    TOML = {'config': {}, 'options': options}

    # Load config file if it was provided
    toml_file = args['f']
    if toml_file is not None:
        try:  # Included in python from version 3.11
            import tomllib
        except ModuleNotFoundError:
            # For python < 3.11, the tomli module must be installed
            import tomli as tomllib

        loaded_toml = None
        # Load toml file
        try:
            with open(toml_file, 'rb') as config_file:
                loaded_toml = tomllib.load(config_file)
        except FileNotFoundError:
            print('config file not found')
            shutdown(-1)
        except tomllib.TOMLDecodeError:
            print('Malformed config file')
            shutdown(-1)

        TOML['config'] = loaded_toml['config']
        for k in TOML['options'].keys():
            try:  # Go through all valid keys
                TOML['options'][k] = loaded_toml['options'][k]
            except KeyError:  # Key was not found in file
                pass
    else:
        # Override config parameters with command-line values provided
        if args['t'] is not None:
            TOML['config']['twitter_account'] = args['t']
        if args['i'] is not None:
            TOML['config']['mastodon_instance'] = args['i']
        if args['m'] is not None:
            TOML['config']['mastodon_user'] = args['m']
        if args['v'] is True:
            TOML['options']['upload_videos'] = args['v']
        if args['r'] is True:
            TOML['options']['post_reply_to'] = args['r']
        if args['s'] is True:
            TOML['options']['skip_retweets'] = args['s']
        if args['l'] is True:
            TOML['options']['remove_link_redirections'] = args['l']
        if args['u'] is True:
            TOML['options']['remove_trackers_from_urls'] = args['u']
        if args['o'] is True:
            TOML['options']['remove_original_tweet_ref'] = args['o']
        if args['a'] is not None:
            TOML['options']['tweet_max_age'] = float(args['a'])
        if args['d'] is not None:
            TOML['options']['tweet_delay'] = float(args['d'])
        if args['c'] is not None:
            TOML['options']['toot_cap'] = int(args['c'])
        if args['q'] is True:
            TOML['options']['update_profile'] = args['q']

    # Verify that we have a minimum config to run
    if 'twitter_account' not in TOML['config'].keys() or TOML['config']['twitter_account'] == "":
        print('CRITICAL: Missing Twitter account')
        exit(-1)
    if 'mastodon_instance' not in TOML['config'].keys() or TOML['config']['mastodon_instance'] == "":
        print('CRITICAL: Missing Mastodon instance')
        exit(-1)
    if 'mastodon_user' not in TOML['config'].keys() or TOML['config']['mastodon_user'] == "":
        print('CRITICAL: Missing Mastodon user')
        exit(-1)


def get_timeline(session, nitter_url):
    """
    Download timeline of twitter account.
    :param session: configured requests session including user agent
    :param nitter_url: url of the account page to download
    :return: list of tuples with url of tweet replied-to (or None) and content of tweet
    """
    # Define url to use
    url = nitter_url + '/' + TOML['config']['twitter_account']

    # Use different page if we need to handle replies
    if TOML['options']['post_reply_to']:
        url += '/with_replies'

    # Download twitter page of user
    try:
        twit_account_page = session.get(url, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(url + ' took too long to respond')
        shutdown(-1)

    # Verify that download worked
    if twit_account_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(
            twit_account_page.status_code) + '). Aborting')
        shutdown(-1)

    logging.debug('Nitter page downloaded successfully from ' + url)

    # DEBUG: Save page to file
    # of = open('user_page_debug.html', 'w')
    # of.write(twit_account_page.text)
    # of.close()

    # Make soup
    soup = BeautifulSoup(twit_account_page.text, 'html.parser')

    # Get the div containing tweets
    tl = soup.find('div', class_='timeline')

    # Get the list of direct children of timeline
    items = tl.find_all('div', recursive=False)

    timeline = []
    for item in items:
        classes = item['class']
        if 'timeline-item' in classes:  # Individual tweet
            timeline.append((None, item))
        elif 'thread-line' in classes:  # First tweet of a thread
            # Get the first item of thread
            first_item = item.find('div', class_='timeline-item')
            # Get the url of the tweet
            thread_link_tag = item.find('a', class_='tweet-link')
            if thread_link_tag is not None:
                thread_url = thread_link_tag.get('href').strip('#m')
                # Get the rest of the items of the thread
                timeline.extend(_get_rest_of_thread(session, nitter_url, thread_url, first_item))
        else:
            # Ignore other classes
            continue

    return soup, timeline


def _get_rest_of_thread(session, nitter_url, thread_url, first_item):
    """
    Download the page with the full thread of tweets and extract the tweet each
    tweet replies to, referenced by url. Only used by `get_timeline()`.
    :param session: existing HTTP session with Nitter instance
    :param nitter_url: url of the nitter instance to use
    :param thread_url: url of the first tweet in thread
    :param first_item: soup of the first tweet of the thread
    :return: list of tuples with url of tweet replied-to (or None) and content of tweet
    """
    # Add first item to timeline
    timeline = [(None, first_item)]

    logging.debug("Downloading tweets in thread from separate page")
    # Download page with thread
    url = nitter_url + thread_url
    try:
        thread_page = session.get(url, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(url + ' took too long to respond')
        shutdown(-1)

    # Verify that download worked
    if thread_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(thread_page.status_code) + '). Aborting')
        shutdown(-1)

    logging.debug('Nitter page downloaded successfully from ' + url)

    # DEBUG: Save page to file
    # of = open('thread_page_debug.html', 'w')
    # of.write(thread_page.text)
    # of.close()

    # Make soup
    soup = BeautifulSoup(thread_page.text, 'html.parser')

    items = []
    # Get all items in thread after main tweet
    after_tweet = soup.find('div', 'after-tweet')
    if after_tweet is not None:
        items = after_tweet.find_all('div', class_='timeline-item')

    # Build timeline of tuples
    previous_tweet_url = thread_url
    for item in items:
        timeline.append((previous_tweet_url, item))
        # Get the url of the tweet
        tweet_link_tag = item.find('a', class_='tweet-link')
        if tweet_link_tag is not None:
            previous_tweet_url = tweet_link_tag.get('href').strip('#m')
        else:
            previous_tweet_url = None
            logging.error('Thread tweet is missing link tag')

    # Return timeline in reverse chronological order
    timeline.reverse()
    return timeline


def is_time_valid(timestamp):
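    """
    Check that the tweet's age falls within the configured posting window.
    For example, with tweet_delay = 120 (minutes) and tweet_max_age = 2 (days),
    only tweets between 2 hours and 48 hours old are accepted.
    :param timestamp: posix timestamp of the tweet
    :return: True if the tweet is neither too recent nor too old
    """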
    ret = True
    # Check that the tweet is not too young (might be deleted) or too old
    age_in_hours = (time.time() - float(timestamp)) / 3600.0
    min_delay_in_hours = TOML['options']['tweet_delay'] / 60.0
    max_age_in_hours = TOML['options']['tweet_max_age'] * 24.0

    if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours:
        ret = False

    return ret


def process_media_body(tt_iter):
    """
    Receives an iterator over all the elements contained in the tweet-text container.
    Processes them to make them suitable for posting on Mastodon.
    :param tt_iter: iterator over the HTML elements in the text of the tweet
    :return: cleaned up text of the tweet
    """
    tweet_text = ''
    # Iterate elements
    for tag in tt_iter:
        # If element is plain text, copy it verbatim
        if isinstance(tag, element.NavigableString):
            tweet_text += tag.string

        # If it is an 'a' html tag
        elif tag.name == 'a':
            tag_text = tag.get_text()
            if tag_text.startswith('@'):
                # Only keep user name
                tweet_text += tag_text
            elif tag_text.startswith('#'):
                # Only keep hashtag text
                tweet_text += tag_text
            else:
                # This is a real link
                url = deredir_url(tag.get('href'))
                url = substitute_source(url)
                url = clean_url(url)
                tweet_text += url
        else:
            logging.warning("No handler for tag in twitter text: %s", tag.prettify())

    return tweet_text


def process_card(nitter_url, card_container):
    """
    Extract image from card in case mastodon does not do it.
    :param nitter_url: url of nitter mirror
    :param card_container: soup of 'a' tag containing card markup
    :return: list with url of image
    """
    pics = []

    img = card_container.div.div.img
    if img is not None:
        image_url = nitter_url + img.get('src')
        pics.append(image_url)
        logging.debug('Extracted image from card')

    return pics


def process_attachments(nitter_url, attachments_container, status_id, author_account):
    """
    Extract images or video from attachments. Videos are downloaded on the file system.
    :param nitter_url: url of nitter mirror
    :param attachments_container: soup of 'div' tag containing attachments markup
    :param status_id: id of tweet being processed
    :param author_account: author of tweet with video attachment
    :return: list with url of images
    """
    # Collect url of images
    pics = []
    images = attachments_container.find_all('a', class_='still-image')
    for image in images:
        pics.append(nitter_url + image.get('href'))

    logging.debug('collected ' + str(len(pics)) + ' image(s) from attachments')

    # Download nitter video (converted animated GIF)
    gif_class = attachments_container.find('video', class_='gif')
    if gif_class is not None:
        gif_video_file = nitter_url + gif_class.source.get('src')

        video_path = os.path.join('output', TOML['config']['twitter_account'], status_id, author_account, status_id)
        os.makedirs(video_path, exist_ok=True)

        # Open directory for writing file
        orig_dir = os.getcwd()
        os.chdir(video_path)
        with requests.get(gif_video_file, stream=True, timeout=HTTPS_REQ_TIMEOUT) as r:
            try:
                # Raise exception if response code is not 200
                r.raise_for_status()
                # Download chunks and write them to file
                with open('gif_video.mp4', 'wb') as f:
                    for chunk in r.iter_content(chunk_size=16 * 1024):
                        f.write(chunk)

                logging.debug('Downloaded video of GIF animation from attachments')
            except:  # Don't do anything if video can't be found or downloaded
                logging.debug('Could not download video of GIF animation from attachments')

        # Return to the original working directory
        os.chdir(orig_dir)
# Download twitter video
vid_in_tweet = False
vid_container = attachments_container . find ( ' div ' , class_ = ' video-container ' )
if vid_container is not None :
if TOML [ ' options ' ] [ ' upload_videos ' ] :
logging . debug ( " downloading video from twitter " )
import youtube_dl
2022-12-11 10:15:50 +00:00
2023-09-14 15:41:51 +00:00
video_path_source = vid_container . source
if video_path_source is not None :
video_path = video_path_source [ ' src ' ]
if video_path is not None :
video_file = urljoin ( nitter_url , video_path )
ydl_opts = {
' outtmpl ' : " output/ " + TOML [ ' config ' ] [ ' twitter_account ' ] + " / " + status_id + " / %(id)s . %(ext)s " ,
# 'format': "best[width<=500]",
' socket_timeout ' : 60 ,
' quiet ' : True ,
}
2020-12-18 16:21:41 +00:00
2023-09-14 15:41:51 +00:00
with youtube_dl . YoutubeDL ( ydl_opts ) as ydl :
try :
ydl . download ( [ video_file ] )
except Exception as e :
logging . warning ( ' Error downloading twitter video: ' + str ( e ) )
vid_in_tweet = True
else :
logging . debug ( ' downloaded twitter video from attachments ' )
else :
logging . debug ( " Media is unavailable " )
vid_in_tweet = True
else :
2023-10-31 15:24:54 +00:00
logging . debug ( " Media is unavailable " )
vid_in_tweet = True
2021-06-01 13:49:11 +00:00
2023-09-14 15:41:51 +00:00
return pics , vid_in_tweet


def update_profile(session, nitter_url, soup, sql, mast_password):
    """
    Update profile on Mastodon.
    Check if avatar or banner pictures have changed since last run.
    If they have, download them and upload them on the Mastodon account profile.
    :param session: configured requests session including user agent
    :param nitter_url: url of the Nitter instance that is being used
    :param soup: BeautifulSoup object containing the page
    :param sql: database connection
    :param mast_password: password for the Mastodon account, or None
    :return: mastodon object if we had to login to update, None otherwise
    """
    # Check if TOML option to update profile is set
    if TOML['options']['update_profile'] is False:
        return None
    else:
        logging.debug("Checking twitter profile for changes")

    db = sql.cursor()

    # Extract avatar picture address
    try:
        new_avatar_url = soup.find('div', class_='profile-card-info').findChild('a').findChild('img').get('src')
    except AttributeError:
        new_avatar_url = None

    # Extract banner picture address
    try:
        new_banner_url = soup.find('div', class_='profile-banner').findChild('a').findChild('img').get('src')
    except AttributeError:
        new_banner_url = None

    # Get the original urls of the avatar and banner pictures on the account profile
    db.execute("SELECT avatar_url, banner_url FROM profiles WHERE mastodon_instance=? AND mastodon_account=?", (TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'],))
    profile_in_db = db.fetchone()

    changed = False
    if profile_in_db is not None:
        cur_avatar_url = profile_in_db[0]
        cur_banner_url = profile_in_db[1]

        # Check if urls have changed
        if new_avatar_url != cur_avatar_url:
            changed = True
            logging.info('avatar image changed on twitter profile')
        if new_banner_url != cur_banner_url:
            changed = True
            logging.info('banner image changed on twitter profile')
    else:
        # Mastodon user not found in database. Add new record
        db.execute("INSERT INTO profiles (mastodon_instance, mastodon_account, avatar_url, banner_url) VALUES (?, ?, ?, ?)", (TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], None, None))
        sql.commit()
        changed = True
        logging.debug("added new profile to database")

    mastodon = None

    # Update if necessary
    if changed:
        logging.info('updating profile on Mastodon')

        new_avatar_img = None
        new_avatar_mime = None
        new_banner_img = None
        new_banner_mime = None

        # Download images
        new_avatar = session.get(nitter_url + new_avatar_url, timeout=HTTPS_REQ_TIMEOUT) if new_avatar_url is not None else None
        if new_avatar is not None:
            new_avatar_img = new_avatar.content if new_avatar.status_code == 200 else None
            new_avatar_mime = new_avatar.headers['content-type'] if new_avatar.status_code == 200 else None
            if new_avatar.status_code != 200:
                logging.error("Could not download avatar image from " + nitter_url + new_avatar_url)
                logging.error("Status code: " + str(new_avatar.status_code))
            else:
                logging.debug("Avatar image downloaded")

        new_banner = session.get(nitter_url + new_banner_url, timeout=HTTPS_REQ_TIMEOUT) if new_banner_url is not None else None
        if new_banner is not None:
            new_banner_img = new_banner.content if new_banner.status_code == 200 else None
            new_banner_mime = new_banner.headers['content-type'] if new_banner.status_code == 200 else None
            if new_banner.status_code != 200:
                logging.error("Could not download banner image from " + nitter_url + new_banner_url)
                logging.error("Status code: " + str(new_banner.status_code))
            else:
                logging.debug("Banner image downloaded")

        mastodon = login(mast_password)

        # Update profile on Mastodon
        try:
            mastodon.account_update_credentials(avatar=new_avatar_img, avatar_mime_type=new_avatar_mime, header=new_banner_img, header_mime_type=new_banner_mime)
        except Exception as e:
            logging.error("Could not update profile")
            logging.error(e)
        else:
            logging.info("Profile updated on Mastodon")
            # Add urls to database
            db.execute("UPDATE profiles SET avatar_url=?, banner_url=? WHERE mastodon_instance=? AND mastodon_account=?", (new_avatar_url, new_banner_url, TOML['config']['mastodon_instance'], TOML['config']['mastodon_user']))
            sql.commit()
            logging.debug("Profile updated on database")
    else:
        logging.info("No changes to profile found")

    return mastodon


def login(password):
    """
    Login to Mastodon account and return mastodon object used to post content.
    :param password: password associated to account. None if not provided
    :return: mastodon object
    """
    # Create Mastodon application if it does not exist yet
    if not os.path.isfile(TOML['config']['mastodon_instance'] + '.secret'):
        try:
            Mastodon.create_app(
                'feedtoot',
                api_base_url='https://' + TOML['config']['mastodon_instance'],
                to_file=TOML['config']['mastodon_instance'] + '.secret'
            )
        except MastodonError as me:
            logging.fatal('failed to create app on ' + TOML['config']['mastodon_instance'])
            logging.fatal(me)
            shutdown(-1)

    mastodon = None

    # Log in to Mastodon instance with password
    if password is not None:
        try:
            mastodon = Mastodon(
                client_id=TOML['config']['mastodon_instance'] + '.secret',
                api_base_url='https://' + TOML['config']['mastodon_instance']
            )

            mastodon.log_in(
                username=TOML['config']['mastodon_user'],
                password=password,
                to_file=TOML['config']['mastodon_user'] + '.secret'
            )
            logging.info('Logging in to ' + TOML['config']['mastodon_instance'])
        except MastodonError as me:
            logging.fatal('Login to ' + TOML['config']['mastodon_instance'] + ' failed\n')
            logging.fatal(me)
            shutdown(-1)

        if os.path.isfile(TOML['config']['mastodon_user'] + '.secret'):
            logging.warning('''You successfully logged in using a password and an access token
                            has been saved. The password can therefore be omitted from the
                            command-line in future invocations''')
    else:  # No password provided, login with token
        # Using token in existing .secret file
        if os.path.isfile(TOML['config']['mastodon_user'] + '.secret'):
            try:
                mastodon = Mastodon(
                    access_token=TOML['config']['mastodon_user'] + '.secret',
                    api_base_url='https://' + TOML['config']['mastodon_instance'])
            except MastodonError as me:
                logging.fatal('Login to ' + TOML['config']['mastodon_instance'] + ' failed\n')
                logging.fatal(me)
                shutdown(-1)
        else:
            logging.fatal('No .secret file found. Password required to log in')
            shutdown(-1)

    return mastodon


def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from.
    :param url: url to be de-redirected
    :return: direct url
    """
    # Check if we need to do anything
    if TOML['options']['remove_link_redirections'] is False:
        return url

    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()

    # Update default headers with randomly selected user agent
    headers.update(
        {
            'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)],
        }
    )

    ret = None
    try:
        # Download the page
        ret = requests.head(url, headers=headers, allow_redirects=True, timeout=5)
    except:
        # If anything goes wrong keep the URL intact
        return url

    if ret.url != url:
        logging.debug("Removed redirection from: " + url + " to: " + ret.url)

    # Return the URL that the page was downloaded from
    return ret.url


def substitute_source(orig_url):
    """
    Replace twitter, youtube and reddit domains by their configured substitutes.
    :param orig_url: url to check for substitutes
    :return: url with replaced domains
    """
    parsed_url = urlparse(orig_url)
    domain = parsed_url.netloc

    logging.debug("Checking domain %s for substitution", domain)

    # Handle twitter
    twitter_subst = TOML["options"]["subst_twitter"]
    # Do not substitute if subdomain is present (e.g. i.twitter.com)
    if (domain == 'twitter.com' or domain == 'www.twitter.com') and twitter_subst != []:
        domain = twitter_subst[random.randint(0, len(twitter_subst) - 1)]
        logging.debug("Replaced twitter.com by " + domain)

    # Handle youtube
    youtube_subst = TOML["options"]["subst_youtube"]
    # Do not substitute if subdomain is present (e.g. i.youtube.com)
    if (domain == 'youtube.com' or domain == 'www.youtube.com') and youtube_subst != []:
        domain = youtube_subst[random.randint(0, len(youtube_subst) - 1)]
        logging.debug("Replaced youtube.com by " + domain)

    # Handle reddit
    reddit_subst = TOML["options"]["subst_reddit"]
    # Do not substitute if subdomain is present (e.g. i.reddit.com)
    if (domain == 'reddit.com' or domain == 'www.reddit.com') and reddit_subst != []:
        domain = reddit_subst[random.randint(0, len(reddit_subst) - 1)]
        logging.debug("Replaced reddit.com by " + domain)

    dest_url = urlunparse([
        parsed_url.scheme,
        domain,
        parsed_url.path,
        parsed_url.params,
        parsed_url.query,
        parsed_url.fragment
    ])

    return dest_url


def clean_url(orig_url):
    """
    Given a URL, return it with the UTM parameters removed from query and fragment.
    :param orig_url: url to be cleaned
    :return: cleaned url
    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
    """
    # Check if we have to do anything
    if TOML['options']['remove_trackers_from_urls'] is False:
        return orig_url

    # Parse a URL into 6 components:
    # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    url_parsed = urlparse(orig_url)

    # Reassemble URL after removal of trackers
    dest_url = urlunparse([
        url_parsed.scheme,
        url_parsed.netloc,
        url_parsed.path,
        url_parsed.params,
        _remove_trackers_query(url_parsed.query),
        _remove_trackers_fragment(url_parsed.fragment)
    ])
    if dest_url != orig_url:
        logging.debug('Cleaned URL from: ' + orig_url + ' to: ' + dest_url)

    return dest_url


def _remove_trackers_query(query_str):
    """
    Private function.
    Given a query string from a URL, strip out the known trackers.
    :param query_str: query to be cleaned
    :return: cleaned query
    """
    # Available URL tracking parameters:
    # UTM tags by Google Ads, M$ Ads, ...
    # tag by TikTok
    # tags by Snapchat
    # tags by Facebook
    params_to_remove = {
        "gclid", "_ga", "gclsrc", "dclid",
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid",
        "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type", "utm_brand",
        "mkt_tok",
        "campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
        "fbclid", "media", "interest_group_name",
        "igshid",
        "cvid", "oicd", "msclkid",
        "soc_src", "soc_trk",
        "_openstat", "yclid",
        "xtor", "xtref", "adid",
    }
    query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
    query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
    return urlencode(query_cleaned, doseq=True)


def _remove_trackers_fragment(fragment_str):
    """
    Private function.
    Given a fragment string from a URL, strip out the known trackers.
    :param fragment_str: fragment to be cleaned
    :return: cleaned fragment
    """
    params_to_remove = {
        "Echobox",
    }

    if '=' in fragment_str:
        fragment_str = fragment_str.split('&')
        query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove]
        fragment_str = '&'.join(query_cleaned)

    return fragment_str


def shutdown(exit_code):
    """
    Cleanly stop execution with a message on execution duration.
    Remove log messages older than the duration specified in config from the log file.
    :param exit_code: return value to pass to shell when exiting
    """
    logging.info('Run time : {t:2.1f} seconds.'.format(t=time.time() - START_TIME))
    logging.info('_____________________________________________________________________________________')

    # Close logger and log file
    logging.shutdown()

    # Remove older log messages
    # Max allowed age of log message
    max_delta = timedelta(TOML['options']['log_days'])

    # Open log file
    log_file_name = TOML['config']['twitter_account'].lower() + '.log'
    new_log_file_name = TOML['config']['twitter_account'].lower() + '.log.new'

    try:
        log_file = open(log_file_name, 'r')
    except FileNotFoundError:
        # Nothing to do if there is no log file
        exit(exit_code)

    # Check each line
    pos = log_file.tell()
    while True:
        line = log_file.readline()
        # Check if we reached the end of the file
        if not line:
            exit(exit_code)

        try:
            # Extract date on log line
            date = datetime.strptime(line[:10], '%Y-%m-%d')
        except ValueError:
            # Date was not found on this line, try next one
            continue

        # Time difference between log message and now
        log_delta = datetime.now() - date
        # Only keep the number of days of the difference
        log_delta = timedelta(days=log_delta.days)
        if log_delta < max_delta:
            logging.debug("Truncating log file")
            # Reset file pointer to position before reading last line
            log_file.seek(pos)
            remainder = log_file.read()
            output_file = open(new_log_file_name, 'w')
            output_file.write(remainder)
            output_file.close()
            # Replace log file by new one
            shutil.move(new_log_file_name, log_file_name)
            break  # Exit while loop

        # Update read pointer position
        pos = log_file.tell()

    exit(exit_code)


if __name__ == "__main__":
    main(sys.argv)