import os.path
import sys
import re
import html
import time
import shutil

import sqlite3
from datetime import datetime, timedelta
import json
import subprocess

import feedparser
from mastodon import Mastodon
import requests
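
# tootbot mirrors a Twitter account (fetched with twint) or an RSS/Atom feed
# to a Mastodon account. Every processed item is recorded in a per-account
# sqlite database so it is tooted at most once.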


def log(msg):
    # debug logging is disabled by default; flip the constant below to True
    if False:
        print('\033[96m' + msg + '\033[0m', file=sys.stderr)  # cyan in console
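

# unredir() manually follows HTTP 301/302 redirect chains (requests is told
# not to follow them itself) so that shortened links (t.co, ow.ly, bit.ly, ...)
# can be replaced by their final target. Illustrative, hypothetical example:
#   unredir('https://bit.ly/abc123')  ->  'https://example.com/some/article'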
def unredir(redir):
    r = requests.get(redir, allow_redirects=False)
    redir_count = 0
    while r.status_code in {301, 302}:
        redir_count = redir_count + 1
        if redir_count > 10:
            break
        location = r.headers.get('Location')
        if 'go.france24.com' in redir:
            # decoding hack in case the "Location" header is UTF-8 encoded (it should not be)
            location = location.encode("latin1").decode("utf-8")
        if 'http' not in location:
            # relative redirect: keep only the scheme://host part of the current URL
            redir = re.sub(r'(https?://[^/]*).*$', r'\1', redir) + location
        else:
            redir = location
        if '//ow.ly/' in redir or '//bit.ly/' in redir:
            redir = redir.replace('https://ow.ly/', 'http://ow.ly/')  # ow.ly only answers over http
            redir = requests.get(redir, allow_redirects=False).headers.get('Location')
        try:
            r = requests.get(redir, allow_redirects=False, timeout=5)
        except Exception:
            redir = redir.replace('https://', 'http://')  # retry over plain http
            r = requests.get(redir, allow_redirects=False)
    return redir
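

# Example invocation (hypothetical account, credentials and instance):
#   python3 tootbot.py some_account bot@example.com s3cret mastodon.example 1 '#bot' 0 en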
if len(sys.argv) < 4:
    print("Usage: python3 tootbot.py twitter_account mastodon_login mastodon_passwd mastodon_instance [max_days [footer_tags [delay [lang]]]]")  # noqa
    sys.exit(1)

if len(sys.argv) > 4:
    instance = sys.argv[4]
else:
    instance = 'amicale.net'

if len(sys.argv) > 5:
    days = int(sys.argv[5])
else:
    days = 1

if len(sys.argv) > 6:
    tags = sys.argv[6]
else:
    tags = None

if len(sys.argv) > 7:
    delay = int(sys.argv[7])
else:
    delay = 0

if len(sys.argv) > 8:
    lang = sys.argv[8]
else:
    lang = 'fr'

source = sys.argv[1]
mastodon = sys.argv[2]
passwd = sys.argv[3]

if 'http' not in source:
    # switch to the local per-account directory
    try:
        os.mkdir(source)
    except FileExistsError:
        pass
    os.chdir(source)
    # migrate the (old) global sqlite database to the local account directory
    if not os.path.exists('tootbot.db') and os.path.exists('../tootbot.db'):
        shutil.copy('../tootbot.db', 'tootbot.db')

sql = sqlite3.connect('tootbot.db')
db = sql.cursor()
db.execute('''CREATE TABLE IF NOT EXISTS tweets (tweet text, toot text,
           twitter text, mastodon text, instance text)''')
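
# Column meanings: "tweet" stores the source status id (for Twitter threads,
# "<status id> <conversation_id>"), "toot" the resulting Mastodon status id;
# twitter/mastodon/instance identify the mirrored account pair.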

# Create the application if it does not exist yet
if not os.path.isfile(instance + '.secret'):
    if Mastodon.create_app(
        'tootbot',
        api_base_url='https://' + instance,
        to_file=instance + '.secret'
    ):
        log('tootbot app created on instance ' + instance)
    else:
        log('failed to create app on instance ' + instance)
        sys.exit(1)
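
# Log in: first try to reuse a previously saved user token (<login>.secret);
# failing that, authenticate with the app credentials and the password, and
# save the obtained token for the next run.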
try:
    mastodon_api = Mastodon(access_token=mastodon + ".secret")
    log('logged in')
except Exception:
    try:
        mastodon_api = Mastodon(
            client_id=instance + '.secret',
            api_base_url='https://' + instance
        )
        log('login')
        mastodon_api.log_in(
            username=mastodon,
            password=passwd,
            scopes=['read', 'write'],
            to_file=mastodon + ".secret"
        )
    except Exception:
        print("ERROR: First Login Failed!")
        sys.exit(1)

print(source)
print("---------------------------")

if source[:4] == 'http':
    d = feedparser.parse(source)
    twitter = None
    print(len(d.entries))

    for t in reversed(d.entries):
        # check if this entry has been processed already
        if 'id' in t:
            id = t.id
        else:
            id = t.title
        db.execute('SELECT * FROM tweets WHERE tweet = ? AND twitter = ? AND mastodon = ? AND instance = ?', (id, source, mastodon, instance))  # noqa
        last = db.fetchone()
        dt = t.published_parsed
        age = datetime.now() - datetime(dt.tm_year, dt.tm_mon, dt.tm_mday,
                                        dt.tm_hour, dt.tm_min, dt.tm_sec)
        # process only unprocessed entries less than `days` days old, after `delay` days
        if last is None and age < timedelta(days=days) and age > timedelta(days=delay):
            c = t.title
            if twitter and t.author.lower() != ('(@%s)' % twitter).lower():
                c = ("RT https://twitter.com/%s\n" % t.author[2:-1]) + c
            toot_media = []
            # get the pictures...
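            # Media handling: each matched media URL is downloaded and
            # re-uploaded to the instance with media_post(); the returned
            # media ids are collected in toot_media and attached to the toot.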
            if 'summary' in t:
                for p in re.finditer(r"https://pbs\.twimg\.com/[^\xa0\"]*", t.summary):
                    media = requests.get(p.group(0))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    toot_media.append(media_posted['id'])

                for p in re.finditer(r"https://imgs\.xkcd\.com/[^\"]*", t.summary):
                    print(p.group(0))
                    media = requests.get(p.group(0))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    toot_media.append(media_posted['id'])

                for p in re.finditer(r"https://i\.redd\.it/[a-zA-Z0-9]*\.(gif|jpg|mp4|png|webp)", t.summary):
                    mediaUrl = p.group(0)
                    try:
                        media = requests.get(mediaUrl)
                        media_posted = mastodon_api.media_post(
                            media.content, mime_type=media.headers.get('content-type'))
                        toot_media.append(media_posted['id'])
                    except Exception:
                        print('Could not upload media to Mastodon! ' + mediaUrl)
            if 'links' in t:
                for l in t.links:
                    if l.type in ('image/gif', 'image/jpg', 'image/png', 'image/webp'):
                        media = requests.get(l.url, headers={'User-agent': 'Mozilla/5.0'})
                        if media.status_code == 200:
                            media_posted = mastodon_api.media_post(
                                media.content, mime_type=media.headers.get('content-type'))
                            toot_media.append(media_posted['id'])
            # replace the short link by the original URL
            m = re.search(r"http[^\xa0]*", c)
            if m is not None:
                l = m.group(0)
                try:
                    redir = unredir(l)
                    c = c.replace(l, redir)
                except Exception:
                    print('Cannot resolve link redirect: ' + l)

            # remove ellipsis
            c = c.replace('\xa0…', ' ')

            if 'authors' in t:
                c = c + '\nSource: ' + t.authors[0].name
            c = c + '\n\n' + t.link

            # replace reddit links by libreddit ones
            c = c.replace('old.reddit.com', 'libreddit.net')
            c = c.replace('reddit.com', 'libreddit.net')

            if tags:
                c = c + '\n' + tags

            if toot_media is not None:
                toot = mastodon_api.status_post(c,
                                                in_reply_to_id=None,
                                                media_ids=toot_media,
                                                sensitive=False,
                                                visibility='unlisted',
                                                spoiler_text=None)
                if "id" in toot:
                    db.execute("INSERT INTO tweets VALUES (?, ?, ?, ?, ?)",
                               (id, toot["id"], source, mastodon, instance))
                    sql.commit()

else:
    # clean up the local database after migration from the global one
    db.execute("DELETE FROM tweets WHERE twitter != ?", (source,))
    sql.commit()
    db.execute("VACUUM")

    # fetch the latest tweets with twint and merge them into one JSON array
    subprocess.run('rm -f tweets.*json; twint -u %s -tl --limit 10 --json -o tweets.sjson; jq -s . tweets.sjson > tweets.json' %
                   (source,), shell=True, capture_output=True)
    d = json.load(open('tweets.json', 'r'))
    twitter = source

    print(len(d))
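
    # Each record t below comes from twint's JSON output and carries (at
    # least) the fields used here: 'tweet', 'link', 'id', 'conversation_id',
    # 'username', 'photos', 'reply_to', 'quote_url' and 'created_at'.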
    for t in reversed(d):
        c = html.unescape(t['tweet'])
        # do not toot twitter replies
        if 'reply_to' in t and len(t['reply_to']) > 0:
            # print('Reply:', c)
            continue
        # do not toot quoted retweets
        if 'quote_url' in t and t['quote_url'] != '':
            # print('Quoted:', c)
            continue

        # check if this tweet has been processed already
        # (the id now comes from the status link, to support threads)
        id = t['link'].split('/')[-1]
        db.execute('SELECT * FROM tweets WHERE (tweet LIKE ? OR tweet = ?) AND twitter = ? AND mastodon = ? AND instance = ?', (id + '%', t['id'], source, mastodon, instance))  # noqa
        if db.fetchone():
            continue
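
        # Thread support: rows are stored as "<status id> <conversation_id>",
        # so a LIKE '%<conversation_id>' lookup finds the latest toot of the
        # same conversation and the new toot replies to it.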
        # detect threads
        in_reply_to = None
        if 'conversation_id' in t and t['conversation_id'] not in t['link']:
            db.execute('SELECT toot FROM tweets WHERE tweet LIKE ? AND twitter = ? ORDER BY tweet DESC LIMIT 1', ('%' + t['conversation_id'], source))  # noqa
            thread = db.fetchone()
            if thread:
                in_reply_to = thread[0].split()[-1]
                print("Thread:", t['conversation_id'],
                      t['link'], thread[0], in_reply_to)

        # skip truncated tweets
        if c[-1] == "…":
            continue
        toot_media = []
        if twitter and t['username'].lower() != twitter.lower():
            c = ("RT https://twitter.com/%s\n" % t['username']) + c

        # get the pictures...
        for p in re.finditer(r"https://pbs\.twimg\.com/[^\xa0\"]*", t['tweet']):
            media = requests.get(p.group(0))
            media_posted = mastodon_api.media_post(
                media.content, mime_type=media.headers.get('content-type'))
            toot_media.append(media_posted['id'])
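
        # Prefer fetching the original-resolution picture via the nitter.net
        # mirror; fall back to the pbs.twimg.com URL when nitter fails.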
        if 'photos' in t:
            for url in t['photos']:
                # print('photo', url)
                try:
                    media = requests.get(url.replace(
                        'https://pbs.twimg.com/', 'https://nitter.net/pic/orig/'))
                    # print("received nitter", media.headers.get('content-type'))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    # print("posted")
                    toot_media.append(media_posted['id'])
                except Exception:
                    media = requests.get(url)
                    # print("received twitter", media.headers.get('content-type'))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    # print("posted")
                    toot_media.append(media_posted['id'])
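
        # Twitter photo links are dropped (their media is already attached);
        # twitter/youtube video links shorter than 10 minutes are fetched
        # with yt-dlp (capped at 100M by --max-filesize) and re-uploaded
        # as video/mp4.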
        # replace short links by the original URL
        links = re.findall(r"http[^\xa0]*", c)
        for l in links:
            redir = unredir(l)
            m = re.search(r'twitter\.com/.*/photo/', redir)
            if m is None:
                c = c.replace(l, redir)
            else:
                c = c.replace(l, '')

            m = re.search(r'(twitter\.com/.*/video/|youtube\.com)', redir)
            if m is None:
                c = c.replace(l, redir)
            else:
                video = redir
                # print('video:', video)
                video_json = subprocess.run('yt-dlp -s -j %s' %
                                            (video,), shell=True, capture_output=True)
                video_info = json.loads(video_json.stdout)
                if video_info['duration'] < 600:
                    # print('link:', l)
                    c = c.replace(l, '')
                    subprocess.run('rm -f out.*; yt-dlp -N 8 -o out.mp4 --recode-video mp4 --no-playlist --max-filesize 100M %s' %
                                   (video,), shell=True, capture_output=False)
                    # print("received")
                    try:
                        with open("out.mp4", "rb") as f:
                            video_data = f.read()
                        media_posted = mastodon_api.media_post(video_data, mime_type='video/mp4')
                        c = c.replace(video, '')
                        # print("posted")
                        toot_media.append(media_posted['id'])
                        os.remove("out.mp4")
                    except Exception:
                        pass
                else:
                    print("video duration > 600s:", video_info['duration'])
        # remove pic.twitter.com links
        m = re.search(r"pic\.twitter\.com[^\xa0]*", c)
        if m is not None:
            l = m.group(0)
            c = c.replace(l, '')

        # remove ellipsis
        c = c.replace('\xa0…', ' ')

        # c = c.replace('  ', '\n').replace('. ', '.\n')

        # replace twitter links by nitter ones
        c = c.replace('/twitter.com/', '/nitter.net/')

        # replace utm_* tracking parameters
        c = re.sub(r'\?utm.*$', '?utm_medium=Social&utm_source=Mastodon', c)

        if tags:
            c = c + '\n' + tags
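
        # Post the toot: when media is attached, wait 5s first so the
        # instance can finish processing the uploads; on any error, wait 30s
        # and retry once.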
        try:
            if len(toot_media) > 0:
                time.sleep(5)
            toot = mastodon_api.status_post(c,
                                            in_reply_to_id=in_reply_to,
                                            media_ids=toot_media,
                                            sensitive=False,
                                            visibility='unlisted',
                                            spoiler_text=None, language=lang)
        except Exception:
            print("delay")
            time.sleep(30)
            toot = mastodon_api.status_post(c,
                                            in_reply_to_id=in_reply_to,
                                            media_ids=toot_media,
                                            sensitive=False,
                                            visibility='unlisted',
                                            spoiler_text=None, language=lang)

        # break
        if "id" in toot:
            db.execute("INSERT INTO tweets VALUES (?, ?, ?, ?, ?)", (id + ' ' + t['conversation_id'], toot["id"], source, mastodon, instance))
            sql.commit()
            print(source, ": tweet created at", t['created_at'])

print("---------------------------")
print()