tootbot/tootbot.py

373 lines
14 KiB
Python
Raw Normal View History

2017-05-28 07:41:05 +00:00
import os.path
import sys
2019-04-03 15:44:07 +00:00
import re
2022-11-05 08:48:13 +00:00
import html
import time
import shutil
2022-11-05 08:48:13 +00:00
2019-04-03 15:44:07 +00:00
import sqlite3
from datetime import datetime, timedelta
2022-11-05 08:44:34 +00:00
import json
import subprocess
2019-04-03 15:44:07 +00:00
2017-05-28 07:41:05 +00:00
import feedparser
from mastodon import Mastodon
import requests
def unredir(redir):
    """Follow HTTP redirects manually and return the final URL.

    Resolves shortened links (t.co, ow.ly, bit.ly, ...) by chasing up to
    10 consecutive 301/302 responses with ``allow_redirects=False`` so the
    intermediate URLs can be inspected and rewritten along the way.

    :param redir: the (possibly shortened) URL to resolve
    :return: the last URL reached (best effort: on errors or redirect
             loops, the most recently computed URL is returned)
    """
    r = requests.get(redir, allow_redirects=False, timeout=5)
    redir_count = 0
    while r.status_code in {301, 302}:
        redir_count = redir_count + 1
        if redir_count > 10:
            # give up on (probable) redirect loops
            break
        location = r.headers.get('Location')
        if location is None:
            # malformed 301/302 without a Location header: stop here
            # (original code would crash with TypeError on the 'in' test)
            break
        if 'go.france24.com' in redir:
            # decoding hack in case "location" header is UTF-8 encoded (should not !)
            location = location.encode("latin1").decode("utf-8")
        if 'http' not in location:
            # relative redirect: keep scheme+host of the current URL
            redir = re.sub(r'(https?://[^/]*).*$', r'\1', redir) + location
        else:
            redir = location
        if '//ow.ly/' in redir or '//bit.ly/' in redir:
            redir = redir.replace('https://ow.ly/', 'http://ow.ly/')  # only http
            redir = requests.get(redir, allow_redirects=False, timeout=5).headers.get('Location')
        try:
            r = requests.get(redir, allow_redirects=False, timeout=5)
        except requests.RequestException:
            # some hosts only answer over plain http: retry downgraded
            redir = redir.replace('https://', 'http://')  # only http ?
            r = requests.get(redir, allow_redirects=False, timeout=5)
    return redir
# --- command-line arguments -------------------------------------------------
# Mandatory: twitter_account (or an RSS/http URL), mastodon_login,
# mastodon_passwd.  Optional: mastodon_instance, max_days, footer_tags, delay.
if len(sys.argv) < 4:
    print("Usage: python3 tootbot.py twitter_account mastodon_login mastodon_passwd mastodon_instance [max_days [footer_tags [delay]]]")  # noqa
    sys.exit(1)

# Mastodon instance to post to (default: amicale.net)
if len(sys.argv) > 4:
    instance = sys.argv[4]
else:
    instance = 'amicale.net'

# only process items younger than this many days (default: 1)
if len(sys.argv) > 5:
    days = int(sys.argv[5])
else:
    days = 1

# optional hashtag string appended to every toot
if len(sys.argv) > 6:
    tags = sys.argv[6]
else:
    tags = None

# minimum age (in days) an item must reach before being tooted
if len(sys.argv) > 7:
    delay = int(sys.argv[7])
else:
    delay = 0

source = sys.argv[1]    # twitter account name, or RSS feed URL
mastodon = sys.argv[2]  # mastodon login
passwd = sys.argv[3]    # mastodon password
if 'http' not in source:
    # twitter-account mode: switch to a per-account local directory
    try:
        os.mkdir(source)
    except:
        # directory most likely exists already; best effort
        pass
    os.chdir(source)
    # copy (old) global sqlite database to local account directory
    if not os.path.exists('tootbot.db'):
        shutil.copy('../tootbot.db', 'tootbot.db')

# database of already-processed items: one row per posted toot
sql = sqlite3.connect('tootbot.db')
db = sql.cursor()
db.execute('''CREATE TABLE IF NOT EXISTS tweets (tweet text, toot text,
            twitter text, mastodon text, instance text)''')
# Create application if it does not exist
if not os.path.isfile(instance+'.secret'):
    if Mastodon.create_app(
        'tootbot',
        api_base_url='https://'+instance,
        to_file=instance+'.secret'
    ):
        print('tootbot app created on instance '+instance)
    else:
        print('failed to create app on instance '+instance)
        sys.exit(1)

# log in with the user credentials; the access token is cached in
# <mastodon_login>.secret for later runs
try:
    mastodon_api = Mastodon(
        client_id=instance+'.secret',
        api_base_url='https://'+instance
    )
    mastodon_api.log_in(
        username=mastodon,
        password=passwd,
        scopes=['read', 'write'],
        to_file=mastodon+".secret"
    )
except:
    print("ERROR: First Login Failed!")
    sys.exit(1)
2017-05-28 07:41:05 +00:00
2022-11-06 08:31:06 +00:00
print(source)
print("---------------------------")

# source starting with http(s) is treated as an RSS feed; otherwise (else
# branch below) it is a twitter account scraped with twint
if source[:4] == 'http':
    d = feedparser.parse(source)
    twitter = None
    print(len(d.entries))
    # oldest entries first, so toots appear in chronological order
    for t in reversed(d.entries):
        # check if this tweet has been processed
        # NOTE(review): `id` here is the *builtin* function on the first
        # test — this probably was meant to be `'id' in t`; verify.
        if id in t:
            id = t.id
        else:
            id = t.title
        db.execute('SELECT * FROM tweets WHERE tweet = ? AND twitter = ? and mastodon = ? and instance = ?', (id, source, mastodon, instance))  # noqa
        last = db.fetchone()
        # age of the entry, from its RSS publication time
        dt = t.published_parsed
        age = datetime.now()-datetime(dt.tm_year, dt.tm_mon, dt.tm_mday,
                                      dt.tm_hour, dt.tm_min, dt.tm_sec)
        # process only unprocessed tweets less than `days` old, after `delay`
        if last is None and age < timedelta(days=days) and age > timedelta(days=delay):
            c = t.title
            # mark entries authored by someone else as retweets
            if twitter and t.author.lower() != ('(@%s)' % twitter).lower():
                c = ("RT https://twitter.com/%s\n" % t.author[2:-1]) + c
            toot_media = []
            # get the pictures...
            if 'summary' in t:
                # twitter-hosted images
                for p in re.finditer(r"https://pbs.twimg.com/[^ \xa0\"]*", t.summary):
                    media = requests.get(p.group(0))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    toot_media.append(media_posted['id'])

                # xkcd images
                for p in re.finditer(r"https://imgs.xkcd.com/[^ \"]*", t.summary):
                    print(p.group(0))
                    media = requests.get(p.group(0))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    toot_media.append(media_posted['id'])

                # reddit-hosted images
                # NOTE(review): the alternation `(gif/jpg/mp4/png|webp)` looks
                # like a typo for `(gif|jpg|mp4|png|webp)` — confirm intent.
                for p in re.finditer(r"https://i.redd.it/[a-zA-Z0-9]*.(gif/jpg/mp4/png|webp)", t.summary):
                    mediaUrl = p.group(0)
                    try:
                        media = requests.get(mediaUrl)
                        media_posted = mastodon_api.media_post(
                            media.content, mime_type=media.headers.get('content-type'))
                        toot_media.append(media_posted['id'])
                    except:
                        print('Could not upload media to Mastodon! ' + mediaUrl)

            # media advertised as RSS enclosure links
            if 'links' in t:
                for l in t.links:
                    if l.type in ('image/gif', 'image/jpg', 'image/png', 'image/webp'):
                        media = requests.get(l.url, headers={'User-agent': 'Mozilla/5.0'})
                        if media.status_code == 200:
                            media_posted = mastodon_api.media_post(
                                media.content, mime_type=media.headers.get('content-type'))
                            toot_media.append(media_posted['id'])

            # replace short links by original URL (first link only)
            m = re.search(r"http[^ \xa0]*", c)
            if m is not None:
                l = m.group(0)
                try:
                    redir = unredir(l)
                    c = c.replace(l, redir)
                except:
                    print('Cannot resolve link redirect: ' + l)

            # remove ellipsis
            c = c.replace('\xa0', ' ')

            # append author and permalink
            if 'authors' in t:
                c = c + '\nSource: ' + t.authors[0].name
            c = c + '\n\n' + t.link

            # replace links to reddit by libreddit ones
            c = c.replace('old.reddit.com', 'libreddit.net')
            c = c.replace('reddit.com', 'libreddit.net')

            if tags:
                c = c + '\n' + tags

            # NOTE(review): toot_media is always a list here, so this
            # condition is always true.
            if toot_media is not None:
                toot = mastodon_api.status_post(c,
                                                in_reply_to_id=None,
                                                media_ids=toot_media,
                                                sensitive=False,
                                                visibility='unlisted',
                                                spoiler_text=None)
                # remember the posted toot so the entry is never re-tooted
                if "id" in toot:
                    db.execute("INSERT INTO tweets VALUES ( ? , ? , ? , ? , ? )",
                               (id, toot["id"], source, mastodon, instance))
                    sql.commit()
2019-04-03 15:47:28 +00:00
else:
    # twitter-account mode: scrape the timeline with twint

    # cleanup local database after migration from the global one
    db.execute("DELETE FROM tweets WHERE twitter != ?", (source,))
    sql.commit()
    db.execute("VACUUM")

    # fetch the 10 most recent tweets as JSON (twint writes one JSON object
    # per line; jq -s turns that into a single JSON array)
    subprocess.run('rm -f tweets.*json; twint -u %s -tl --limit 10 --json -o tweets.sjson; jq -s . tweets.sjson > tweets.json' %
                   (source,), shell=True, capture_output=True)
    d = json.load(open('tweets.json', 'r'))
    twitter = source

    print(len(d))

    # oldest tweets first, so toots appear in chronological order
    for t in reversed(d):
        c = html.unescape(t['tweet'])
        # do not toot twitter replies
        if 'reply_to' in t and len(t['reply_to']) > 0:
            # print('Reply:',c)
            continue
        # do not toot twitter quoted RT
        if 'quote_url' in t and t['quote_url'] != '':
            # print('Quoted:', c)
            continue

        # check if this tweet has been processed
        # new id from status link to support threads
        id = t['link'].split('/')[-1]
        db.execute('SELECT * FROM tweets WHERE (tweet like ? or tweet = ?) AND twitter = ? and mastodon = ? and instance = ?', (id+'%', t['id'], source, mastodon, instance))  # noqa
        if db.fetchone():
            continue

        # detect threads: look for an earlier toot of the same conversation
        # and reply to it (the toot id is the last token of the stored value)
        in_reply_to = None
        if 'conversation_id' in t and t['conversation_id'] not in t['link']:
            db.execute('SELECT toot FROM tweets WHERE tweet like ? AND twitter = ? ORDER BY tweet DESC LIMIT 1', ('% '+t['conversation_id'], source))  # noqa
            thread = db.fetchone()
            if thread:
                in_reply_to = thread[0].split()[-1]
                print("Thread :", t['conversation_id'],
                      t['link'], thread[0], in_reply_to)

        # NOTE(review): this literal looks like it lost a character (likely
        # the ellipsis "…" marking truncated tweets) — as written the test
        # can never be true for a non-empty string; verify against history.
        if c[-1] == "":
            continue

        toot_media = []
        # mark tweets authored by someone else as retweets
        if twitter and t['username'].lower() != twitter.lower():
            c = ("RT https://twitter.com/%s\n" % t['username']) + c
        # get the pictures...
        for p in re.finditer(r"https://pbs.twimg.com/[^ \xa0\"]*", t['tweet']):
            media = requests.get(p.group(0))
            media_posted = mastodon_api.media_post(
                media.content, mime_type=media.headers.get('content-type'))
            toot_media.append(media_posted['id'])

        if 'photos' in t:
            for url in t['photos']:
                # print('photo', url)
                try:
                    # prefer the full-resolution copy via nitter
                    media = requests.get(url.replace(
                        'https://pbs.twimg.com/', 'https://nitter.net/pic/orig/'))
                    # print("received nitter", media.headers.get('content-type'))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    # print("posted")
                    toot_media.append(media_posted['id'])
                except:
                    # fall back to the original twitter URL
                    media = requests.get(url)
                    # print("received twitter", media.headers.get('content-type'))
                    media_posted = mastodon_api.media_post(
                        media.content, mime_type=media.headers.get('content-type'))
                    # print("posted")
                    toot_media.append(media_posted['id'])

        # replace short links by original URL
        links = re.findall(r"http[^ \xa0]*", c)
        for l in links:
            redir = unredir(l)
            # drop links that are just photo attachments
            m = re.search(r'twitter.com/.*/photo/', redir)
            if m is None:
                c = c.replace(l, redir)
            else:
                c = c.replace(l, '')
            # download and attach videos (twitter/youtube) via yt-dlp
            m = re.search(r'(twitter.com/.*/video/|youtube.com)', redir)
            if m is None:
                c = c.replace(l, redir)
            else:
                video = redir
                # print('video:', video)
                # probe metadata only (-s -j) to check the duration first
                video_json = subprocess.run('yt-dlp -s -j %s' %
                                            (video,), shell=True, capture_output=True)
                video_info = json.loads(video_json.stdout)
                if video_info['duration'] < 600:
                    # print('lien:', l)
                    c = c.replace(l, '')
                    subprocess.run('rm -f out.*; yt-dlp -N 8 -o out.mp4 --recode-video mp4 --no-playlist --max-filesize 100M %s' %
                                   (video,), shell=True, capture_output=False)
                    # print("received")
                    try:
                        file = open("out.mp4", "rb")
                        video_data = file.read()
                        file.close()
                        media_posted = mastodon_api.media_post(video_data, mime_type='video/mp4')
                        c = c.replace(video, '')
                        # print("posted")
                        toot_media.append(media_posted['id'])
                        os.remove("out.mp4")
                    except:
                        # download or upload failed: post without the video
                        pass
                else:
                    print("video duration > 600s : ", video_info['duration'])

        # remove pic.twitter.com links
        m = re.search(r"pic.twitter.com[^ \xa0]*", c)
        if m is not None:
            l = m.group(0)
            c = c.replace(l, ' ')

        # remove ellipsis
        c = c.replace('\xa0', ' ')

        #c = c.replace('  ', '\n').replace('. ', '.\n')

        # replace links to twitter by nitter ones
        c = c.replace('/twitter.com/', '/nitter.net/')
        # replace utm_? tracking
        c = re.sub('\?utm.*$', '?utm_medium=Social&utm_source=Mastodon', c)

        if tags:
            c = c + '\n' + tags

        try:
            # pause before posting with media, to let uploads settle
            if len(toot_media) > 0:
                time.sleep(5)
            toot = mastodon_api.status_post(c,
                                            in_reply_to_id=in_reply_to,
                                            media_ids=toot_media,
                                            sensitive=False,
                                            visibility='unlisted',
                                            spoiler_text=None)
        except:
            # probably rate-limited: wait and retry once
            print("delay")
            time.sleep(30)
            toot = mastodon_api.status_post(c,
                                            in_reply_to_id=in_reply_to,
                                            media_ids=toot_media,
                                            sensitive=False,
                                            visibility='unlisted',
                                            spoiler_text=None)
            pass
        #break
        # remember the posted toot; the stored key is
        # "<status id> <conversation id>" so threads can be looked up later
        if "id" in toot:
            db.execute("INSERT INTO tweets VALUES ( ? , ? , ? , ? , ? )", (id+' '+t['conversation_id'], toot["id"], source, mastodon, instance))
            sql.commit()
            print(source, ": tweet created at", t['created_at'])

print("---------------------------")
print()