mirror of
https://gitlab.com/jeancf/twoot.git
synced 2025-03-27 07:56:48 +00:00
Random selection of nitter mirror to use
This commit is contained in:
parent
8e4f13c26a
commit
807dad3480
55
twoot.py
55
twoot.py
@ -2,7 +2,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Copyright (C) 2020 Jean-Christophe Francois
|
Copyright (C) 2020-2021 Jean-Christophe Francois
|
||||||
|
|
||||||
This program is free software: you can redistribute it and/or modify
|
This program is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -34,14 +34,25 @@ from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalA
|
|||||||
import subprocess
|
import subprocess
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
NITTER_URL = 'https://nitter.42l.fr'
|
NITTER_URLS = [
|
||||||
|
'https://nitter.42l.fr',
|
||||||
|
'https://nitter.pussthecat.org/',
|
||||||
|
'https://nitter.mastodont.cat',
|
||||||
|
'https://nitter.tedomum.net',
|
||||||
|
'https://nitter.fdn.fr/',
|
||||||
|
'https://nitter.unixfox.eu',
|
||||||
|
'https://nitter.eu',
|
||||||
|
'https://nitter.namazso.eu',
|
||||||
|
'https://nitter.mailstation.de',
|
||||||
|
'https://nitter.cattube.org'
|
||||||
|
]
|
||||||
|
|
||||||
# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
|
# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
|
||||||
USER_AGENTS = [
|
USER_AGENTS = [
|
||||||
'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
|
||||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Edg/88.0.705.81',
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
|
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Vivaldi/3.6',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@ -77,7 +88,7 @@ def process_media_body(tt_iter):
|
|||||||
return tweet_text
|
return tweet_text
|
||||||
|
|
||||||
|
|
||||||
def process_card(card_container):
|
def process_card(nitter_url, card_container):
|
||||||
"""
|
"""
|
||||||
Extract image from card in case mastodon does not do it
|
Extract image from card in case mastodon does not do it
|
||||||
:param card_container: soup of 'a' tag containing card markup
|
:param card_container: soup of 'a' tag containing card markup
|
||||||
@ -87,18 +98,19 @@ def process_card(card_container):
|
|||||||
|
|
||||||
img = card_container.div.div.img
|
img = card_container.div.div.img
|
||||||
if img is not None:
|
if img is not None:
|
||||||
image_url = NITTER_URL + img.get('src')
|
image_url = nitter_url + img.get('src')
|
||||||
list.append(image_url)
|
list.append(image_url)
|
||||||
logging.debug('Extracted image from card')
|
logging.debug('Extracted image from card')
|
||||||
|
|
||||||
return list
|
return list
|
||||||
|
|
||||||
|
|
||||||
def process_attachments(attachments_container, get_vids, twit_account, status_id, author_account):
|
def process_attachments(nitter_url, attachments_container, get_vids, twit_account, status_id, author_account):
|
||||||
"""
|
"""
|
||||||
Extract images or video from attachments. Videos are downloaded on the file system.
|
Extract images or video from attachments. Videos are downloaded on the file system.
|
||||||
:param card_container: soup of 'div' tag containing attachments markup
|
:param nitter_url: url of nitter mirror
|
||||||
:param get_vids: whether to download vids or not
|
:param attachments_container: soup of 'div' tag containing attachments markup
|
||||||
|
:param get_vids: whether to download videos or not
|
||||||
:param twit_account: name of twitter account
|
:param twit_account: name of twitter account
|
||||||
:param status_id: id of tweet being processed
|
:param status_id: id of tweet being processed
|
||||||
:param author_account: author of tweet with video attachment
|
:param author_account: author of tweet with video attachment
|
||||||
@ -108,14 +120,14 @@ def process_attachments(attachments_container, get_vids, twit_account, status_id
|
|||||||
pics = []
|
pics = []
|
||||||
images = attachments_container.find_all('a', class_='still-image')
|
images = attachments_container.find_all('a', class_='still-image')
|
||||||
for image in images:
|
for image in images:
|
||||||
pics.append(NITTER_URL + image.get('href'))
|
pics.append(nitter_url + image.get('href'))
|
||||||
|
|
||||||
logging.debug('collected ' + str(len(pics)) + ' images from attachments')
|
logging.debug('collected ' + str(len(pics)) + ' images from attachments')
|
||||||
|
|
||||||
# Download nitter video (converted animated GIF)
|
# Download nitter video (converted animated GIF)
|
||||||
gif_class = attachments_container.find('video', class_='gif')
|
gif_class = attachments_container.find('video', class_='gif')
|
||||||
if gif_class is not None:
|
if gif_class is not None:
|
||||||
gif_video_file = NITTER_URL + gif_class.source.get('src')
|
gif_video_file = nitter_url + gif_class.source.get('src')
|
||||||
|
|
||||||
video_path = os.path.join('output', twit_account, status_id, author_account, status_id)
|
video_path = os.path.join('output', twit_account, status_id, author_account, status_id)
|
||||||
os.makedirs(video_path, exist_ok=True)
|
os.makedirs(video_path, exist_ok=True)
|
||||||
@ -224,7 +236,7 @@ def main(argv):
|
|||||||
# pass
|
# pass
|
||||||
|
|
||||||
# Setup logging to file
|
# Setup logging to file
|
||||||
logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG)
|
logging.basicConfig(filename=twit_account + '.log', level=logging.WARNING)
|
||||||
logging.info('Running with the following parameters:')
|
logging.info('Running with the following parameters:')
|
||||||
logging.info(' -t ' + twit_account)
|
logging.info(' -t ' + twit_account)
|
||||||
logging.info(' -i ' + mast_instance)
|
logging.info(' -i ' + mast_instance)
|
||||||
@ -240,6 +252,9 @@ def main(argv):
|
|||||||
db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
|
db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
|
||||||
mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
|
mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
|
||||||
|
|
||||||
|
# Select random nitter instance to fetch updates from
|
||||||
|
nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS)-1)]
|
||||||
|
|
||||||
# **********************************************************
|
# **********************************************************
|
||||||
# Load twitter page of user. Process all tweets and generate
|
# Load twitter page of user. Process all tweets and generate
|
||||||
# list of dictionaries ready to be posted on Mastodon
|
# list of dictionaries ready to be posted on Mastodon
|
||||||
@ -261,7 +276,7 @@ def main(argv):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
url = NITTER_URL + '/' + twit_account
|
url = nitter_url + '/' + twit_account
|
||||||
# Use different page if we need to handle replies
|
# Use different page if we need to handle replies
|
||||||
if tweets_and_replies:
|
if tweets_and_replies:
|
||||||
url += '/with_replies'
|
url += '/with_replies'
|
||||||
@ -286,7 +301,7 @@ def main(argv):
|
|||||||
|
|
||||||
# Replace twit_account with version with correct capitalization
|
# Replace twit_account with version with correct capitalization
|
||||||
ta = soup.find('meta', property='og:title').get('content')
|
ta = soup.find('meta', property='og:title').get('content')
|
||||||
ta_match = re.search('\(@(.+)\)', ta)
|
ta_match = re.search(r'\(@(.+)\)', ta)
|
||||||
if ta_match is not None:
|
if ta_match is not None:
|
||||||
twit_account = ta_match.group(1)
|
twit_account = ta_match.group(1)
|
||||||
|
|
||||||
@ -367,12 +382,12 @@ def main(argv):
|
|||||||
# Process card : extract image if necessary
|
# Process card : extract image if necessary
|
||||||
card_class = status.find('a', class_='card-container')
|
card_class = status.find('a', class_='card-container')
|
||||||
if card_class is not None:
|
if card_class is not None:
|
||||||
photos.extend(process_card(card_class))
|
photos.extend(process_card(nitter_url, card_class))
|
||||||
|
|
||||||
# Process attachment: capture image or .mp4 url or download twitter video
|
# Process attachment: capture image or .mp4 url or download twitter video
|
||||||
attachments_class = status.find('div', class_='attachments')
|
attachments_class = status.find('div', class_='attachments')
|
||||||
if attachments_class is not None:
|
if attachments_class is not None:
|
||||||
pics, vid_in_tweet = process_attachments(attachments_class, get_vids, twit_account, status_id, author_account)
|
pics, vid_in_tweet = process_attachments(nitter_url, attachments_class, get_vids, twit_account, status_id, author_account)
|
||||||
photos.extend(pics)
|
photos.extend(pics)
|
||||||
if vid_in_tweet:
|
if vid_in_tweet:
|
||||||
tweet_text += '\n\n[Video embedded in original tweet]'
|
tweet_text += '\n\n[Video embedded in original tweet]'
|
||||||
@ -412,7 +427,7 @@ def main(argv):
|
|||||||
if video_path.exists():
|
if video_path.exists():
|
||||||
# Take the first subdirectory of video path (named after original poster of video)
|
# Take the first subdirectory of video path (named after original poster of video)
|
||||||
video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
|
video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
|
||||||
# Take again the first subdirectory of video path (named after status id of original post where vidoe is attached)
|
# Take again the first subdirectory of video path (named after status id of original post where video is attached)
|
||||||
video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
|
video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
|
||||||
# list video files
|
# list video files
|
||||||
video_file_list = list(video_path.glob('*.mp4'))
|
video_file_list = list(video_path.glob('*.mp4'))
|
||||||
@ -434,7 +449,7 @@ def main(argv):
|
|||||||
|
|
||||||
logging.debug('Tweet %s added to list of toots to upload', tweet_id)
|
logging.debug('Tweet %s added to list of toots to upload', tweet_id)
|
||||||
|
|
||||||
# TODO Log summary stats: how many not in db, how many in valid timeframe
|
# Log summary stats
|
||||||
logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
|
logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
|
||||||
logging.info(str(in_db_cnt) + ' tweets already in database')
|
logging.info(str(in_db_cnt) + ' tweets already in database')
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user