Random selection of nitter mirror to use

This commit is contained in:
jeancf 2021-03-02 22:08:52 +01:00
parent 8e4f13c26a
commit 807dad3480

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Copyright (C) 2020 Jean-Christophe Francois Copyright (C) 2020-2021 Jean-Christophe Francois
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -34,14 +34,25 @@ from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalA
import subprocess import subprocess
import shutil import shutil
NITTER_URL = 'https://nitter.42l.fr' NITTER_URLS = [
'https://nitter.42l.fr',
'https://nitter.pussthecat.org/',
'https://nitter.mastodont.cat',
'https://nitter.tedomum.net',
'https://nitter.fdn.fr/',
'https://nitter.unixfox.eu',
'https://nitter.eu',
'https://nitter.namazso.eu',
'https://nitter.mailstation.de',
'https://nitter.cattube.org'
]
# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/ # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
USER_AGENTS = [ USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Edg/88.0.705.81',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Vivaldi/3.6',
] ]
@ -77,7 +88,7 @@ def process_media_body(tt_iter):
return tweet_text return tweet_text
def process_card(card_container): def process_card(nitter_url, card_container):
""" """
Extract image from card in case mastodon does not do it Extract image from card in case mastodon does not do it
:param card_container: soup of 'a' tag containing card markup :param card_container: soup of 'a' tag containing card markup
@ -87,18 +98,19 @@ def process_card(card_container):
img = card_container.div.div.img img = card_container.div.div.img
if img is not None: if img is not None:
image_url = NITTER_URL + img.get('src') image_url = nitter_url + img.get('src')
list.append(image_url) list.append(image_url)
logging.debug('Extracted image from card') logging.debug('Extracted image from card')
return list return list
def process_attachments(attachments_container, get_vids, twit_account, status_id, author_account): def process_attachments(nitter_url, attachments_container, get_vids, twit_account, status_id, author_account):
""" """
Extract images or video from attachments. Videos are downloaded on the file system. Extract images or video from attachments. Videos are downloaded on the file system.
:param card_container: soup of 'div' tag containing attachments markup :param nitter_url: url of nitter mirror
:param get_vids: whether to download vids or not :param attachments_container: soup of 'div' tag containing attachments markup
:param get_vids: whether to download videos or not
:param twit_account: name of twitter account :param twit_account: name of twitter account
:param status_id: id of tweet being processed :param status_id: id of tweet being processed
:param author_account: author of tweet with video attachment :param author_account: author of tweet with video attachment
@ -108,14 +120,14 @@ def process_attachments(attachments_container, get_vids, twit_account, status_id
pics = [] pics = []
images = attachments_container.find_all('a', class_='still-image') images = attachments_container.find_all('a', class_='still-image')
for image in images: for image in images:
pics.append(NITTER_URL + image.get('href')) pics.append(nitter_url + image.get('href'))
logging.debug('collected ' + str(len(pics)) + ' images from attachments') logging.debug('collected ' + str(len(pics)) + ' images from attachments')
# Download nitter video (converted animated GIF) # Download nitter video (converted animated GIF)
gif_class = attachments_container.find('video', class_='gif') gif_class = attachments_container.find('video', class_='gif')
if gif_class is not None: if gif_class is not None:
gif_video_file = NITTER_URL + gif_class.source.get('src') gif_video_file = nitter_url + gif_class.source.get('src')
video_path = os.path.join('output', twit_account, status_id, author_account, status_id) video_path = os.path.join('output', twit_account, status_id, author_account, status_id)
os.makedirs(video_path, exist_ok=True) os.makedirs(video_path, exist_ok=True)
@ -224,7 +236,7 @@ def main(argv):
# pass # pass
# Setup logging to file # Setup logging to file
logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG) logging.basicConfig(filename=twit_account + '.log', level=logging.WARNING)
logging.info('Running with the following parameters:') logging.info('Running with the following parameters:')
logging.info(' -t ' + twit_account) logging.info(' -t ' + twit_account)
logging.info(' -i ' + mast_instance) logging.info(' -i ' + mast_instance)
@ -240,6 +252,9 @@ def main(argv):
db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT, db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''') mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
# Select random nitter instance to fetch updates from
nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS)-1)]
# ********************************************************** # **********************************************************
# Load twitter page of user. Process all tweets and generate # Load twitter page of user. Process all tweets and generate
# list of dictionaries ready to be posted on Mastodon # list of dictionaries ready to be posted on Mastodon
@ -261,7 +276,7 @@ def main(argv):
} }
) )
url = NITTER_URL + '/' + twit_account url = nitter_url + '/' + twit_account
# Use different page if we need to handle replies # Use different page if we need to handle replies
if tweets_and_replies: if tweets_and_replies:
url += '/with_replies' url += '/with_replies'
@ -286,7 +301,7 @@ def main(argv):
# Replace twit_account with version with correct capitalization # Replace twit_account with version with correct capitalization
ta = soup.find('meta', property='og:title').get('content') ta = soup.find('meta', property='og:title').get('content')
ta_match = re.search('\(@(.+)\)', ta) ta_match = re.search(r'\(@(.+)\)', ta)
if ta_match is not None: if ta_match is not None:
twit_account = ta_match.group(1) twit_account = ta_match.group(1)
@ -367,12 +382,12 @@ def main(argv):
# Process card : extract image if necessary # Process card : extract image if necessary
card_class = status.find('a', class_='card-container') card_class = status.find('a', class_='card-container')
if card_class is not None: if card_class is not None:
photos.extend(process_card(card_class)) photos.extend(process_card(nitter_url, card_class))
# Process attachment: capture image or .mp4 url or download twitter video # Process attachment: capture image or .mp4 url or download twitter video
attachments_class = status.find('div', class_='attachments') attachments_class = status.find('div', class_='attachments')
if attachments_class is not None: if attachments_class is not None:
pics, vid_in_tweet = process_attachments(attachments_class, get_vids, twit_account, status_id, author_account) pics, vid_in_tweet = process_attachments(nitter_url, attachments_class, get_vids, twit_account, status_id, author_account)
photos.extend(pics) photos.extend(pics)
if vid_in_tweet: if vid_in_tweet:
tweet_text += '\n\n[Video embedded in original tweet]' tweet_text += '\n\n[Video embedded in original tweet]'
@ -412,7 +427,7 @@ def main(argv):
if video_path.exists(): if video_path.exists():
# Take the first subdirectory of video path (named after original poster of video) # Take the first subdirectory of video path (named after original poster of video)
video_path = [p for p in video_path.iterdir() if p.is_dir()][0] video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
# Take again the first subdirectory of video path (named after status id of original post where vidoe is attached) # Take again the first subdirectory of video path (named after status id of original post where video is attached)
video_path = [p for p in video_path.iterdir() if p.is_dir()][0] video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
# list video files # list video files
video_file_list = list(video_path.glob('*.mp4')) video_file_list = list(video_path.glob('*.mp4'))
@ -434,7 +449,7 @@ def main(argv):
logging.debug('Tweet %s added to list of toots to upload', tweet_id) logging.debug('Tweet %s added to list of toots to upload', tweet_id)
# TODO Log summary stats: how many not in db, how many in valid timeframe # Log summary stats
logging.info(str(out_date_cnt) + ' tweets outside of valid time range') logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
logging.info(str(in_db_cnt) + ' tweets already in database') logging.info(str(in_db_cnt) + ' tweets already in database')