Random selection of nitter mirror to use

jeancf 2021-03-02 22:08:52 +01:00
parent 8e4f13c26a
commit 807dad3480


@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
"""
-Copyright (C) 2020 Jean-Christophe Francois
+Copyright (C) 2020-2021 Jean-Christophe Francois
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -34,14 +34,25 @@ from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalA
import subprocess
import shutil
-NITTER_URL = 'https://nitter.42l.fr'
+NITTER_URLS = [
+'https://nitter.42l.fr',
+'https://nitter.pussthecat.org/',
+'https://nitter.mastodont.cat',
+'https://nitter.tedomum.net',
+'https://nitter.fdn.fr/',
+'https://nitter.unixfox.eu',
+'https://nitter.eu',
+'https://nitter.namazso.eu',
+'https://nitter.mailstation.de',
+'https://nitter.cattube.org'
+]
# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
USER_AGENTS = [
-'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
-'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
-'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
-'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
+'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
+'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
+'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Edg/88.0.705.81',
+'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Vivaldi/3.6',
]
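Illustrative side note, not part of the diff: the script appears to send one of these user agents on a requests session (the headers-update call closed by the '}' and ')' lines further down). A minimal sketch of picking one at random per run, assuming such a session:

    import random
    import requests

    # Sketch only: attach one randomly chosen user agent to the session
    # used for all requests to the nitter mirror.
    session = requests.Session()
    session.headers.update({'User-Agent': random.choice(USER_AGENTS)})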
@@ -77,7 +88,7 @@ def process_media_body(tt_iter):
return tweet_text
-def process_card(card_container):
+def process_card(nitter_url, card_container):
"""
Extract image from card in case mastodon does not do it
:param card_container: soup of 'a' tag containing card markup
@@ -87,18 +98,19 @@ def process_card(card_container):
img = card_container.div.div.img
if img is not None:
-image_url = NITTER_URL + img.get('src')
+image_url = nitter_url + img.get('src')
list.append(image_url)
logging.debug('Extracted image from card')
return list
-def process_attachments(attachments_container, get_vids, twit_account, status_id, author_account):
+def process_attachments(nitter_url, attachments_container, get_vids, twit_account, status_id, author_account):
"""
Extract images or video from attachments. Videos are downloaded on the file system.
-:param card_container: soup of 'div' tag containing attachments markup
-:param get_vids: whether to download vids or not
+:param nitter_url: url of nitter mirror
+:param attachments_container: soup of 'div' tag containing attachments markup
+:param get_vids: whether to download videos or not
:param twit_account: name of twitter account
:param status_id: id of tweet being processed
:param author_account: author of tweet with video attachment
@@ -108,14 +120,14 @@ def process_attachments(attachments_container, get_vids, twit_account, status_id
pics = []
images = attachments_container.find_all('a', class_='still-image')
for image in images:
-pics.append(NITTER_URL + image.get('href'))
+pics.append(nitter_url + image.get('href'))
logging.debug('collected ' + str(len(pics)) + ' images from attachments')
# Download nitter video (converted animated GIF)
gif_class = attachments_container.find('video', class_='gif')
if gif_class is not None:
-gif_video_file = NITTER_URL + gif_class.source.get('src')
+gif_video_file = nitter_url + gif_class.source.get('src')
video_path = os.path.join('output', twit_account, status_id, author_account, status_id)
os.makedirs(video_path, exist_ok=True)
@@ -224,7 +236,7 @@ def main(argv):
# pass
# Setup logging to file
-logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG)
+logging.basicConfig(filename=twit_account + '.log', level=logging.WARNING)
logging.info('Running with the following parameters:')
logging.info(' -t ' + twit_account)
logging.info(' -i ' + mast_instance)
@@ -240,6 +252,9 @@
db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
+# Select random nitter instance to fetch updates from
+nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS)-1)]
# **********************************************************
# Load twitter page of user. Process all tweets and generate
# list of dictionaries ready to be posted on Mastodon
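Aside, not part of the commit: indexing with random.randint as above behaves the same as the shorter random.choice from the standard library:

    import random

    # Same behaviour as NITTER_URLS[random.randint(0, len(NITTER_URLS)-1)]
    nitter_url = random.choice(NITTER_URLS)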
@@ -261,7 +276,7 @@
}
)
-url = NITTER_URL + '/' + twit_account
+url = nitter_url + '/' + twit_account
# Use different page if we need to handle replies
if tweets_and_replies:
url += '/with_replies'
@@ -286,7 +301,7 @@
# Replace twit_account with version with correct capitalization
ta = soup.find('meta', property='og:title').get('content')
-ta_match = re.search('\(@(.+)\)', ta)
+ta_match = re.search(r'\(@(.+)\)', ta)
if ta_match is not None:
twit_account = ta_match.group(1)
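For reference, a quick check of the raw-string pattern introduced above; the sample og:title value is made up:

    import re

    ta = 'Jean Francois (@jeancf)'          # hypothetical og:title content
    ta_match = re.search(r'\(@(.+)\)', ta)
    if ta_match is not None:
        print(ta_match.group(1))            # prints: jeancf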
@@ -367,12 +382,12 @@ def main(argv):
# Process card : extract image if necessary
card_class = status.find('a', class_='card-container')
if card_class is not None:
-photos.extend(process_card(card_class))
+photos.extend(process_card(nitter_url, card_class))
# Process attachment: capture image or .mp4 url or download twitter video
attachments_class = status.find('div', class_='attachments')
if attachments_class is not None:
-pics, vid_in_tweet = process_attachments(attachments_class, get_vids, twit_account, status_id, author_account)
+pics, vid_in_tweet = process_attachments(nitter_url, attachments_class, get_vids, twit_account, status_id, author_account)
photos.extend(pics)
if vid_in_tweet:
tweet_text += '\n\n[Video embedded in original tweet]'
@@ -412,7 +427,7 @@
if video_path.exists():
# Take the first subdirectory of video path (named after original poster of video)
video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
-# Take again the first subdirectory of video path (named after status id of original post where vidoe is attached)
+# Take again the first subdirectory of video path (named after status id of original post where video is attached)
video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
# list video files
video_file_list = list(video_path.glob('*.mp4'))
@@ -434,7 +449,7 @@
logging.debug('Tweet %s added to list of toots to upload', tweet_id)
# TODO Log summary stats: how many not in db, how many in valid timeframe
# Log summary stats
logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
logging.info(str(in_db_cnt) + ' tweets already in database')