Random selection of nitter mirror to use

2025-04-16 09:37:38 +00:00 · 2021-03-02 22:08:52 +01:00 · 2021-03-02 22:08:52 +01:00 · 807dad3480
commit 807dad3480
parent 8e4f13c26a
1 changed file with 35 additions and 20 deletions
--- a/twoot.py
+++ b/twoot.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 """
-    Copyright (C) 2020  Jean-Christophe Francois
+    Copyright (C) 2020-2021  Jean-Christophe Francois
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -34,14 +34,25 @@ from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalA
 import subprocess
 import shutil
-NITTER_URL = 'https://nitter.42l.fr'
+NITTER_URLS = [
    'https://nitter.42l.fr',
    'https://nitter.pussthecat.org/',
    'https://nitter.mastodont.cat',
    'https://nitter.tedomum.net',
    'https://nitter.fdn.fr/',
    'https://nitter.unixfox.eu',
    'https://nitter.eu',
    'https://nitter.namazso.eu',
    'https://nitter.mailstation.de',
    'https://nitter.cattube.org'
    ]
 # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
 USER_AGENTS = [
-    'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Edg/88.0.705.81',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
+    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Vivaldi/3.6',
    ]
@@ -77,7 +88,7 @@ def process_media_body(tt_iter):
    return tweet_text
-def process_card(card_container):
+def process_card(nitter_url, card_container):
    """
    Extract image from card in case mastodon does not do it
    :param card_container: soup of 'a' tag containing card markup
@@ -87,18 +98,19 @@ def process_card(card_container):
    img = card_container.div.div.img
    if img is not None:
-        image_url = NITTER_URL + img.get('src')
+        image_url = nitter_url + img.get('src')
        list.append(image_url)
        logging.debug('Extracted image from card')
    return list
-def process_attachments(attachments_container, get_vids, twit_account, status_id, author_account):
+def process_attachments(nitter_url, attachments_container, get_vids, twit_account, status_id, author_account):
    """
    Extract images or video from attachments. Videos are downloaded on the file system.
-    :param card_container: soup of 'div' tag containing attachments markup
+    :param nitter_url: url of nitter mirror
-    :param get_vids: whether to download vids or not
+    :param attachments_container: soup of 'div' tag containing attachments markup
    :param get_vids: whether to download videos or not
    :param twit_account: name of twitter account
    :param status_id: id of tweet being processed
    :param author_account: author of tweet with video attachment
@@ -108,14 +120,14 @@ def process_attachments(attachments_container, get_vids, twit_account, status_id
    pics = []
    images = attachments_container.find_all('a', class_='still-image')
    for image in images:
-        pics.append(NITTER_URL + image.get('href'))
+        pics.append(nitter_url + image.get('href'))
    logging.debug('collected ' + str(len(pics)) + ' images from attachments')
    # Download nitter video (converted animated GIF)
    gif_class = attachments_container.find('video', class_='gif')
    if gif_class is not None:
-        gif_video_file = NITTER_URL + gif_class.source.get('src')
+        gif_video_file = nitter_url + gif_class.source.get('src')
        video_path = os.path.join('output', twit_account, status_id, author_account, status_id)
        os.makedirs(video_path, exist_ok=True)
@@ -224,7 +236,7 @@ def main(argv):
    #    pass
    # Setup logging to file
-    logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG)
+    logging.basicConfig(filename=twit_account + '.log', level=logging.WARNING)
    logging.info('Running with the following parameters:')
    logging.info('    -t ' + twit_account)
    logging.info('    -i ' + mast_instance)
@@ -240,6 +252,9 @@ def main(argv):
    db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
               mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
    # Select random nitter instance to fetch updates from
    nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS)-1)]
    # **********************************************************
    # Load twitter page of user. Process all tweets and generate
    # list of dictionaries ready to be posted on Mastodon
@@ -261,7 +276,7 @@ def main(argv):
        }
    )
-    url = NITTER_URL + '/' + twit_account
+    url = nitter_url + '/' + twit_account
    # Use different page if we need to handle replies
    if tweets_and_replies:
        url += '/with_replies'
@@ -286,7 +301,7 @@ def main(argv):
    # Replace twit_account with version with correct capitalization
    ta = soup.find('meta', property='og:title').get('content')
-    ta_match = re.search('\(@(.+)\)', ta)
+    ta_match = re.search(r'\(@(.+)\)', ta)
    if ta_match is not None:
        twit_account = ta_match.group(1)
@@ -367,12 +382,12 @@ def main(argv):
        # Process card : extract image if necessary
        card_class = status.find('a', class_='card-container')
        if card_class is not None:
-            photos.extend(process_card(card_class))
+            photos.extend(process_card(nitter_url, card_class))
        # Process attachment: capture image or .mp4 url or download twitter video
        attachments_class = status.find('div', class_='attachments')
        if attachments_class is not None:
-            pics, vid_in_tweet = process_attachments(attachments_class, get_vids, twit_account, status_id, author_account)
+            pics, vid_in_tweet = process_attachments(nitter_url, attachments_class, get_vids, twit_account, status_id, author_account)
            photos.extend(pics)
            if vid_in_tweet:
                tweet_text += '\n\n[Video embedded in original tweet]'
@@ -412,7 +427,7 @@ def main(argv):
        if video_path.exists():
            # Take the first subdirectory of video path (named after original poster of video)
            video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
-            # Take again the first subdirectory of video path (named after status id of original post where vidoe is attached)
+            # Take again the first subdirectory of video path (named after status id of original post where video is attached)
            video_path = [p for p in video_path.iterdir() if p.is_dir()][0]
            # list video files
            video_file_list = list(video_path.glob('*.mp4'))
@@ -434,7 +449,7 @@ def main(argv):
        logging.debug('Tweet %s added to list of toots to upload', tweet_id)
-    # TODO  Log summary stats: how many not in db, how many in valid timeframe
+    # Log summary stats
    logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
    logging.info(str(in_db_cnt) + ' tweets already in database')