From c8d4b94354ce1070ff071c2439622223c4ecddfc Mon Sep 17 00:00:00 2001
From: mathdatech
Date: Tue, 15 Nov 2022 21:10:53 +0100
Subject: [PATCH] Fork of twoot + add requirements.txt + add a special function to clean URL from UTM tracking tags

---
 CHANGELOG.md     |  45 +++
 LICENSE          |  11 +
 README.md        | 112 +++++++-
 requirements.txt |   4 +
 twoot.py         | 711 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 882 insertions(+), 1 deletion(-)
 create mode 100644 CHANGELOG.md
 create mode 100644 LICENSE
 create mode 100644 requirements.txt
 create mode 100755 twoot.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..3e3acee
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,45 @@
+**15 NOV 2022** VERSION 2.3 Added a command-line option (`-s`) to
+skip retweets. With this option, retweets are ignored and not posted
+on Mastodon.
+
+**12 NOV 2022** VERSION 2.2 Retired the custom video download code and
+replaced it with the youtube-dl module, which provides a more robust and
+well-maintained solution.
+
+> If you have been using twoot to download videos, you no longer
+> need the python modules `m3u8` and `ffmpeg-python`, but you need to
+> install the python module `youtube-dl2`.
+
+**08 OCT 2022** VERSION 2.1 Added a database cleanup that deletes the
+oldest toots from the database at each run. It keeps MAX_REC_COUNT rows
+(50 by default) in the database for each twitter feed.
+
+**14 SEP 2022** Added information about the status of throttling
+applied by the Mastodon instance to the debug log. The logging level can
+be changed by modifying the LOGGING_LEVEL variable at the top of the
+`twoot.py` file.
+
+**22 AUG 2022** Fixed a bug that would incorrectly mark a new tweet
+as a "reply to" if it quoted a tweet that is itself a reply-to.
+
+**01 JUN 2021** Added a command-line argument (`-c`) to limit the
+number of toots posted on the mastodon account.
+
+**19 DEC 2020** VERSION 2.0 Twitter's *no-javascript* version
+has been retired. Twoot has been rewritten to get content from
+[nitter.net](https://nitter.net), a javascript-free mirror of twitter,
+or one of its mirrors. As a bonus (or a curse?) twoot now also
+supports animated GIFs.
+
+**05 APR 2020** VERSION 1.0 Twoot can now optionally download
+videos from Twitter and upload them to Mastodon.
+
+**17 MAR 2020** Added a command-line switch (`-r`) to also post
+reply-to tweets on the mastodon account. They are no longer included
+by default.
+
+**06 MAR 2020** Added functionality to automatically get images
+from tweets marked as "sensitive content".
+
+**15 FEB 2020** Twoot has been rewritten to make use of the
+mobile twitter page without JavaScript after the breaking change
+of the previous week.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..eea5e94
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,11 @@
+Copyright (C) 2019-2021 Jean-Christophe Francois
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
diff --git a/README.md b/README.md
index 58a773e..c7bbb67 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,112 @@
-# mtwoot
+# Twoot
+
+Twoot is a Python script that extracts tweets from a twitter feed and
+reposts them as toots on a Mastodon account.
+
+**UPDATE 15 NOV 2022** VERSION 2.3 Added a command-line option (`-s`) to
+skip retweets. With this option, retweets are ignored and not posted
+on Mastodon.
+
+> Previous updates can be found in the CHANGELOG.
+
+## Features
+
+* Fetch the timeline of a given user from twitter.com (through a nitter instance)
+* Scrape the html and format tweets for posting on mastodon
+* Emojis supported
+* Upload images from the tweet to Mastodon
+* Optionally upload videos from the tweet to Mastodon
+* Specify the maximum age of tweets to be considered
+* Specify a minimum delay before considering a tweet for upload
+* Remember tweets already tooted to prevent double posting
+* Optionally post reply-to tweets on the mastodon account
+* Optionally ignore retweets
+* Allows rate-limiting posts to the Mastodon instance
+
+## Usage
+
+```
+twoot.py [-h] -t <twitter account> -i <mastodon instance> -m <mastodon account>
+         -p <mastodon password> [-r] [-s] [-v] [-a <max age in days>]
+         [-d <min delay in mins>] [-c <max number of toots to post>]
+```
+
+## Arguments
+
+Assuming that the Twitter handle is @SuperDuperBot and the Mastodon account
+is @superduperbot@botsin.space:
+
+|Switch |Description                                       | Example            | Req |
+|-------|--------------------------------------------------|--------------------|-----|
+| -t    | Twitter account name without '@'                 | `SuperDuperBot`    | Yes |
+| -i    | Mastodon instance domain name                    | `botsin.space`     | Yes |
+| -m    | Mastodon username                                | `superduperbot`    | Yes |
+| -p    | Mastodon password                                | `my_Sup3r-S4f3*pw` | Yes |
+| -v    | Upload videos to Mastodon                        | *N/A*              | No  |
+| -r    | Post reply-to tweets (ignored by default)        | *N/A*              | No  |
+| -s    | Skip retweets (posted by default)                | *N/A*              | No  |
+| -a    | Max. age of tweets to post (in days)             | `5`                | No  |
+| -d    | Min. age before posting a new tweet (in minutes) | `15`               | No  |
+| -c    | Max. number of toots allowed to post (cap)       | `1`                | No  |
+
+When using the `-v` switch consider:
+
+* whether the copyright of the content that you want to cross-post allows it
+* the storage / transfer limitations of the Mastodon instance that you are posting to
+* the upstream bandwidth that you may consume on your internet connection
+
+The default max age is 1 day. Decimal values are OK.
+
+The default min delay is 0 minutes.
+
+No limit is applied to the number of toots uploaded if `-c` is not specified.
+
+## Installation
+
+Make sure python3 is installed.
+
+Twoot depends on the `requests`, `beautifulsoup4` and `Mastodon.py` python modules.
+
+```sh
+pip install -r requirements.txt
+```
+
+**Only if you plan to download videos** with the `-v` switch are additional
+dependencies required:
+
+* Python module `youtube-dl2`
+* [ffmpeg](https://ffmpeg.org/download.html) (installed with the package manager of your distribution)
+
+```sh
+pip install requests beautifulsoup4 Mastodon.py youtube-dl2
+```
+
+In your user folder, execute `git clone https://gitlab.com/jeancf/twoot.git`
+to clone the repo with the twoot.py script.
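+
+For a first manual run, you can invoke the script directly. The values below
+are the example values from the table above; substitute your own:
+
+```sh
+./twoot.py -t SuperDuperBot -i botsin.space -m superduperbot -p 'my_Sup3r-S4f3*pw'
+```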
+
+Add the command line to your crontab.
+For example, to run every 15 minutes starting at minute 1 of every hour
+and process the tweets posted in the last 5 days but at least 15 minutes
+ago:
+
+```
+1-59/15 * * * * /path/to/twoot.py -t SuperDuperBot -i botsin.space -m superduperbot -p my_Sup3r-S4f3*pw -a 5 -d 15
+```
+
+## Examples
+
+Twoot is known to be used for the following feeds (oldest first):
+
+* [@internetofshit@botsin.space](https://botsin.space/@internetofshit)
+* [@hackaday@botsin.space](https://botsin.space/@hackaday)
+* [@todayilearned@botsin.space](https://botsin.space/@todayilearned)
+* [@moznews@noc.social](https://noc.social/@moznews)
+* [@hackster_io@noc.social](https://noc.social/@hackster_io)
+* [@cnxsoft@noc.social](https://noc.social/@cnxsoft)
+* [@unrealengine@noc.social](https://noc.social/@unrealengine)
+* [@phoronix@noc.social](https://noc.social/@phoronix)
+* [@uanews@fed.celp.de](https://fed.celp.de/@uanews)
+
+## Background
+
+I started twoot when [tootbot](https://github.com/cquest/tootbot)
+stopped working. Tootbot relies on RSS feeds from https://twitrss.me
+that broke when Twitter refreshed their web UI in July 2019.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..471dccd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+requests
+beautifulsoup4
+Mastodon.py
+youtube-dl2
\ No newline at end of file
diff --git a/twoot.py b/twoot.py
new file mode 100755
index 0000000..6df6202
--- /dev/null
+++ b/twoot.py
@@ -0,0 +1,711 @@
+#! /usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+    Copyright (C) 2019-2022 Jean-Christophe Francois
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+
+import argparse
+import datetime
+import logging
+import os
+import random
+import re
+import shutil
+import sqlite3
+import sys
+import time
+from pathlib import Path
+from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
+
+import requests
+from bs4 import BeautifulSoup, element
+from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError
+
+# Number of records to keep in the db table for each twitter account
+MAX_REC_COUNT = 50
+
+# Set the desired verbosity of logging
+# One of logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL
+LOGGING_LEVEL = logging.DEBUG
+
+# How many seconds to wait before giving up on a download (except video download)
+HTTPS_REQ_TIMEOUT = 10
+
+NITTER_URLS = [
+    'https://nitter.42l.fr',
+    'https://nitter.pussthecat.org',
+    'https://nitter.fdn.fr',
+    'https://nitter.eu',
+    'https://nitter.namazso.eu',
+    'https://nitter.moomoo.me',
+    'https://n.ramle.be',
+]
+
+# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6 Safari/605.1.15',
+    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Vivaldi/5.4.2753.37',
+]
+
+
+def _remove_tracker_params(query_str):
+    """
+    Private function.
+    Given a query string from a URL, strip out the known trackers
+    :param query_str: query to be cleaned
+    :return: cleaned query
+    """
+    # Available URL tracking parameters:
+    # UTM tags by Google Ads, M$ Ads, ...
+    # tag by TikTok
+    # tags by Snapchat
+    # tags by Facebook
+    params_to_remove = [
+        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
+        "mkt_tok",
+        "campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
+        "media", "interest_group_name",
+    ]
+    query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
+    query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
+    return urlencode(query_cleaned, doseq=True)
+
+
+def clean_url(dirty_url):
+    """
+    Given a URL, return it with the tracking parameters removed from query and fragment
+    :param dirty_url: url to be cleaned
+    :return: cleaned url
+    >>> clean_url('https://exemple.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
+    'https://exemple.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
+    """
+    url_parsed = urlparse(dirty_url)
+
+    cleaned_url = urlunparse([
+        url_parsed.scheme,
+        url_parsed.netloc,
+        url_parsed.path,
+        url_parsed.params,
+        _remove_tracker_params(url_parsed.query),
+        _remove_tracker_params(url_parsed.fragment)
+    ])
+
+    return cleaned_url
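+
+# Illustration (hypothetical URL) of what clean_url() strips out. With the
+# project dependencies installed, the embedded doctest above can also be run
+# with `python3 -m doctest twoot.py`:
+#
+#     clean_url('https://example.com/story?utm_source=Twitter&utm_medium=video&page=2')
+#     returns 'https://example.com/story?page=2'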
+
+
+def process_media_body(tt_iter):
+    """
+    Receives an iterator over all the elements contained in the tweet-text container.
+    Processes them to make them suitable for posting on Mastodon
+    :param tt_iter: iterator over the HTML elements in the text of the tweet
+    :return: cleaned up text of the tweet
+    """
+    tweet_text = ''
+    # Iterate over the elements
+    for tag in tt_iter:
+        # If the element is plain text, copy it verbatim
+        if isinstance(tag, element.NavigableString):
+            tweet_text += tag.string
+
+        # If it is an 'a' html tag
+        elif tag.name == 'a':
+            tag_text = tag.get_text()
+            if tag_text.startswith('@'):
+                # Only keep the user name
+                tweet_text += tag_text
+            elif tag_text.startswith('#'):
+                # Only keep the hashtag text
+                tweet_text += tag_text
+            else:
+                # This is a real link, keep the url
+                tweet_text += clean_url(tag.get('href'))
+        else:
+            logging.warning("No handler for tag in twitter text: " + tag.prettify())
+
+    return tweet_text
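+
+# Sketch (hypothetical nitter markup) of what process_media_body() produces.
+# For a tweet-text container such as:
+#
+#     <div class="tweet-content media-body">Read
+#       <a href="https://example.com/post?utm_source=Twitter">example.com/post</a>
+#       by <a href="/SuperDuperBot">@SuperDuperBot</a></div>
+#
+# iterating over its children yields approximately:
+#
+#     'Read https://example.com/post by @SuperDuperBot'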
+
+
+def process_card(nitter_url, card_container):
+    """
+    Extract an image from a card in case mastodon does not do it
+    :param nitter_url: url of the nitter mirror
+    :param card_container: soup of the 'a' tag containing the card markup
+    :return: list with the url of the image
+    """
+    image_list = []
+
+    img = card_container.div.div.img
+    if img is not None:
+        image_url = nitter_url + img.get('src')
+        image_list.append(image_url)
+        logging.debug('Extracted image from card')
+
+    return image_list
+
+
+def process_attachments(nitter_url, attachments_container, get_vids, twit_account, status_id, author_account):
+    """
+    Extract images or video from attachments. Videos are downloaded to the file system.
+    :param nitter_url: url of the nitter mirror
+    :param attachments_container: soup of the 'div' tag containing the attachments markup
+    :param get_vids: whether to download videos or not
+    :param twit_account: name of the twitter account
+    :param status_id: id of the tweet being processed
+    :param author_account: author of the tweet with the video attachment
+    :return: list of image urls and a flag indicating whether a video remains in the tweet
+    """
+    # Collect the urls of the images
+    pics = []
+    images = attachments_container.find_all('a', class_='still-image')
+    for image in images:
+        pics.append(nitter_url + image.get('href'))
+
+    logging.debug('collected ' + str(len(pics)) + ' images from attachments')
+
+    # Download the nitter video (converted animated GIF)
+    gif_class = attachments_container.find('video', class_='gif')
+    if gif_class is not None:
+        gif_video_file = nitter_url + gif_class.source.get('src')
+
+        video_path = os.path.join('output', twit_account, status_id, author_account, status_id)
+        os.makedirs(video_path, exist_ok=True)
+
+        # Change to the directory to write the file there
+        orig_dir = os.getcwd()
+        os.chdir(video_path)
+        with requests.get(gif_video_file, stream=True, timeout=HTTPS_REQ_TIMEOUT) as r:
+            try:
+                # Raise an exception if the response code is not 200
+                r.raise_for_status()
+                # Download chunks and write them to the file
+                with open('gif_video.mp4', 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=16 * 1024):
+                        f.write(chunk)
+
+                logging.debug('Downloaded video of GIF animation from attachments')
+            except (requests.exceptions.RequestException, OSError):
+                # Don't do anything if the video can't be found or downloaded
+                logging.debug('Could not download video of GIF animation from attachments')
+
+        # Change back to the original directory
+        os.chdir(orig_dir)
+
+    # Download the twitter video
+    vid_in_tweet = False
+    vid_class = attachments_container.find('div', class_='video-container')
+    if vid_class is not None:
+        if get_vids:
+            import youtube_dl
+
+            video_file = os.path.join('https://twitter.com', author_account, 'status', status_id)
+            ydl_opts = {
+                'outtmpl': "output/" + twit_account + "/" + status_id + "/%(id)s.%(ext)s",
+                'format': "best[width<=500]",
+                'socket_timeout': 60,
+                'quiet': True,
+            }
+
+            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+                try:
+                    ydl.download([video_file])
+                except Exception as e:
+                    logging.warning('Error downloading twitter video: ' + str(e))
+                    vid_in_tweet = True
+                else:
+                    logging.debug('downloaded twitter video from attachments')
+
+    return pics, vid_in_tweet
+
+
+def contains_class(body_classes, some_class):
+    """
+    :param body_classes: list of classes to search
+    :param some_class: the class that we are interested in
+    :return: True if found, False otherwise
+    """
+    found = False
+    for body_class in body_classes:
+        if body_class == some_class:
+            found = True
+
+    return found
+
+
+def is_time_valid(timestamp, max_age, min_delay):
+    ret = True
+    # Check that the tweet is not too young (it might still be deleted) or too old
+    age_in_hours = (time.time() - float(timestamp)) / 3600.0
+    min_delay_in_hours = min_delay / 60.0
+    max_age_in_hours = max_age * 24.0
+
+    if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours:
+        ret = False
+
+    return ret
+
+
+def login(instance, account, password):
+    # Create the Mastodon application if it does not exist yet
+    if not os.path.isfile(instance + '.secret'):
+        try:
+            Mastodon.create_app(
+                'twoot',
+                api_base_url='https://' + instance,
+                to_file=instance + '.secret'
+            )
+
+        except MastodonError as me:
+            logging.fatal('failed to create app on ' + instance)
+            logging.fatal(me)
+            sys.exit(-1)
+
+    # Log in to the Mastodon instance
+    try:
+        mastodon = Mastodon(
+            client_id=instance + '.secret',
+            api_base_url='https://' + instance
+        )
+
+        mastodon.log_in(
+            username=account,
+            password=password,
+            to_file=account + '.secret'
+        )
+        logging.info('Logging in to ' + instance)
+
+    except MastodonError as me:
+        logging.fatal('ERROR: Login to ' + instance + ' failed')
+        logging.fatal(me)
+        sys.exit(-1)
+
+    # Check the ratelimit status
+    logging.debug('Ratelimit allowed requests: ' + str(mastodon.ratelimit_limit))
+    logging.debug('Ratelimit remaining requests: ' + str(mastodon.ratelimit_remaining))
+    logging.debug('Ratelimit reset time: ' + time.asctime(time.localtime(mastodon.ratelimit_reset)))
+    logging.debug('Ratelimit last call: ' + time.asctime(time.localtime(mastodon.ratelimit_lastcall)))
+
+    return mastodon
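+
+# Usage sketch for login(), with the placeholder credentials from the README.
+# The first run creates 'botsin.space.secret' (app credentials) and
+# 'superduperbot.secret' (user token) in the working directory:
+#
+#     mastodon = login('botsin.space', 'superduperbot', 'my_Sup3r-S4f3*pw')
+#     mastodon.status_post('Hello from twoot', visibility='public')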
+
+
+def main(argv):
+    # Start the stopwatch
+    start_time = time.time()
+
+    # Build the parser for command line arguments
+    parser = argparse.ArgumentParser(description='toot tweets.')
+    parser.add_argument('-t', metavar='<twitter account>', action='store', required=True)
+    parser.add_argument('-i', metavar='<mastodon instance>', action='store', required=True)
+    parser.add_argument('-m', metavar='<mastodon account>', action='store', required=True)
+    parser.add_argument('-p', metavar='<mastodon password>', action='store', required=True)
+    parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
+    parser.add_argument('-s', action='store_true', help='Suppress retweets')
+    parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload them to the Mastodon instance')
+    parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float, default=1)
+    parser.add_argument('-d', metavar='<min delay (in mins)>', action='store', type=float, default=0)
+    parser.add_argument('-c', metavar='<max toots>', action='store', type=int, default=0)
+
+    # Parse the command line
+    args = vars(parser.parse_args())
+
+    twit_account = args['t']
+    mast_instance = args['i']
+    mast_account = args['m']
+    mast_password = args['p']
+    tweets_and_replies = args['r']
+    suppress_retweets = args['s']
+    get_vids = args['v']
+    max_age = float(args['a'])
+    min_delay = float(args['d'])
+    cap = int(args['c'])
+
+    # Remove the previous log file
+    # try:
+    #     os.remove(twit_account + '.log')
+    # except FileNotFoundError:
+    #     pass
+
+    # Setup logging to file
+    logging.basicConfig(
+        filename=twit_account + '.log',
+        level=LOGGING_LEVEL,
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S',
+    )
+
+    logging.info('Running with the following parameters:')
+    logging.info('    -t ' + twit_account)
+    logging.info('    -i ' + mast_instance)
+    logging.info('    -m ' + mast_account)
+    logging.info('    -r ' + str(tweets_and_replies))
+    logging.info('    -s ' + str(suppress_retweets))
+    logging.info('    -v ' + str(get_vids))
+    logging.info('    -a ' + str(max_age))
+    logging.info('    -d ' + str(min_delay))
+    logging.info('    -c ' + str(cap))
+
+    # Try to open the database. If it does not exist, create it
+    sql = sqlite3.connect('twoot.db')
+    db = sql.cursor()
+    db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
+               mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
+    db.execute('''CREATE INDEX IF NOT EXISTS main_index ON toots (twitter_account,
+               mastodon_instance, mastodon_account, tweet_id)''')
+
+    # Select a random nitter instance to fetch updates from
+    nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS) - 1)]
+
+    # **********************************************************
+    # Load the twitter page of the user. Process all tweets and
+    # generate a list of dictionaries ready to be posted on Mastodon
+    # **********************************************************
+    # To store the content of all tweets from this user
+    tweets = []
+
+    # Initiate a session
+    session = requests.Session()
+
+    # Get a copy of the default headers that requests would use
+    headers = requests.utils.default_headers()
+
+    # Update the default headers with a randomly selected user agent
+    headers.update(
+        {
+            'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)],
+            'Cookie': 'replaceTwitter=; replaceYouTube=; hlsPlayback=on; proxyVideos=',
+        }
+    )
+
+    url = nitter_url + '/' + twit_account
+    # Use a different page if we need to handle replies
+    if tweets_and_replies:
+        url += '/with_replies'
+
+    # Download the twitter page of the user
+    try:
+        twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
+    except requests.exceptions.ConnectionError:
+        logging.fatal('Host did not respond when trying to download ' + url)
+        exit(-1)
+    except requests.exceptions.Timeout:
+        logging.fatal(nitter_url + ' took too long to respond')
+        exit(-1)
+
+    # Verify that the download worked
+    if twit_account_page.status_code != 200:
+        logging.fatal('The Nitter page did not download correctly from ' + url + ' ('
+                      + str(twit_account_page.status_code) + '). Aborting')
+        exit(-1)
+
+    logging.info('Nitter page downloaded successfully from ' + url)
+
+    # DEBUG: Save the page to a file
+    # of = open(twit_account + '.html', 'w')
+    # of.write(twit_account_page.text)
+    # of.close()
+
+    # Make soup
+    soup = BeautifulSoup(twit_account_page.text, 'html.parser')
+
+    # Replace twit_account with the version with correct capitalization
+    ta = soup.find('meta', property='og:title').get('content')
+    ta_match = re.search(r'\(@(.+)\)', ta)
+    if ta_match is not None:
+        twit_account = ta_match.group(1)
+
+    # Extract the twitter timeline
+    timeline = soup.find_all('div', class_='timeline-item')
+
+    logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')
+
+    # **********************************************************
+    # Process each tweet and generate a dictionary
+    # with the data ready to be posted on Mastodon
+    # **********************************************************
+    out_date_cnt = 0
+    in_db_cnt = 0
+    for status in timeline:
+        # Extract the tweet ID and status ID
+        tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
+        status_id = tweet_id.split('/')[3]
+
+        logging.debug('processing tweet %s', tweet_id)
+
+        # Extract the time stamp
+        time_string = status.find('span', class_='tweet-date').a.get('title')
+        try:
+            timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp()
+        except ValueError:
+            # Alternative format, e.g. 'Dec 21, 2021 · 12:00 PM UTC'
+            timestamp = datetime.datetime.strptime(time_string, '%b %d, %Y · %I:%M %p %Z').timestamp()
+
+        # Check if the time is within the acceptable range
+        if not is_time_valid(timestamp, max_age, min_delay):
+            out_date_cnt += 1
+            logging.debug("Tweet outside valid time range, skipping")
+            continue
+
+        # Check if retweets must be skipped
+        if suppress_retweets:
+            # Check if this tweet is a retweet
+            if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
+                logging.debug("Retweet ignored per command-line configuration")
+                continue
+
+        # Check in the database if the tweet has already been posted
+        db.execute(
+            "SELECT * FROM toots WHERE twitter_account=? AND mastodon_instance=? AND mastodon_account=? AND tweet_id=?",
+            (twit_account, mast_instance, mast_account, tweet_id))
+        tweet_in_db = db.fetchone()
+
+        if tweet_in_db is not None:
+            in_db_cnt += 1
+            logging.debug("Tweet %s already in database", tweet_id)
+            # Skip to the next tweet
+            continue
+        else:
+            logging.debug('Tweet %s not found in database', tweet_id)
+
+        # Extract the author
+        author = status.find('a', class_='fullname').get('title')
+
+        # Extract the user name
+        author_account = status.find('a', class_='username').get('title').lstrip('@')
+
+        # Extract the URL of the full status page (for video download)
+        full_status_url = 'https://twitter.com' + tweet_id
+
+        # Initialize the containers
+        tweet_text = ''
+        photos = []
+
+        # Add a prefix if the tweet is a reply-to
+        # Only consider an item of class 'replying-to' that is a direct child
+        # of class 'tweet-body' in status. Others can be in a quoted tweet.
+        replying_to_class = status.select("div.tweet-body > div.replying-to")
+        if len(replying_to_class) != 0:
+            tweet_text += 'Replying to ' + replying_to_class[0].a.get_text() + '\n\n'
+
+        # Check if the tweet is a retweet from somebody else
+        if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
+            tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n'
+
+        # Extract an iterator over the tweet text contents
+        tt_iter = status.find('div', class_='tweet-content media-body').children
+
+        # Process the text of the tweet
+        tweet_text += process_media_body(tt_iter)
+
+        # Process a quote: append a link to tweet_text
+        quote_div = status.find('a', class_='quote-link')
+        if quote_div is not None:
+            tweet_text += '\n\nhttps://twitter.com' + quote_div.get('href').strip('#m')
+
+        # Process a card: extract the image if necessary
+        card_class = status.find('a', class_='card-container')
+        if card_class is not None:
+            photos.extend(process_card(nitter_url, card_class))
+
+        # Process attachments: capture the image or .mp4 url, or download the twitter video
+        attachments_class = status.find('div', class_='attachments')
+        if attachments_class is not None:
+            pics, vid_in_tweet = process_attachments(nitter_url, attachments_class, get_vids, twit_account,
+                                                     status_id, author_account)
+            photos.extend(pics)
+            if vid_in_tweet:
+                tweet_text += '\n\n[Video embedded in original tweet]'
+
+        # Add a footer with a link to the original tweet
+        tweet_text += '\n\nOriginal tweet : ' + full_status_url
+
+        # If no media was specifically added in the tweet, try to get the first picture
+        # with a "twitter:image" meta tag in the first page linked in the tweet text
+        if not photos:
+            m = re.search(r"http[^ \n\xa0]*", tweet_text)
+            if m is not None:
+                link_url = m.group(0)
+                if link_url.endswith(".html"):  # Only process a web page
+                    try:
+                        r = requests.get(link_url, timeout=HTTPS_REQ_TIMEOUT)
+                        if r.status_code == 200:
+                            # Match the first instance of either the twitter:image or twitter:image:src meta tag
+                            match = re.search(r'<meta name="twitter:image(?::src)?" content="(.+?)"', r.text)
+                            if match is not None:
+                                url = match.group(1).replace('&amp;', '&')  # Remove HTML-safe encoding from the URL if any
+                                photos.append(url)
+                    # Give up if anything goes wrong
+                    except (requests.exceptions.ConnectionError,
+                            requests.exceptions.Timeout,
+                            requests.exceptions.ContentDecodingError,
+                            requests.exceptions.TooManyRedirects,
+                            requests.exceptions.MissingSchema):
+                        pass
+                    else:
+                        logging.debug("downloaded twitter:image from linked page")
+
+        # Check if a video was downloaded
+        video_file = None
+
+        video_path = Path('./output') / twit_account / status_id
+        if video_path.exists():
+            # List the video files
+            video_file_list = list(video_path.glob('*.mp4'))
+            if len(video_file_list) != 0:
+                # Extract the posix path of the first video file in the list
+                video_file = video_file_list[0].absolute().as_posix()
+
+        # Add a dictionary with the content of the tweet to the list
+        tweet = {
+            "author": author,
+            "author_account": author_account,
+            "timestamp": timestamp,
+            "tweet_id": tweet_id,
+            "tweet_text": tweet_text,
+            "video": video_file,
+            "photos": photos,
+        }
+        tweets.append(tweet)
+
+        logging.debug('Tweet %s added to the list of toots to upload', tweet_id)
+
+    # Log summary stats
+    logging.info(str(out_date_cnt) + ' tweets outside of the valid time range')
+    logging.info(str(in_db_cnt) + ' tweets already in database')
+
+    # DEBUG: Print the extracted tweets
+    # for t in tweets:
+    #     print(t)
+
+    # Login to the account on the mastodon instance
+    mastodon = None
+    if len(tweets) != 0:
+        mastodon = login(mast_instance, mast_account, mast_password)
+
+    # **********************************************************
+    # Iterate over the tweets in the list.
+    # Post each one on Mastodon and record it in the database
+    # **********************************************************
+    posted_cnt = 0
+    for tweet in reversed(tweets):
+        # Check if we have reached the cap on the number of toots to post
+        if cap != 0 and posted_cnt >= cap:
+            logging.info('%d toots not posted due to configured cap', len(tweets) - cap)
+            break
+
+        logging.debug('Uploading Tweet %s', tweet["tweet_id"])
+
+        media_ids = []
+
+        # Upload the video if there is one
+        if tweet['video'] is not None:
+            try:
+                logging.debug("Uploading video to Mastodon")
+                media_posted = mastodon.media_post(tweet['video'])
+                media_ids.append(media_posted['id'])
+            except (MastodonAPIError, MastodonIllegalArgumentError, TypeError):
+                # Media cannot be uploaded (invalid format, dead link, etc.)
+                logging.debug("Uploading video failed")
+
+        else:  # Only upload pics if no video was uploaded
+            # Upload the photos
+            for photo in tweet['photos']:
+                media = False
+                # Download the picture
+                try:
+                    logging.debug('downloading picture')
+                    media = requests.get(photo, timeout=HTTPS_REQ_TIMEOUT)
+                except requests.exceptions.RequestException:
+                    # The picture cannot be downloaded for any reason
+                    pass
+
+                # Upload the picture to the Mastodon instance
+                if media:
+                    try:
+                        logging.debug('uploading picture to Mastodon')
+                        media_posted = mastodon.media_post(media.content, mime_type=media.headers['content-type'])
+                        media_ids.append(media_posted['id'])
+                    except (MastodonAPIError, MastodonIllegalArgumentError, TypeError):
+                        # Media cannot be uploaded (invalid format, dead link, etc.)
+                        pass
+
+        # Post the toot
+        toot = {}
+        try:
+            mastodon = Mastodon(
+                access_token=mast_account + '.secret',
+                api_base_url='https://' + mast_instance
+            )
+
+            if len(media_ids) == 0:
+                toot = mastodon.status_post(tweet['tweet_text'], visibility='public')
+            else:
+                toot = mastodon.status_post(tweet['tweet_text'], media_ids=media_ids, visibility='public')
+
+        except MastodonError as me:
+            logging.error('posting ' + tweet['tweet_text'] + ' to ' + mast_instance + ' failed')
+            logging.error(me)
+
+        else:
+            posted_cnt += 1
+            logging.debug('Tweet %s posted on %s', tweet['tweet_id'], mast_account)
+
+        # Insert the toot id into the database
+        if 'id' in toot:
+            db.execute("INSERT INTO toots VALUES ( ? , ? , ? , ? , ? )",
+                       (twit_account, mast_instance, mast_account, tweet['tweet_id'], toot['id']))
+            sql.commit()
+
+    logging.info(str(posted_cnt) + ' tweets posted to Mastodon')
+
+    # Cleanup downloaded video files
+    try:
+        shutil.rmtree('./output/' + twit_account)
+    except FileNotFoundError:
+        # The directory does not exist
+        pass
+
+    # Evaluate the number of excess records in the database
+    excess_count = 0
+
+    db.execute('SELECT count(*) FROM toots WHERE twitter_account=?', (twit_account,))
+    db_count = db.fetchone()
+    if db_count is not None:
+        excess_count = db_count[0] - MAX_REC_COUNT
+
+    # Delete the excess records
+    if excess_count > 0:
+        db.execute('''
+            WITH excess AS (
+                SELECT tweet_id
+                FROM toots
+                WHERE twitter_account=?
+                ORDER BY toot_id ASC
+                LIMIT ?
+            )
+            DELETE FROM toots
+            WHERE tweet_id IN excess''', (twit_account, excess_count))
+        sql.commit()
+
+        logging.info('Deleted ' + str(excess_count) + ' old records from the database.')
+
+    logging.info('Run time : %2.1f seconds' % (time.time() - start_time))
+    logging.info('_____________________________________________________________________________________')
+
+
+if __name__ == "__main__":
+    main(sys.argv)
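+
+# ---------------------------------------------------------------------------
+# A standalone sketch (hypothetical data) of the record-cleanup query used in
+# main() above, runnable with python3 against an in-memory database. SQLite
+# accepts a bare CTE name ('IN excess') on the right-hand side of IN:
+#
+#     import sqlite3
+#     sql = sqlite3.connect(':memory:')
+#     db = sql.cursor()
+#     db.execute('CREATE TABLE toots (twitter_account TEXT, mastodon_instance TEXT, '
+#                'mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)')
+#     for i in range(60):  # 10 rows more than MAX_REC_COUNT
+#         db.execute('INSERT INTO toots VALUES (?, ?, ?, ?, ?)',
+#                    ('SuperDuperBot', 'botsin.space', 'superduperbot',
+#                     '/SuperDuperBot/status/%d' % i, '%06d' % i))
+#     db.execute('''WITH excess AS (
+#                       SELECT tweet_id FROM toots WHERE twitter_account=?
+#                       ORDER BY toot_id ASC LIMIT ?)
+#                   DELETE FROM toots WHERE tweet_id IN excess''',
+#                ('SuperDuperBot', 10))
+#     print(db.execute('SELECT count(*) FROM toots').fetchone())  # -> (50,)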