#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Copyright (C) 2019-2023 Jean-Christophe Francois

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""

import argparse
from datetime import datetime, timedelta
import logging
import os
import shutil
import random
import re
import sqlite3
import sys
import time
from pathlib import Path
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, urljoin

import requests
from bs4 import BeautifulSoup, element
from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError
import pytz

# Number of records to keep in db table for each twitter account
MAX_REC_COUNT = 50

# How many seconds to wait before giving up on a download (except video download)
HTTPS_REQ_TIMEOUT = 10

# Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 OPR/104.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Vivaldi/6.4.3160.34',
]
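

# Typical invocations (illustrative examples only; file and account names below
# are placeholders, see the command-line options defined in main()):
#   ./twoot.py -f myaccount.toml
#   ./twoot.py -t SomeTwitterAccount -i mastodon.example -m my_mastodon_user -p 'my_password' -v -c 10
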
def main(argv):
# Start stopwatch
global START_TIME
START_TIME = time.time()
# Build parser for command line arguments
parser = argparse.ArgumentParser(description='toot tweets.')
parser.add_argument('-f', metavar='<.toml config file>', action='store')
parser.add_argument('-t', metavar='<twitter account>', action='store')
parser.add_argument('-i', metavar='<mastodon instance>', action='store')
parser.add_argument('-m', metavar='<mastodon account>', action='store')
parser.add_argument('-p', metavar='<mastodon password>', action='store')
parser.add_argument('-r', action='store_true', help='Also post replies to other tweets')
parser.add_argument('-s', action='store_true', help='Suppress retweets')
parser.add_argument('-l', action='store_true', help='Remove link redirection')
parser.add_argument('-u', action='store_true', help='Remove trackers from URLs')
parser.add_argument('-v', action='store_true', help='Ingest twitter videos and upload to Mastodon instance')
parser.add_argument('-o', action='store_true', help='Do not add reference to Original tweet')
    parser.add_argument('-q', action='store_true', help='Update profile if changed')
parser.add_argument('-a', metavar='<max age (in days)>', action='store', type=float)
parser.add_argument('-d', metavar='<min delay (in mins)>', action='store', type=float)
parser.add_argument('-c', metavar='<max # of toots to post>', action='store', type=int)
# Parse command line
args = vars(parser.parse_args())
build_config(args)
mast_password = args['p']
# Setup logging to file
logging.basicConfig(
filename=TOML['config']['twitter_account'].lower() + '.log',
format='%(asctime)s %(levelname)-8s %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
)
# Set default level of logging
log_level = logging.WARNING
# log level as an uppercase string from config
ll_str = TOML['options']['log_level'].upper()
if ll_str == "DEBUG":
log_level = logging.DEBUG
elif ll_str == "INFO":
log_level = logging.INFO
elif ll_str == "WARNING":
log_level = logging.WARNING
elif ll_str == "ERROR":
log_level = logging.ERROR
elif ll_str == "CRITICAL":
        log_level = logging.CRITICAL
elif ll_str == "OFF":
# Disable all logging
logging.disable(logging.CRITICAL)
else:
logging.error('Invalid log_level %s in config file. Using WARNING.', str(TOML['options']['log_level']))
# Set desired level of logging
logger = logging.getLogger()
logger.setLevel(log_level)
logging.info('Running with the following configuration:')
logging.info(' Config File : ' + str(args['f']))
logging.info(' twitter_account : ' + TOML['config']['twitter_account'])
logging.info(' mastodon_instance : ' + TOML['config']['mastodon_instance'])
logging.info(' mastodon_user : ' + TOML['config']['mastodon_user'])
logging.info(' upload_videos : ' + str(TOML['options']['upload_videos']))
logging.info(' post_reply_to : ' + str(TOML['options']['post_reply_to']))
logging.info(' skip_retweets : ' + str(TOML['options']['skip_retweets']))
logging.info(' remove_link_redirections : ' + str(TOML['options']['remove_link_redirections']))
logging.info(' remove_trackers_from_urls : ' + str(TOML['options']['remove_trackers_from_urls']))
logging.info(' footer : ' + TOML['options']['footer'])
logging.info(' tweet_time_format : ' + TOML['options']['tweet_time_format'])
logging.info(' tweet_timezone : ' + TOML['options']['tweet_timezone'])
logging.info(' remove_original_tweet_ref : ' + str(TOML['options']['remove_original_tweet_ref']))
logging.info(' update_profile : ' + str(TOML['options']['update_profile']))
logging.info(' tweet_max_age : ' + str(TOML['options']['tweet_max_age']))
logging.info(' tweet_delay : ' + str(TOML['options']['tweet_delay']))
logging.info(' upload_pause : ' + str(TOML['options']['upload_pause']))
logging.info(' toot_cap : ' + str(TOML['options']['toot_cap']))
logging.info(' subst_twitter : ' + str(TOML['options']['subst_twitter']))
logging.info(' subst_youtube : ' + str(TOML['options']['subst_youtube']))
logging.info(' subst_reddit : ' + str(TOML['options']['subst_reddit']))
logging.info(' log_level : ' + TOML['options']['log_level'])
logging.info(' log_days : ' + str(TOML['options']['log_days']))
# Try to open database. If it does not exist, create it
sql = sqlite3.connect('twoot.db')
db = sql.cursor()
db.execute('''CREATE TABLE IF NOT EXISTS toots (twitter_account TEXT, mastodon_instance TEXT,
mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
db.execute('''CREATE INDEX IF NOT EXISTS main_index ON toots (twitter_account,
mastodon_instance, mastodon_account, tweet_id)''')
db.execute('''CREATE INDEX IF NOT EXISTS tweet_id_index ON toots (tweet_id)''')
db.execute('''CREATE TABLE IF NOT EXISTS profiles (mastodon_instance TEXT, mastodon_account TEXT, avatar_url TEXT, banner_url TEXT)''')
db.execute('''CREATE INDEX IF NOT EXISTS profile_index ON profiles (mastodon_instance, mastodon_account)''')
    # Select random nitter instance to fetch updates from
    nitter_url = 'https://' + random.choice(TOML['options']['nitter_instances'])

    # Initiate session and set its default headers with a randomly selected user agent
    session = requests.Session()
    session.headers.update(
        {
            'User-Agent': random.choice(USER_AGENTS),
            'Cookie': 'replaceTwitter=; replaceYouTube=; hlsPlayback=on; proxyVideos=',
        }
    )

    # Load twitter page of user
    soup, timeline = get_timeline(session, nitter_url)

    logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')

# **********************************************************
    # Process each tweet and generate an array of dictionaries
# with data ready to be posted on Mastodon
# **********************************************************
tweets = []
out_date_cnt = 0
in_db_cnt = 0
for replied_to_tweet, status in timeline:
# Extract tweet ID and status ID
tweet_link_tag = status.find('a', class_='tweet-link')
if tweet_link_tag is None:
logging.debug("Malformed timeline item (no tweet link), skipping")
continue
tweet_id = tweet_link_tag.get('href').strip('#m')
status_id = tweet_id.split('/')[3]
logging.debug('processing tweet %s', tweet_id)
# Extract time stamp
time_string = status.find('span', class_='tweet-date').a.get('title')
        try:
            timestamp = datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S')
        except ValueError:
            # Alternate Nitter date format, e.g. "Dec 21, 2021 · 12:00 PM UTC"
            timestamp = datetime.strptime(time_string, '%b %d, %Y · %I:%M %p %Z')
# Check if time is within acceptable range
if not is_time_valid(timestamp.timestamp()):
out_date_cnt += 1
logging.debug("Tweet outside valid time range, skipping")
continue
# Check if retweets must be skipped
if TOML['options']['skip_retweets']:
# Check if this tweet is a retweet
if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
logging.debug("Retweet ignored per command-line configuration")
continue
# Check in database if tweet has already been posted
db.execute(
"SELECT * FROM toots WHERE twitter_account=? AND mastodon_instance=? AND mastodon_account=? AND tweet_id=?",
(TOML['config']['twitter_account'], TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], tweet_id))
tweet_in_db = db.fetchone()
if tweet_in_db is not None:
in_db_cnt += 1
logging.debug("Tweet %s already in database", tweet_id)
# Skip to next tweet
continue
else:
logging.debug('Tweet %s not found in database', tweet_id)
# extract author
author = status.find('a', class_='fullname').get('title')
# Extract user name
author_account = status.find('a', class_='username').get('title').lstrip('@')
# Extract URL of full status page (for video download)
full_status_url = 'https://twitter.com' + tweet_id
# Initialize containers
tweet_text = ''
photos = []
# Add prefix if the tweet is a reply-to
# Only consider item of class 'replying-to' that is a direct child
# of class 'tweet-body' in status. Others can be in a quoted tweet.
replying_to_class = status.select("div.tweet-body > div.replying-to")
if len(replying_to_class) != 0:
tweet_text += 'Replying to ' + replying_to_class[0].a.get_text() + '\n\n'
        # Check if the tweet is a retweet from somebody else
if len(status.select("div.tweet-body > div > div.retweet-header")) != 0:
tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n'
# extract iterator over tweet text contents
tt_iter = status.find('div', class_='tweet-content media-body').children
# Process text of tweet
tweet_text += process_media_body(tt_iter)
# Process quote: append link to tweet_text
quote_div = status.find('a', class_='quote-link')
if quote_div is not None:
tweet_text += '\n\n' + substitute_source('https://twitter.com' + quote_div.get('href').strip('#m'))
# Process card: extract image if necessary
card_class = status.find('a', class_='card-container')
if card_class is not None:
photos.extend(process_card(nitter_url, card_class))
# Process attachment: capture image or .mp4 url or download twitter video
attachments_class = status.find('div', class_='attachments')
if attachments_class is not None:
pics, vid_in_tweet = process_attachments(nitter_url,
attachments_class,
status_id, author_account)
photos.extend(pics)
if vid_in_tweet:
tweet_text += '\n\n[Video is unavailable]'
# Add custom footer from config file
if TOML['options']['footer'] != '':
tweet_text += '\n\n' + TOML['options']['footer']
# Add footer with link to original tweet
if TOML['options']['remove_original_tweet_ref'] is False:
if TOML['options']['footer'] != '':
tweet_text += '\nOriginal tweet: ' + substitute_source(full_status_url)
else:
tweet_text += '\n\nOriginal tweet: ' + substitute_source(full_status_url)
# Add timestamp to the "Original Tweet" line
if TOML['options']['tweet_time_format'] != "":
timestamp_display = timestamp
# Adjust timezone
if TOML['options']['tweet_timezone'] != "":
timezone_display = pytz.timezone(TOML['options']['tweet_timezone'])
else: # Use local timezone by default
timezone_display = datetime.now().astimezone().tzinfo
logging.debug("Timestamp UTC: " + str(timestamp))
logging.debug("Timezone to use: " + str(timezone_display))
timestamp_display = pytz.utc.localize(timestamp).astimezone(timezone_display)
logging.debug("Timestamp converted " + str(timestamp_display))
tweet_text += ' ' + datetime.strftime(timestamp_display, TOML['options']['tweet_time_format'])
# If no media was specifically added in the tweet, try to get the first picture
# with "twitter:image" meta tag in first linked page in tweet text
if not photos:
m = re.search(r"http[^ \n\xa0]*", tweet_text)
if m is not None:
link_url = m.group(0)
if link_url.endswith(".html"): # Only process a web page
try:
r = requests.get(link_url, timeout=HTTPS_REQ_TIMEOUT)
if r.status_code == 200:
# Matches the first instance of either twitter:image or twitter:image:src meta tag
match = re.search(r'<meta name="twitter:image(?:|:src)" content="(.+?)".*?>', r.text)
if match is not None:
url = match.group(1).replace('&amp;', '&') # Remove HTML-safe encoding from URL if any
photos.append(url)
# Give up if anything goes wrong
except (requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
requests.exceptions.ContentDecodingError,
requests.exceptions.TooManyRedirects,
requests.exceptions.MissingSchema):
pass
else:
logging.debug("downloaded twitter:image from linked page")
# Check if video was downloaded
video_file = None
video_path = Path('./output') / TOML['config']['twitter_account'] / status_id
if video_path.exists():
# list video files
video_file_list = list(video_path.glob('*.mp4'))
if len(video_file_list) != 0:
# Extract posix path of first video file in list
video_file = video_file_list[0].absolute().as_posix()
# Add dictionary with content of tweet to list
tweet = {
"author": author,
"author_account": author_account,
"timestamp": timestamp.timestamp(),
"tweet_id": tweet_id,
"tweet_text": tweet_text,
"video": video_file,
"photos": photos,
"replied_to_tweet": replied_to_tweet,
}
tweets.append(tweet)
logging.debug('Tweet %s added to list of toots to upload', tweet_id)
# Log summary stats
logging.info(str(out_date_cnt) + ' tweets outside of valid time range')
logging.info(str(in_db_cnt) + ' tweets already in database')
# Initialise Mastodon object
mastodon = None
    # Update profile if it has changed
    mastodon = update_profile(session, nitter_url, soup, sql, mast_password)

    # Login to account on mastodon instance
if len(tweets) != 0 and mastodon is None:
mastodon = login(mast_password)
# Check toot character limit on mastodon instance
if mastodon is not None:
try:
max_characters = mastodon.instance().configuration.statuses['max_characters']
logging.debug('Instance character limit is ' + str(max_characters))
except Exception:
# Default value for Mastodon
max_characters = 500
logging.debug('Tried to get toot character limit from Mastodon instance but failed. Assuming 500')
# **********************************************************
# Iterate tweets in list.
# post each on Mastodon and record it in database
# **********************************************************
posted_cnt = 0
for tweet in reversed(tweets):
# Check if we have reached the cap on the number of toots to post
if TOML['options']['toot_cap'] != 0 and posted_cnt >= TOML['options']['toot_cap']:
logging.info('%d toots not posted due to configured cap', len(tweets) - TOML['options']['toot_cap'])
break
logging.debug('Uploading Tweet %s', tweet['tweet_id'])
media_ids = []
# Upload video if there is one
if tweet['video'] is not None:
try:
logging.debug("Uploading video to Mastodon")
media_posted = mastodon.media_post(tweet['video'])
media_ids.append(media_posted['id'])
except (MastodonAPIError, MastodonIllegalArgumentError,
TypeError): # Media cannot be uploaded (invalid format, dead link, etc.)
logging.debug("Uploading video failed")
pass
else: # Only upload pic if no video was uploaded
# Upload photos
for photo in tweet['photos']:
media = False
# Download picture
try:
logging.debug('downloading picture')
media = requests.get(photo, timeout=HTTPS_REQ_TIMEOUT)
                except Exception:  # Picture cannot be downloaded for any reason
pass
# Upload picture to Mastodon instance
if media:
try:
logging.debug('uploading picture to Mastodon')
media_posted = mastodon.media_post(media.content, mime_type=media.headers['content-type'])
media_ids.append(media_posted['id'])
except (MastodonAPIError, MastodonIllegalArgumentError,
TypeError): # Media cannot be uploaded (invalid format, dead link, etc.)
pass
# Find in database toot id of replied_to_tweet
replied_to_toot = None
if tweet['replied_to_tweet'] is not None:
logging.debug("Searching db for toot corresponding to replied-to-tweet " + tweet['replied_to_tweet'])
db.execute("SELECT toot_id FROM toots WHERE tweet_id=?", [tweet['replied_to_tweet']])
replied_to_toot = db.fetchone()
if replied_to_toot is None:
logging.warning('Replied-to tweet %s not found in database', tweet['replied_to_tweet'])
else:
logging.debug("toot %s found", replied_to_toot)
# Post toot
toot = {}
try:
if len(media_ids) == 0:
toot = mastodon.status_post(tweet['tweet_text'], replied_to_toot)
else:
toot = mastodon.status_post(tweet['tweet_text'], replied_to_toot, media_ids=media_ids)
except MastodonAPIError as e:
_, status_code, _, exception_message = e.args
if status_code == 500:
logging.error('Mastodon internal server error')
logging.error('posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' Failed')
continue
elif exception_message.find('Text character limit') != -1:
# ERROR (('Mastodon API returned error', 422, 'Unprocessable Entity', 'Validation failed: Text character limit of 500 exceeded'))
logging.error('Toot text too long: %s characters', str(len(tweet['tweet_text'])))
logging.error('posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' Failed')
continue
elif exception_message.find('Try again in a moment') != -1:
# ERROR ('Mastodon API returned error', 422, 'Unprocessable Entity', 'Cannot attach files that have not finished processing. Try again in a moment!')
logging.warning('Mastodon API Error 422: Cannot attach files that have not finished processing. Waiting 30 seconds and retrying.')
# Wait 30 seconds
time.sleep(30)
                # Retry posting, preserving the reply-to relationship
                try:
                    toot = mastodon.status_post(tweet['tweet_text'], replied_to_toot, media_ids=media_ids)
except MastodonError as me:
logging.error('posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' Failed')
logging.error(me)
else:
logging.warning("Retry successful")
except MastodonError as me:
logging.error('posting ' + tweet['tweet_id'] + ' to ' + TOML['config']['mastodon_instance'] + ' Failed')
logging.error(me)
else:
posted_cnt += 1
logging.debug('Tweet %s posted on %s', tweet['tweet_id'], TOML['config']['mastodon_user'])
# Test to find out if slowing down successive posting helps with ordering of threads
time.sleep(TOML['options']['upload_pause'])
# Insert toot id into database
if 'id' in toot:
db.execute("INSERT INTO toots VALUES ( ? , ? , ? , ? , ? )",
(TOML['config']['twitter_account'], TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], tweet['tweet_id'], toot['id']))
sql.commit()
logging.info(str(posted_cnt) + ' tweets posted to Mastodon')
# Cleanup downloaded video files
try:
shutil.rmtree('./output/' + TOML['config']['twitter_account'])
except FileNotFoundError: # The directory does not exist
pass
# Evaluate excess records in database
excess_count = 0
db.execute('SELECT count(*) FROM toots WHERE twitter_account=?', (TOML['config']['twitter_account'],))
db_count = db.fetchone()
if db_count is not None:
excess_count = db_count[0] - MAX_REC_COUNT
# Delete excess records
if excess_count > 0:
db.execute('''
WITH excess AS (
SELECT tweet_id
FROM toots
WHERE twitter_account=?
ORDER BY toot_id ASC
LIMIT ?
)
DELETE from toots
WHERE tweet_id IN excess''', (TOML['config']['twitter_account'], excess_count))
sql.commit()
logging.info('Deleted ' + str(excess_count) + ' old records from database.')
shutdown(0)


def build_config(args):
"""
Receives the arguments passed on the command line
populates the TOML global dict with default values for all 'options' keys
if a config file is provided, load the keys from the config file
if no config file is provided, use command-line args
verify that a valid config is available (all keys in 'config' present)
:param args: list of command line arguments
"""
# Create global struct containing configuration
global TOML
# Default options
    options = {
        'nitter_instances': [
            'n.opnxng.com',  # added 10/11/2023
            'nitter.mint.lgbt',  # added 09/02/2024
            'nitter.1d4.us',  # added 09/02/2024
            # 'nitter.ktachibana.party',  # added 01/11/2023
            # 'nitter.x86-64-unknown-linux-gnu.zip',  # down 09/02/2024
            # 'nitter.tinfoil-hat.net',  # down 09/02/2024
            # 'nitter.eu.projectsegfau.lt',  # down 14/11/2023
            # 'nitter.privacydev.net',  # down 09/11/2023
            # 'nitter.salastil.com',  # added 25/08/2023
            # 'nitter.poast.org',  # added 25/08/2023
            # 'nitter.d420.de',  # added 25/08/2023
            # 'nitter.woodland.cafe',  # removed 02/11/2023
        ],
        'upload_videos': False,
        'post_reply_to': False,
        'skip_retweets': False,
        'remove_link_redirections': False,
        'remove_trackers_from_urls': False,
        'footer': "",
        'tweet_time_format': "",
        'tweet_timezone': "",
        'remove_original_tweet_ref': False,
        'tweet_max_age': float(1),
        'tweet_delay': float(0),
        'upload_pause': float(0),
        'toot_cap': int(0),
        'subst_twitter': [],
        'subst_youtube': [],
        'subst_reddit': [],
        'update_profile': False,
        'log_level': "WARNING",
        'log_days': 3,
    }
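
    # Illustrative sketch of a matching config file (keys mirror the defaults
    # above; names and values are placeholders, not a reference):
    #
    #   [config]
    #   twitter_account = "SomeTwitterAccount"
    #   mastodon_instance = "mastodon.example"
    #   mastodon_user = "my_mastodon_user"
    #
    #   [options]
    #   skip_retweets = true
    #   toot_cap = 10
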
    # Create default config object
    TOML = {'config': {}, 'options': options}

    # Load config file if it was provided
    toml_file = args['f']
    if toml_file is not None:
        try:  # tomllib is included in python from version 3.11
            import tomllib
        except ModuleNotFoundError:
            # for python < 3.11, tomli module must be installed
            import tomli as tomllib

        loaded_toml = None
        # Load toml file
        try:
            with open(toml_file, 'rb') as config_file:
                loaded_toml = tomllib.load(config_file)
        except FileNotFoundError:
            print('config file not found')
            shutdown(-1)
        except tomllib.TOMLDecodeError:
            print('Malformed config file')
            shutdown(-1)

        TOML['config'] = loaded_toml['config']
        for k in TOML['options'].keys():
            try:  # Go through all valid keys
                TOML['options'][k] = loaded_toml['options'][k]
            except KeyError:  # Key was not found in file
                pass
else:
# Override config parameters with command-line values provided
if args['t'] is not None:
TOML['config']['twitter_account'] = args['t']
if args['i'] is not None:
TOML['config']['mastodon_instance'] = args['i']
if args['m'] is not None:
TOML['config']['mastodon_user'] = args['m']
if args['v'] is True:
TOML['options']['upload_videos'] = args['v']
if args['r'] is True:
TOML['options']['post_reply_to'] = args['r']
if args['s'] is True:
TOML['options']['skip_retweets'] = args['s']
if args['l'] is True:
TOML['options']['remove_link_redirections'] = args['l']
if args['u'] is True:
TOML['options']['remove_trackers_from_urls'] = args['u']
if args['o'] is True:
TOML['options']['remove_original_tweet_ref'] = args['o']
if args['a'] is not None:
TOML['options']['tweet_max_age'] = float(args['a'])
if args['d'] is not None:
TOML['options']['tweet_delay'] = float(args['d'])
if args['c'] is not None:
TOML['options']['toot_cap'] = int(args['c'])
        if args['q'] is True:
            TOML['options']['update_profile'] = args['q']

    # Verify that we have a minimum config to run
    if 'twitter_account' not in TOML['config'].keys() or TOML['config']['twitter_account'] == "":
        print('CRITICAL: Missing Twitter account')
        exit(-1)
    if 'mastodon_instance' not in TOML['config'].keys() or TOML['config']['mastodon_instance'] == "":
        print('CRITICAL: Missing Mastodon instance')
        exit(-1)
    if 'mastodon_user' not in TOML['config'].keys() or TOML['config']['mastodon_user'] == "":
        print('CRITICAL: Missing Mastodon user')
        exit(-1)


def get_timeline(session, nitter_url):
    """
    Download timeline of twitter account
    :param session: configured requests session including user agent
    :param nitter_url: url of the nitter instance to use
    :return: soup of the downloaded page and list of tuples with url of tweet replied-to (or None) and content of tweet
    """
    # Define url to use
    url = nitter_url + '/' + TOML['config']['twitter_account']

    # Use different page if we need to handle replies
    if TOML['options']['post_reply_to']:
        url += '/with_replies'

    # Download twitter page of user
    try:
        twit_account_page = session.get(url, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(url + ' took too long to respond')
        shutdown(-1)

    # Verify that download worked
    if twit_account_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(
            twit_account_page.status_code) + '). Aborting')
        shutdown(-1)

    logging.debug('Nitter page downloaded successfully from ' + url)

    # DEBUG: Save page to file
    # of = open('user_page_debug.html', 'w')
    # of.write(twit_account_page.text)
    # of.close()

    # Make soup
    soup = BeautifulSoup(twit_account_page.text, 'html.parser')

    # Get the div containing tweets
    tl = soup.find('div', class_='timeline')

    # Get the list of direct children of timeline
    try:
        timeline_items = tl.find_all('div', recursive=False)
    except AttributeError:
        logging.fatal('The Nitter page ' + url + ' does not include the expected content')
        logging.fatal(twit_account_page.text)
        shutdown(-1)

    timeline = []
    for item in timeline_items:
        classes = item['class']
        if 'timeline-item' in classes:  # Individual tweet
            timeline.append((None, item))
        elif 'thread-line' in classes:  # First tweet of a thread
            # Get the first item of thread
            first_item = item.find('div', class_='timeline-item')
            # Get the url of the tweet
            thread_link_tag = item.find('a', class_='tweet-link')
            if thread_link_tag is not None:
                thread_url = thread_link_tag.get('href').strip('#m')
                # Get the rest of the items of the thread
                timeline.extend(_get_rest_of_thread(session, nitter_url, thread_url, first_item))
        else:
            # Ignore other classes
            continue

    return soup, timeline


def _get_rest_of_thread(session, nitter_url, thread_url, first_item):
    """
    Download page with full thread of tweets and extract all tweets, recording for
    each one the url of the tweet it replies to. Only used by `get_timeline()`.
    :param session: Existing HTTP session with Nitter instance
    :param nitter_url: url of the nitter instance to use
    :param thread_url: url of the first tweet in thread
    :param first_item: soup of the first tweet of the thread
    :return: list of tuples with url of tweet replied-to (or None) and content of tweet
    """
    # Add first item to timeline
    timeline = [(None, first_item)]

    logging.debug("Downloading tweets in thread from separate page")
    # Download page with thread
    url = nitter_url + thread_url
    try:
        thread_page = session.get(url, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(url + ' took too long to respond')
        shutdown(-1)

    # Verify that download worked
    if thread_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(thread_page.status_code) + '). Aborting')
        shutdown(-1)

    logging.debug('Nitter page downloaded successfully from ' + url)

    # DEBUG: Save page to file
    # of = open('thread_page_debug.html', 'w')
    # of.write(thread_page.text)
    # of.close()

    # Make soup
    soup = BeautifulSoup(thread_page.text, 'html.parser')

    # Get all items in thread after main tweet
    items = []
    after_tweet = soup.find('div', 'after-tweet')
    if after_tweet is not None:
        items = after_tweet.find_all('div', class_='timeline-item')

    # Build timeline of tuples
    previous_tweet_url = thread_url
    for item in items:
        timeline.append((previous_tweet_url, item))
        # Get the url of the tweet
        tweet_link_tag = item.find('a', class_='tweet-link')
        if tweet_link_tag is not None:
            previous_tweet_url = tweet_link_tag.get('href').strip('#m')
        else:
            previous_tweet_url = None
            logging.error('Thread tweet is missing link tag')

    # Return timeline in reverse chronological order
    timeline.reverse()
    return timeline


def is_time_valid(timestamp):
ret = True
# Check that the tweet is not too young (might be deleted) or too old
age_in_hours = (time.time() - float(timestamp)) / 3600.0
min_delay_in_hours = TOML['options']['tweet_delay'] / 60.0
max_age_in_hours = TOML['options']['tweet_max_age'] * 24.0
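
    # Example: with tweet_delay = 30 (minutes) and tweet_max_age = 2 (days),
    # only tweets between 0.5 hours and 48 hours old are considered valid.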
if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours:
ret = False
return ret


def process_media_body(tt_iter):
    """
    Receives an iterator over all the elements contained in the tweet-text container.
    Processes them to make them suitable for posting on Mastodon
    :param tt_iter: iterator over the HTML elements in the text of the tweet
    :return: cleaned up text of the tweet
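
    Example (illustrative doctest):
    >>> soup = BeautifulSoup('<div>Hello <a href="/user">@user</a></div>', 'html.parser')
    >>> process_media_body(soup.div.children)
    'Hello @user'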
"""
    tweet_text = ''
    # Iterate elements
    for tag in tt_iter:
        # If element is plain text, copy it verbatim
        if isinstance(tag, element.NavigableString):
            tweet_text += tag.string
        # If it is an 'a' html tag
        elif tag.name == 'a':
            tag_text = tag.get_text()
            if tag_text.startswith('@'):
                # Only keep user name
                tweet_text += tag_text
            elif tag_text.startswith('#'):
                # Only keep hashtag text
                tweet_text += tag_text
            else:
                # This is a real link
                url = deredir_url(tag.get('href'))
                url = substitute_source(url)
                url = clean_url(url)

                tweet_text += url
        else:
            logging.warning("No handler for tag in twitter text: %s", tag.prettify())

    return tweet_text


def process_card(nitter_url, card_container):
    """
    Extract image from card in case mastodon does not do it
    :param nitter_url: url of nitter mirror
    :param card_container: soup of 'a' tag containing card markup
    :return: list with url of image
    """
    images = []

    img = card_container.div.div.img
    if img is not None:
        image_url = nitter_url + img.get('src')
        images.append(image_url)
        logging.debug('Extracted image from card')

    return images


def process_attachments(nitter_url, attachments_container, status_id, author_account):
    """
    Extract images or video from attachments. Videos are downloaded on the file system.
    :param nitter_url: url of nitter mirror
    :param attachments_container: soup of 'div' tag containing attachments markup
    :param status_id: id of tweet being processed
    :param author_account: author of tweet with video attachment
    :return: list with url of images, and flag indicating a video that could not be ingested
    """
    # Collect url of images
    pics = []
    images = attachments_container.find_all('a', class_='still-image')
    for image in images:
        pics.append(nitter_url + image.get('href'))

    logging.debug('collected ' + str(len(pics)) + ' image(s) from attachments')

    # Download nitter video (converted animated GIF)
    gif_class = attachments_container.find('video', class_='gif')
    if gif_class is not None:
        gif_video_file = nitter_url + gif_class.source.get('src')

        video_path = os.path.join('output', TOML['config']['twitter_account'], status_id, author_account, status_id)
        os.makedirs(video_path, exist_ok=True)

        # Open directory for writing file
        orig_dir = os.getcwd()
        os.chdir(video_path)
        with requests.get(gif_video_file, stream=True, timeout=HTTPS_REQ_TIMEOUT) as r:
            try:
                # Raise exception if response code is not 200
                r.raise_for_status()
                # Download chunks and write them to file
                with open('gif_video.mp4', 'wb') as f:
                    for chunk in r.iter_content(chunk_size=16 * 1024):
                        f.write(chunk)

                logging.debug('Downloaded video of GIF animation from attachments')
            except Exception:  # Don't do anything if video can't be found or downloaded
                logging.debug('Could not download video of GIF animation from attachments')

        # Go back to initial directory
        os.chdir(orig_dir)

    # Download twitter video
    vid_in_tweet = False
    vid_container = attachments_container.find('div', class_='video-container')
    if vid_container is not None:
        if TOML['options']['upload_videos']:
            logging.debug("downloading video from twitter")
            import youtube_dl

            video_path_source = vid_container.source
            if video_path_source is not None:
                video_path = video_path_source['src']
                if video_path is not None:
                    video_file = urljoin(nitter_url, video_path)
                    ydl_opts = {
                        'outtmpl': "output/" + TOML['config']['twitter_account'] + "/" + status_id + "/%(id)s.%(ext)s",
                        # 'format': "best[width<=500]",
                        'socket_timeout': 60,
                        'quiet': True,
                    }

                    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                        try:
                            ydl.download([video_file])
                        except Exception as e:
                            logging.warning('Error downloading twitter video: ' + str(e))
                            vid_in_tweet = True
                        else:
                            logging.debug('downloaded twitter video from attachments')
            else:
                logging.debug("Media is unavailable")
                vid_in_tweet = True
        else:
            logging.debug("Media is unavailable")
            vid_in_tweet = True

    return pics, vid_in_tweet


def update_profile(session, nitter_url, soup, sql, mast_password):
    """
    Update profile on Mastodon
    Check if avatar or banner pictures have changed since last run
    If they have, download them and upload them on the Mastodon account profile
    :param session: configured requests session including user agent
    :param nitter_url: url of the Nitter instance that is being used
    :param soup: BeautifulSoup object containing the page
    :param sql: database connection
    :param mast_password: password of the Mastodon account (None if not provided)
    :return: mastodon object if we had to login to update, None otherwise
    """
    # Check if TOML option to update profile is set
    if TOML['options']['update_profile'] is False:
        return None
    else:
        logging.debug("Checking twitter profile for changes")

        db = sql.cursor()

        # Extract avatar picture address
        try:
            new_avatar_url = soup.find('div', class_='profile-card-info').findChild('a').findChild('img').get('src')
        except AttributeError:
            new_avatar_url = None

        # Extract banner picture address
        try:
            new_banner_url = soup.find('div', class_='profile-banner').findChild('a').findChild('img').get('src')
        except AttributeError:
            new_banner_url = None

        # Get the original urls of the avatar and banner pictures on the account profile
        db.execute("SELECT avatar_url, banner_url FROM profiles WHERE mastodon_instance=? AND mastodon_account=?", (TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'],))
        profile_in_db = db.fetchone()

        changed = False
        if profile_in_db is not None:
            cur_avatar_url = profile_in_db[0]
            cur_banner_url = profile_in_db[1]

            # Check if urls have changed
            if new_avatar_url != cur_avatar_url:
                changed = True
                logging.info('avatar image changed on twitter profile')
            if new_banner_url != cur_banner_url:
                changed = True
                logging.info('banner image changed on twitter profile')
        else:
            # Mastodon user not found in database. Add new record
            db.execute("INSERT INTO profiles (mastodon_instance, mastodon_account, avatar_url, banner_url) VALUES (?, ?, ?, ?)", (TOML['config']['mastodon_instance'], TOML['config']['mastodon_user'], None, None))
            sql.commit()
            changed = True
            logging.debug("added new profile to database")

        mastodon = None

        # Update if necessary
        if changed:
            logging.info('updating profile on Mastodon')

            new_avatar_img = None
            new_avatar_mime = None
            new_banner_img = None
            new_banner_mime = None

            # Download images
            new_avatar = session.get(nitter_url + new_avatar_url, timeout=HTTPS_REQ_TIMEOUT) if new_avatar_url is not None else None
            if new_avatar is not None:
                new_avatar_img = new_avatar.content if new_avatar.status_code == 200 else None
                new_avatar_mime = new_avatar.headers['content-type'] if new_avatar.status_code == 200 else None
                if new_avatar.status_code != 200:
                    logging.error("Could not download avatar image from " + nitter_url + new_avatar_url)
                    logging.error("Status code: " + str(new_avatar.status_code))
                else:
                    logging.debug("Avatar image downloaded")

            new_banner = session.get(nitter_url + new_banner_url, timeout=HTTPS_REQ_TIMEOUT) if new_banner_url is not None else None
            if new_banner is not None:
                new_banner_img = new_banner.content if new_banner.status_code == 200 else None
                new_banner_mime = new_banner.headers['content-type'] if new_banner.status_code == 200 else None
                if new_banner.status_code != 200:
                    logging.error("Could not download banner image from " + nitter_url + new_banner_url)
                    logging.error("Status code: " + str(new_banner.status_code))
                else:
                    logging.debug("Banner image downloaded")

            mastodon = login(mast_password)

            # Update profile on Mastodon
            try:
                mastodon.account_update_credentials(avatar=new_avatar_img, avatar_mime_type=new_avatar_mime, header=new_banner_img, header_mime_type=new_banner_mime)
            except Exception as e:
                logging.error("Could not update profile")
                logging.error(e)
            else:
                logging.info("Profile updated on Mastodon")
                # Add urls to database
                db.execute("UPDATE profiles SET avatar_url=?, banner_url=? WHERE mastodon_instance=? AND mastodon_account=?", (new_avatar_url, new_banner_url, TOML['config']['mastodon_instance'], TOML['config']['mastodon_user']))
                sql.commit()
                logging.debug("Profile updated on database")
        else:
            logging.info("No changes to profile found")

    return mastodon


def login(password):
    """
    Login to Mastodon account and return mastodon object used to post content
    :param password: Password associated to account. None if not provided
    :return: mastodon object
    """
    # Create Mastodon application if it does not exist yet
    if not os.path.isfile(TOML['config']['mastodon_instance'] + '.secret'):
        try:
            Mastodon.create_app(
                'feedtoot',
                api_base_url='https://' + TOML['config']['mastodon_instance'],
                to_file=TOML['config']['mastodon_instance'] + '.secret'
            )
        except MastodonError as me:
            logging.fatal('failed to create app on ' + TOML['config']['mastodon_instance'])
            logging.fatal(me)
            shutdown(-1)

    mastodon = None

    # Log in to Mastodon instance with password
    if password is not None:
        try:
            mastodon = Mastodon(
                client_id=TOML['config']['mastodon_instance'] + '.secret',
                api_base_url='https://' + TOML['config']['mastodon_instance']
            )

            mastodon.log_in(
                username=TOML['config']['mastodon_user'],
                password=password,
                to_file=TOML['config']['mastodon_user'] + ".secret"
            )
            logging.info('Logging in to ' + TOML['config']['mastodon_instance'])
        except MastodonError as me:
            logging.fatal('Login to ' + TOML['config']['mastodon_instance'] + ' Failed\n')
            logging.fatal(me)
            shutdown(-1)

        if os.path.isfile(TOML['config']['mastodon_user'] + '.secret'):
            logging.warning('''You successfully logged in using a password and an access token
                            has been saved. The password can therefore be omitted from the
                            command-line in future invocations''')
    else:  # No password provided, login with token
        # Using token in existing .secret file
        if os.path.isfile(TOML['config']['mastodon_user'] + '.secret'):
            try:
                mastodon = Mastodon(
                    access_token=TOML['config']['mastodon_user'] + '.secret',
                    api_base_url='https://' + TOML['config']['mastodon_instance'])
            except MastodonError as me:
                logging.fatal('Login to ' + TOML['config']['mastodon_instance'] + ' Failed\n')
                logging.fatal(me)
                shutdown(-1)
        else:
            logging.fatal('No .secret file found. Password required to log in')
            shutdown(-1)

    return mastodon


def deredir_url(url):
"""
Given a URL, return the URL that the page really downloads from
:param url: url to be de-redirected
:return: direct url
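    Example: a shortened link such as https://t.co/<id> is resolved to its final
    destination by following redirects with a HEAD request.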
"""
    # Check if we need to do anything
    if TOML['options']['remove_link_redirections'] is False:
        return url

    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()

    # Update default headers with randomly selected user agent
    headers.update(
        {
            'User-Agent': random.choice(USER_AGENTS),
        }
    )

    ret = None
    try:
        # Download the page
        ret = requests.head(url, headers=headers, allow_redirects=True, timeout=5)
    except Exception:
        # If anything goes wrong keep the URL intact
        return url

    if ret.url != url:
        logging.debug("Removed redirection from: " + url + " to: " + ret.url)

    # Return the URL that the page was downloaded from
    return ret.url


def substitute_source(orig_url):
"""
    :param orig_url: url to check for substitutes
:return: url with replaced domains
"""
    parsed_url = urlparse(orig_url)
    domain = parsed_url.netloc

    logging.debug("Checking domain %s for substitution", domain)

    # Handle twitter
    twitter_subst = TOML["options"]["subst_twitter"]
    # Do not substitute if subdomain is present (e.g. i.twitter.com)
    if (domain == 'twitter.com' or domain == 'www.twitter.com') and twitter_subst != []:
        domain = random.choice(twitter_subst)
        logging.debug("Replaced twitter.com by " + domain)

    # Handle youtube
    youtube_subst = TOML["options"]["subst_youtube"]
    # Do not substitute if subdomain is present (e.g. i.youtube.com)
    if (domain == 'youtube.com' or domain == 'www.youtube.com') and youtube_subst != []:
        domain = random.choice(youtube_subst)
        logging.debug("Replaced youtube.com by " + domain)

    # Handle reddit
    reddit_subst = TOML["options"]["subst_reddit"]
    # Do not substitute if subdomain is present (e.g. i.reddit.com)
    if (domain == 'reddit.com' or domain == 'www.reddit.com') and reddit_subst != []:
        domain = random.choice(reddit_subst)
        logging.debug("Replaced reddit.com by " + domain)

    dest_url = urlunparse([
        parsed_url.scheme,
        domain,
        parsed_url.path,
        parsed_url.params,
        parsed_url.query,
        parsed_url.fragment
    ])

    return dest_url


def clean_url(orig_url):
    """
    Given a URL, return it with the UTM parameters removed from query and fragment
    :param orig_url: url to be cleaned
    :return: url cleaned
    >>> clean_url('https://example.com/video/this-aerial-ropeway?utm_source=Twitter&utm_medium=video&utm_campaign=organic&utm_content=Nov13&a=aaa&b=1#mkt_tok=tik&mkt_tik=tok')
    'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
    """
    # Check if we have to do anything
    if TOML['options']['remove_trackers_from_urls'] is False:
        return orig_url

    # Parse a URL into 6 components:
    # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    url_parsed = urlparse(orig_url)

    # Reassemble URL after removal of trackers
    dest_url = urlunparse([
        url_parsed.scheme,
        url_parsed.netloc,
        url_parsed.path,
        url_parsed.params,
        _remove_trackers_query(url_parsed.query),
        _remove_trackers_fragment(url_parsed.fragment)
    ])

    if dest_url != orig_url:
        logging.debug('Cleaned URL from: ' + orig_url + ' to: ' + dest_url)

    return dest_url


def _remove_trackers_query(query_str):
"""
private function
Given a query string from a URL, strip out the known trackers
:param query_str: query to be cleaned
:return: query cleaned
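
    Example (illustrative doctest):
    >>> _remove_trackers_query('utm_source=Twitter&utm_medium=video&a=aaa&b=1')
    'a=aaa&b=1'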
"""
    # Available URL tracking parameters :
    # UTM tags by Google Ads, M$ Ads, ...
    # tag by TikTok
    # tags by Snapchat
    # tags by Facebook
    params_to_remove = {
        "gclid", "_ga", "gclsrc", "dclid",
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "utm_cid",
        "utm_reader", "utm_name", "utm_referrer", "utm_social", "utm_social-type", "utm_brand",
        "mkt_tok",
        "campaign_name", "ad_set_name", "campaign_id", "ad_set_id",
        "fbclid", "media", "interest_group_name",
        "igshid",
        "cvid", "oicd", "msclkid",
        "soc_src", "soc_trk",
        "_openstat", "yclid",
        "xtor", "xtref", "adid",
    }
    query_to_clean = dict(parse_qsl(query_str, keep_blank_values=True))
    query_cleaned = [(k, v) for k, v in query_to_clean.items() if k not in params_to_remove]
    return urlencode(query_cleaned, doseq=True)


def _remove_trackers_fragment(fragment_str):
"""
private function
Given a fragment string from a URL, strip out the known trackers
    :param fragment_str: fragment to be cleaned
:return: cleaned fragment
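
    Example (illustrative doctest):
    >>> _remove_trackers_fragment('Echobox=12345')
    ''
    >>> _remove_trackers_fragment('some-anchor')
    'some-anchor'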
"""
    params_to_remove = {
        "mkt_tok",  # needed for the clean_url doctest above to hold
        "Echobox",
    }

if '=' in fragment_str:
fragment_str = fragment_str.split('&')
query_cleaned = [i for i in fragment_str if i.split('=')[0] not in params_to_remove]
fragment_str = '&'.join(query_cleaned)
return fragment_str


def shutdown(exit_code):
    """
    Cleanly stop execution with a message on execution duration
    Remove log messages older than duration specified in config from log file
    :param exit_code: return value to pass to shell when exiting
    """
    logging.info('Run time : {t:2.1f} seconds.'.format(t=time.time() - START_TIME))
    logging.info('_____________________________________________________________________________________')

    # Close logger and log file
    logging.shutdown()

    # Remove older log messages
    # Max allowed age of log message
    max_delta = timedelta(TOML['options']['log_days'])

    # Open log file
    log_file_name = TOML['config']['twitter_account'].lower() + '.log'
    new_log_file_name = TOML['config']['twitter_account'].lower() + '.log.new'
    try:
        log_file = open(log_file_name, 'r')
    except FileNotFoundError:
        # Nothing to do if there is no log file
        exit(exit_code)

    # Check each line
    pos = log_file.tell()
    while True:
        line = log_file.readline()
        # Check if we reached the end of the file
        if not line:
            exit(exit_code)

        try:
            # Extract date on log line
            date = datetime.strptime(line[:10], '%Y-%m-%d')
        except ValueError:
            # date was not found on this line, try next one
            continue

        # Time difference between log message and now
        log_delta = datetime.now() - date
        # Only keep the number of days of the difference
        log_delta = timedelta(days=log_delta.days)
        if log_delta < max_delta:
            logging.debug("Truncating log file")
            # Reset file pointer to position before reading last line
            log_file.seek(pos)
            remainder = log_file.read()
            output_file = open(new_log_file_name, 'w')
            output_file.write(remainder)
            output_file.close()
            # replace log file by new one
            shutil.move(new_log_file_name, log_file_name)

            break  # Exit while loop

        # Update read pointer position
        pos = log_file.tell()

    exit(exit_code)


if __name__ == "__main__":
main(sys.argv)