Moved time check to beginning of process

jeancf committed 2020-12-17 17:31:43 +01:00
parent d92bcea2a7
commit 0787669a3a


@@ -31,7 +31,6 @@ import re
 from pathlib import Path
 from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError
 import subprocess
-import json.decoder
 import shutil
@@ -48,7 +47,7 @@ logging.basicConfig(filename="twoot.log", level=logging.INFO)
 logging.info('*********** NEW RUN ***********')

-def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids):
+def process_tweet_content(tt_iter, twit_account, status_id, tweet_uri, get_vids):
     '''
     Receives an iterator over all the elements contained in the tweet-text container.
     Processes them to remove Twitter-specific stuff and make them suitable for
@@ -155,6 +154,18 @@ def contains_class(body_classes, some_class):

     return found


+def is_time_valid(timestamp, max_age, min_delay):
+    ret = True
+    # Check that the tweet is not too young (might be deleted) or too old
+    age_in_hours = (time.time() - float(timestamp)) / 3600.0
+    min_delay_in_hours = min_delay / 60.0
+    max_age_in_hours = max_age * 24.0
+
+    if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours:
+        ret = False
+
+    return ret
+
+
 def main(argv):
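A minimal sketch of how the new is_time_valid() helper filters tweets, assuming timestamp is a Unix epoch value, max_age is expressed in days and min_delay in minutes (as the conversions above suggest). The function is restated here in condensed form and the sample values are hypothetical:

import time

# Condensed restatement of the helper above, for illustration only
def is_time_valid(timestamp, max_age, min_delay):
    age_in_hours = (time.time() - float(timestamp)) / 3600.0
    # Valid when the tweet is older than min_delay (minutes) and younger than max_age (days)
    return min_delay / 60.0 <= age_in_hours <= max_age * 24.0

now = time.time()
print(is_time_valid(now - 30 * 60, max_age=5, min_delay=10))     # 30 minutes old -> True
print(is_time_valid(now - 2 * 60, max_age=5, min_delay=10))      # 2 minutes old  -> False (too fresh)
print(is_time_valid(now - 10 * 86400, max_age=5, min_delay=10))  # 10 days old    -> False (too old)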
@@ -237,7 +248,12 @@ def main(argv):
     # Extract twitter timeline
     timeline = soup.find_all('div', class_='timeline-item')

-    logging.info('Processing timeline')
+    logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')
+
+    # **********************************************************
+    # Process each tweet and generate a dictionary
+    # with data ready to be posted on Mastodon
+    # **********************************************************
     for status in timeline:
         # Extract tweet ID and status ID
         tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
@@ -245,6 +261,15 @@ def main(argv):

         logging.debug('processing tweet %s', tweet_id)

+        # Extract time stamp
+        time_string = status.find('span', class_='tweet-date').a.get('title')
+        timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp()
+
+        # Check if time is within acceptable range
+        if not is_time_valid(timestamp, max_age, min_delay):
+            logging.debug("Tweet outside valid time range, skipping")
+            continue
+
         # Check in database if tweet has already been posted
         db.execute("SELECT * FROM toots WHERE twitter_account=? AND mastodon_instance=? AND mastodon_account=? AND tweet_id=?",
                    (twit_account, mast_instance, mast_account, tweet_id))
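For reference, a small sketch of the timestamp parsing added above, assuming the title attribute of the tweet-date link holds a string like '17/12/2020, 17:31:43' (the sample value is hypothetical):

import datetime

# Hypothetical sample of the tweet-date title attribute scraped above
time_string = '17/12/2020, 17:31:43'

# Same format string as in the hunk: day/month/year, 24-hour time
timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp()
print(timestamp)  # Unix epoch (float), interpreted in the local timezone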
@@ -257,19 +282,6 @@ def main(argv):

         else:
             logging.debug('Tweet %s not found in database', tweet_id)

-        reply_to_username = None
-        # TODO Check if the tweet is a reply-to
-        reply_to_div = None
-        if reply_to_div is not None:
-            # Do we need to handle reply-to tweets?
-            if tweets_and_replies:
-                # TODO Capture user name being replied to
-                pass
-            else:
-                # Skip this tweet
-                logging.debug("Tweet is a reply-to and we don't want that. Skipping.")
-                continue
-
         # extract author
         author = status.find('a', class_='fullname').get('title')
@@ -279,22 +291,16 @@ def main(argv):

         # Extract URL of full status page (for video download)
         full_status_url = 'https://twitter.com' + tweet_id

-        # Extract time stamp
-        time_string = status.find('span', class_='tweet-date').a.get('title')
-        timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp()
+        # TODO Check if the tweet is a reply-to
+
+        # Check if the tweet is a retweet from somebody else
+        tweet_text = ''
+        if author_account.lower() != twit_account.lower():
+            tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n'

         # extract iterator over tweet text contents
         tt_iter = status.find('div', class_='tweet-content media-body').children

-        tweet_text = cleanup_tweet_text(tt_iter, twit_account, status_id, full_status_url, get_vids)
-
-        # Mention if the tweet is a reply-to
-        if reply_to_username is not None:
-            tweet_text = 'In reply to ' + reply_to_username + '\n\n' + tweet_text
-
-        # Check it the tweet is a retweet from somebody else
-        if author_account.lower() != twit_account.lower():
-            tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n' + tweet_text
+        tweet_text += process_tweet_content(tt_iter, twit_account, status_id, full_status_url, get_vids)

         # Add footer with link to original tweet
         tweet_text += '\n\nOriginal tweet : ' + full_status_url
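A toy illustration of roughly how the toot text is assembled after this change. All values are hypothetical, and tweet_content stands in for the result of process_tweet_content():

# Hypothetical values, for illustration only
author = 'John Doe'
author_account = 'johndoe'
twit_account = 'someotheraccount'
full_status_url = 'https://twitter.com/johndoe/status/1234567890'
tweet_content = 'Hello world'  # stands in for process_tweet_content(...)

tweet_text = ''
# Prefix retweets with the original author, as in the hunk above
if author_account.lower() != twit_account.lower():
    tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n'

tweet_text += tweet_content
tweet_text += '\n\nOriginal tweet : ' + full_status_url
print(tweet_text)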
@@ -358,15 +364,17 @@ def main(argv):
         }
         tweets.append(tweet)

-        logging.debug('Tweet %s added to list to upload', tweet_id)
+        logging.debug('Tweet %s added to list of toots to upload', tweet_id)
+
+    # TODO Log summary stats: how many not in db, how many in valid timeframe

     # DEBUG: Print extracted tweets
-    # for t in tweets:
-    #     print(t)
+    #for t in tweets:
+    #print(t)

     # **********************************************************
     # Iterate tweets in list.
-    # post each on Mastodon and reference to it in database
+    # post each on Mastodon and record it in database
     # **********************************************************

     # Create Mastodon application if it does not exist yet
@@ -396,22 +404,12 @@ def main(argv):
         )
     except MastodonError as me:
-        print('ERROR: Login to ' + mast_instance + ' Failed')
-        print(me)
+        logging.fatal('ERROR: Login to ' + mast_instance + ' Failed\n' + str(me))
         sys.exit(1)

     # Upload tweets
     for tweet in reversed(tweets):
         logging.debug('Uploading Tweet %s', tweet["tweet_id"])

-        # Check that the tweet is not too young (might be deleted) or too old
-        age_in_hours = (time.time() - float(tweet['timestamp'])) / 3600.0
-        min_delay_in_hours = min_delay / 60.0
-        max_age_in_hours = max_age * 24.0
-
-        if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours:
-            # Skip to next tweet
-            logging.debug("Tweet too young or too old, skipping")
-            continue
-
         media_ids = []
@@ -444,7 +442,6 @@ def main(argv):
                 pass

         # Post toot
-        logging.debug('Doing it now')
         try:
             mastodon = Mastodon(
                 access_token=mast_account + '.secret',