From 9fc76b9981c3da109dad21383bbf4b657a9d0ba3 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 18:47:27 +0100 Subject: [PATCH 01/42] Updated user agents --- twoot.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/twoot.py b/twoot.py index c5e6274..ecd468e 100755 --- a/twoot.py +++ b/twoot.py @@ -37,10 +37,10 @@ import shutil # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/ USER_AGENTS = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/73.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13 Safari/605.1.15', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edge/44.18363.8131', + 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60', ] # Setup logging to file From 894c13d551376330af43da647c5b7036686f4435 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 19:43:17 +0100 Subject: [PATCH 02/42] Download page from nitter.net --- twoot.py | 62 ++++++++++++-------------------------------------------- 1 file changed, 13 insertions(+), 49 deletions(-) diff --git a/twoot.py b/twoot.py index ecd468e..6ec825a 100755 --- a/twoot.py +++ b/twoot.py @@ -44,48 +44,8 @@ USER_AGENTS = [ ] # Setup logging to file -logging.basicConfig(filename="twoot.log", level=logging.WARNING) -logging.debug('*********** NEW RUN ***********') - -def handle_no_js(session, page, headers): - """ - Check if page is a "No Javascript" page instead of the content that we wanted - If it is, submit the form on the page as POST request to get the correct page and return it - :param session: current requests session - :param page: Response object to check - :param headers: HTTP headers used in initial request - :return: correct page (Response object) - """ - # DEBUG: Save page to file - #of = open('no_js_page.html', 'w') - #of.write(page.text) - #of.close() - - # Set default return value - new_page = page - - # Make soup - soup = BeautifulSoup(page.text, 'html.parser') - - if soup.form.p is not None: - if 'JavaScript is disabled' in str(soup.form.p.string): - # Submit POST form response with cookies - headers.update( - { - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': page.request.url, - } - ) - - action = soup.form.get('action') - - # Submit the form - new_page = session.post(action, headers=headers, cookies=page.cookies) - - # Verify that download worked - assert (new_page.status_code == 200), 'The twitter page did not download correctly. Aborting' - - return new_page +logging.basicConfig(filename="twoot.log", level=logging.INFO) +logging.info('*********** NEW RUN ***********') def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids): @@ -195,6 +155,7 @@ def contains_class(body_classes, some_class): return found + def main(argv): # Build parser for command line arguments @@ -220,6 +181,8 @@ def main(argv): max_age = float(args['a']) min_delay = float(args['d']) + logging.info('Updating ' + twit_account + ' on ' + mast_instance) + # Try to open database. If it does not exist, create it sql = sqlite3.connect('twoot.db') db = sql.cursor() @@ -246,21 +209,22 @@ def main(argv): } ) - url = 'https://mobile.twitter.com/' + twit_account - # Download twitter page of user. We should get a 'no javascript' landing page and some cookies + url = 'https://nitter.net/' + twit_account + # Download twitter page of user. twit_account_page = session.get(url, headers=headers) # Verify that download worked assert twit_account_page.status_code == 200,\ 'The twitter page did not download correctly. Aborting' - # If we got a No Javascript page, download the correct page - twit_account_page = handle_no_js(session, twit_account_page, headers) + logging.info('Page downloaded successfully') # DEBUG: Save page to file - #of = open(twit_account + '.html', 'w') - #of.write(twit_account_page.text) - #of.close() + of = open(twit_account + '.html', 'w') + of.write(twit_account_page.text) + of.close() + + exit(0) # Make soup soup = BeautifulSoup(twit_account_page.text, 'html.parser') From e2841535f64637c5af96e9f7b7ad312ab20c01a8 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 20:42:44 +0100 Subject: [PATCH 03/42] Extracted twit_account --- twoot.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/twoot.py b/twoot.py index 6ec825a..bf41671 100755 --- a/twoot.py +++ b/twoot.py @@ -215,7 +215,7 @@ def main(argv): # Verify that download worked assert twit_account_page.status_code == 200,\ - 'The twitter page did not download correctly. Aborting' + 'The nitter page did not download correctly. Aborting' logging.info('Page downloaded successfully') @@ -224,17 +224,14 @@ def main(argv): of.write(twit_account_page.text) of.close() - exit(0) - # Make soup soup = BeautifulSoup(twit_account_page.text, 'html.parser') - # Verify that we now have the correct twitter page - body_classes = soup.body.get_attribute_list('class') - assert contains_class(body_classes, 'users-show-page'), 'This is not the correct twitter page. Quitting' - # Replace twit_account with version with correct capitalization - twit_account = soup.find('span', class_='screen-name').get_text() + ta = soup.find('meta', property='og:title').get('content') + twit_account = re.search('\(@(.+)\)', ta).group(1) + print(twit_account) + exit(0) # Extract twitter timeline timeline = soup.find_all('table', class_='tweet') From 910b7a8b13651a43cbbe10ac1bfac4b74cad1501 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 20:48:00 +0100 Subject: [PATCH 04/42] Safer implementation --- twoot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index bf41671..7a26af0 100755 --- a/twoot.py +++ b/twoot.py @@ -229,7 +229,10 @@ def main(argv): # Replace twit_account with version with correct capitalization ta = soup.find('meta', property='og:title').get('content') - twit_account = re.search('\(@(.+)\)', ta).group(1) + ta_match = re.search('\(@(.+)\)', ta) + if ta_match is not None: + twit_account = ta_match.group(1) + print(twit_account) exit(0) From c25e36b498958d5fc14567574b480dfbdf74b6b4 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 20:55:26 +0100 Subject: [PATCH 05/42] Extracted timeline --- twoot.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/twoot.py b/twoot.py index 7a26af0..57d2641 100755 --- a/twoot.py +++ b/twoot.py @@ -233,11 +233,10 @@ def main(argv): if ta_match is not None: twit_account = ta_match.group(1) - print(twit_account) - exit(0) - # Extract twitter timeline - timeline = soup.find_all('table', class_='tweet') + timeline = soup.find_all('div', class_='timeline-item') + print(len(timeline)) + exit(0) for status in timeline: # Extract tweet ID and status ID From 7cc076053febf993ddbda8a81e9e1603d41467b9 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 21:55:13 +0100 Subject: [PATCH 06/42] Extracted tweet_id and status_id --- twoot.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/twoot.py b/twoot.py index 57d2641..d1420f0 100755 --- a/twoot.py +++ b/twoot.py @@ -235,12 +235,11 @@ def main(argv): # Extract twitter timeline timeline = soup.find_all('div', class_='timeline-item') - print(len(timeline)) - exit(0) + logging.info('Processing timeline') for status in timeline: # Extract tweet ID and status ID - tweet_id = str(status['href']).strip('?p=v') + tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m') status_id = tweet_id.split('/')[3] logging.debug('processing tweet %s', tweet_id) @@ -250,10 +249,6 @@ def main(argv): (twit_account, mast_instance, mast_account, tweet_id)) tweet_in_db = db.fetchone() - logging.debug("SELECT * FROM toots WHERE twitter_account='{}' AND mastodon_instance='{}' AND mastodon_account='{}' AND tweet_id='{}'" - .format(twit_account, mast_instance, mast_account, tweet_id) - ) - if tweet_in_db is not None: logging.debug("Tweet %s already in database", tweet_id) # Skip to next tweet @@ -262,13 +257,12 @@ def main(argv): logging.debug('Tweet %s not found in database', tweet_id) reply_to_username = None - # Check if the tweet is a reply-to - reply_to_div = status.find('div', class_='tweet-reply-context username') + # TODO Check if the tweet is a reply-to + reply_to_div = None if reply_to_div is not None: # Do we need to handle reply-to tweets? if tweets_and_replies: - # Capture user name being replied to - reply_to_username = reply_to_div.a.get_text() + # TODO Capture user name being replied to else: # Skip this tweet logging.debug("Tweet is a reply-to and we don't want that. Skipping.") From e87599d40b538e20b084cdc1411d631c896288f5 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 21:57:03 +0100 Subject: [PATCH 07/42] Removed downloading of full status page of the tweet --- twoot.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/twoot.py b/twoot.py index d1420f0..816ffe6 100755 --- a/twoot.py +++ b/twoot.py @@ -268,32 +268,6 @@ def main(argv): logging.debug("Tweet is a reply-to and we don't want that. Skipping.") continue - # Extract url of full status page - full_status_url = 'https://mobile.twitter.com' + tweet_id + '?p=v' - - # fetch full status page - full_status_page = session.get(full_status_url, headers=headers) - - # Verify that download worked - assert full_status_page.status_code == 200, \ - 'The twitter page did not download correctly. Aborting' - - # If we got a No Javascript page, download the correct page - full_status_page = handle_no_js(session, full_status_page, headers) - - # DEBUG: Save page to file - #of = open('full_status_page.html', 'w') - #of.write(full_status_page.text) - #of.close() - - # Make soup - soup = BeautifulSoup(full_status_page.text, 'html.parser') - - # Verify that we now have the correct twitter page - body_classes = soup.body.get_attribute_list('class') - assert contains_class(body_classes, 'tweets-show-page'), \ - 'This is not the correct twitter page. Quitting' - # Check if tweet contains pic censored as "Sensitive material" if soup.find('div', class_='accept-data') is not None: # If it does, submit form to obtain uncensored tweet From 4e6a97d765c7fbe980241647aa6f4e3ca206eb9f Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 21:58:24 +0100 Subject: [PATCH 08/42] Removed downloading of status page with uncensored pics --- twoot.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/twoot.py b/twoot.py index 816ffe6..e5b66d9 100755 --- a/twoot.py +++ b/twoot.py @@ -268,40 +268,6 @@ def main(argv): logging.debug("Tweet is a reply-to and we don't want that. Skipping.") continue - # Check if tweet contains pic censored as "Sensitive material" - if soup.find('div', class_='accept-data') is not None: - # If it does, submit form to obtain uncensored tweet - # Submit POST form response with cookies - headers.update( - { - 'Origin': 'https://mobile.twitter.com', - 'Host': 'mobile.twitter.com', - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': full_status_url, - } - ) - - # Data payload for POST request - authenticity_token = soup.find('input', {'name': 'authenticity_token'}).get('value') - form_input = {'show_media': 1, 'authenticity_token': authenticity_token, 'commit': 'Display media'} - - full_status_page = session.post(full_status_url, data=form_input, headers=headers) - - # Verify that download worked - assert full_status_page.status_code == 200, \ - 'The twitter page did not download correctly. Aborting' - - # DEBUG: Save page to file - #of = open('full_status_page_uncensored.html', 'w') - #of.write(full_status_page.text) - #of.close() - - # Remake soup - soup = BeautifulSoup(full_status_page.text, 'html.parser') - - # Isolate table main-tweet - tmt = soup.find('table', class_='main-tweet') - # Extract avatar author_logo_url = tmt.find('td', class_='avatar').a.img['src'] From 19d988dfcbd0e7ad7a382fb0447e8b3dad619b73 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 22:03:09 +0100 Subject: [PATCH 09/42] Removed extracting avatar --- twoot.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/twoot.py b/twoot.py index e5b66d9..5a310cd 100755 --- a/twoot.py +++ b/twoot.py @@ -268,9 +268,6 @@ def main(argv): logging.debug("Tweet is a reply-to and we don't want that. Skipping.") continue - # Extract avatar - author_logo_url = tmt.find('td', class_='avatar').a.img['src'] - # extract author author = tmt.find('div', class_='fullname').a.strong.get_text() From e6e24cbfd5e7b1e77f6312a88cd20d06b92f9905 Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 22:15:27 +0100 Subject: [PATCH 10/42] Extracted author, author_account, time_string, timestamp --- twoot.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/twoot.py b/twoot.py index 5a310cd..d312626 100755 --- a/twoot.py +++ b/twoot.py @@ -263,20 +263,21 @@ def main(argv): # Do we need to handle reply-to tweets? if tweets_and_replies: # TODO Capture user name being replied to + pass else: # Skip this tweet logging.debug("Tweet is a reply-to and we don't want that. Skipping.") continue # extract author - author = tmt.find('div', class_='fullname').a.strong.get_text() + author = status.find('a', class_='fullname').get('title') # Extract user name - author_account = str(tmt.find('span', class_='username').span.next_sibling).strip('\n ') + author_account = status.find('a', class_='username').get('title').lstrip('@') # Extract time stamp - time_string = tmt.find('div', class_='metadata').a.get_text() - timestamp = datetime.datetime.strptime(time_string, '%I:%M %p - %d %b %Y').timestamp() + time_string = status.find('span', class_='tweet-date').a.get('title') + timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp() # extract iterator over tweet text contents tt_iter = tmt.find('div', class_='tweet-text').div.children From 857a7f9b9e735d17139c8eb2bd0358dfe2fc1cec Mon Sep 17 00:00:00 2001 From: jeancf Date: Wed, 16 Dec 2020 22:46:01 +0100 Subject: [PATCH 11/42] Extracted full_status_url --- twoot.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/twoot.py b/twoot.py index d312626..9b8f54d 100755 --- a/twoot.py +++ b/twoot.py @@ -275,12 +275,15 @@ def main(argv): # Extract user name author_account = status.find('a', class_='username').get('title').lstrip('@') + # Extract URL of full status page (for video download) + full_status_url = 'https://twitter.com' + tweet_id + # Extract time stamp time_string = status.find('span', class_='tweet-date').a.get('title') timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp() # extract iterator over tweet text contents - tt_iter = tmt.find('div', class_='tweet-text').div.children + tt_iter = status.find('div', class_='tweet-content media-body').children tweet_text = cleanup_tweet_text(tt_iter, twit_account, status_id, full_status_url, get_vids) @@ -293,12 +296,12 @@ def main(argv): tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n' + tweet_text # Add footer with link to original tweet - tweet_text += '\n\nOriginal tweet : https://twitter.com' + tweet_id + tweet_text += '\n\nOriginal tweet : ' + full_status_url photos = [] # The no_js version of twitter only shows one photo # Check if there are photos attached - media = tmt.find('div', class_='media') + media = status.find('div', class_='media') if media: # Extract photo url and add it to list pic = str(media.img['src']).strip(':small') From 3a2c8093a39caaee4535fc96b9ecfec57e5c1694 Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 10:15:46 +0100 Subject: [PATCH 12/42] Improved logging in cleanup_tweet_text --- twoot.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/twoot.py b/twoot.py index 9b8f54d..0293685 100755 --- a/twoot.py +++ b/twoot.py @@ -104,9 +104,9 @@ def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids): tweet_text += '\n\n[Video embedded in original tweet]' except OSError: logging.error("Could not execute twitterdl.py (is it there? Is it set as executable?)") - sys.exit(-1) except subprocess.TimeoutExpired: # Video download and encoding took too long + logging.error("twitterdl.py execution timed out") tweet_text += '\n\n[Video embedded in original tweet]' else: tweet_text += '\n\n[Video embedded in original tweet]' @@ -137,7 +137,7 @@ def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids): pass else: - print("*** WARNING: No handler for tag in twitter text: " + tag.prettify()) + logging.warning("No handler for tag in twitter text: " + tag.prettify()) return tweet_text @@ -349,7 +349,6 @@ def main(argv): tweet = { "author": author, "author_account": author_account, - "author_logo_url": author_logo_url, "timestamp": timestamp, "tweet_id": tweet_id, "tweet_text": tweet_text, From d92bcea2a7e9d1bfa5e23d6ea88cda9f4fab1428 Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 10:44:30 +0100 Subject: [PATCH 13/42] Added cookie to preserve twitter and youtube addresses --- twoot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/twoot.py b/twoot.py index 0293685..35e8503 100755 --- a/twoot.py +++ b/twoot.py @@ -206,6 +206,7 @@ def main(argv): headers.update( { 'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS)-1)], + 'Cookie': 'replaceTwitter=; replaceYouTube=', } ) From 0787669a3a3557e7f9cce39a1382e663df0529b4 Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 17:31:43 +0100 Subject: [PATCH 14/42] Moved time check to beginning of process --- twoot.py | 85 +++++++++++++++++++++++++++----------------------------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/twoot.py b/twoot.py index 35e8503..ee418c9 100755 --- a/twoot.py +++ b/twoot.py @@ -31,7 +31,6 @@ import re from pathlib import Path from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError import subprocess -import json.decoder import shutil @@ -48,7 +47,7 @@ logging.basicConfig(filename="twoot.log", level=logging.INFO) logging.info('*********** NEW RUN ***********') -def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids): +def process_tweet_content(tt_iter, twit_account, status_id, tweet_uri, get_vids): ''' Receives an iterator over all the elements contained in the tweet-text container. Processes them to remove Twitter-specific stuff and make them suitable for @@ -155,6 +154,18 @@ def contains_class(body_classes, some_class): return found +def is_time_valid(timestamp, max_age, min_delay): + ret = True + # Check that the tweet is not too young (might be deleted) or too old + age_in_hours = (time.time() - float(timestamp)) / 3600.0 + min_delay_in_hours = min_delay / 60.0 + max_age_in_hours = max_age * 24.0 + + if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours: + ret = False + + return ret + def main(argv): @@ -237,7 +248,12 @@ def main(argv): # Extract twitter timeline timeline = soup.find_all('div', class_='timeline-item') - logging.info('Processing timeline') + logging.info('Processing ' + len(timeline) + ' tweets found in timeline') + + # ********************************************************** + # Process each tweets and generate dictionary + # with data ready to be posted on Mastodon + # ********************************************************** for status in timeline: # Extract tweet ID and status ID tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m') @@ -245,6 +261,15 @@ def main(argv): logging.debug('processing tweet %s', tweet_id) + # Extract time stamp + time_string = status.find('span', class_='tweet-date').a.get('title') + timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp() + + # Check if time is within acceptable range + if not is_time_valid(timestamp, max_age, min_delay): + logging.debug("Tweet outside valid time range, skipping") + continue + # Check in database if tweet has already been posted db.execute("SELECT * FROM toots WHERE twitter_account=? AND mastodon_instance=? AND mastodon_account=? AND tweet_id=?", (twit_account, mast_instance, mast_account, tweet_id)) @@ -257,19 +282,6 @@ def main(argv): else: logging.debug('Tweet %s not found in database', tweet_id) - reply_to_username = None - # TODO Check if the tweet is a reply-to - reply_to_div = None - if reply_to_div is not None: - # Do we need to handle reply-to tweets? - if tweets_and_replies: - # TODO Capture user name being replied to - pass - else: - # Skip this tweet - logging.debug("Tweet is a reply-to and we don't want that. Skipping.") - continue - # extract author author = status.find('a', class_='fullname').get('title') @@ -279,22 +291,16 @@ def main(argv): # Extract URL of full status page (for video download) full_status_url = 'https://twitter.com' + tweet_id - # Extract time stamp - time_string = status.find('span', class_='tweet-date').a.get('title') - timestamp = datetime.datetime.strptime(time_string, '%d/%m/%Y, %H:%M:%S').timestamp() + # TODO Check if the tweet is a reply-to + + # Check it the tweet is a retweet from somebody else + if author_account.lower() != twit_account.lower(): + tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n' # extract iterator over tweet text contents tt_iter = status.find('div', class_='tweet-content media-body').children - tweet_text = cleanup_tweet_text(tt_iter, twit_account, status_id, full_status_url, get_vids) - - # Mention if the tweet is a reply-to - if reply_to_username is not None: - tweet_text = 'In reply to ' + reply_to_username + '\n\n' + tweet_text - - # Check it the tweet is a retweet from somebody else - if author_account.lower() != twit_account.lower(): - tweet_text = 'RT from ' + author + ' (@' + author_account + ')\n\n' + tweet_text + tweet_text += process_tweet_content(tt_iter, twit_account, status_id, full_status_url, get_vids) # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url @@ -358,15 +364,17 @@ def main(argv): } tweets.append(tweet) - logging.debug('Tweet %s added to list to upload', tweet_id) + logging.debug('Tweet %s added to list of toots to upload', tweet_id) + + # TODO Log summary stats: how many not in db, how many in valid timeframe # DEBUG: Print extracted tweets -# for t in tweets: -# print(t) + #for t in tweets: + #print(t) # ********************************************************** # Iterate tweets in list. - # post each on Mastodon and reference to it in database + # post each on Mastodon and record it in database # ********************************************************** # Create Mastodon application if it does not exist yet @@ -396,22 +404,12 @@ def main(argv): ) except MastodonError as me: - print('ERROR: Login to ' + mast_instance + ' Failed') - print(me) + logging.fatal('ERROR: Login to ' + mast_instance + ' Failed\n' + me) sys.exit(1) # Upload tweets for tweet in reversed(tweets): logging.debug('Uploading Tweet %s', tweet["tweet_id"]) - # Check that the tweet is not too young (might be deleted) or too old - age_in_hours = (time.time() - float(tweet['timestamp'])) / 3600.0 - min_delay_in_hours = min_delay / 60.0 - max_age_in_hours = max_age * 24.0 - - if age_in_hours < min_delay_in_hours or age_in_hours > max_age_in_hours: - # Skip to next tweet - logging.debug("Tweet too young or too old, skipping") - continue media_ids = [] @@ -444,7 +442,6 @@ def main(argv): pass # Post toot - logging.debug('Doing it now') try: mastodon = Mastodon( access_token=mast_account + '.secret', From 557ef6deb9c74298e7044f11d7a605f11c456ff5 Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 17:50:10 +0100 Subject: [PATCH 15/42] Handling reply-to --- twoot.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index ee418c9..294ea21 100755 --- a/twoot.py +++ b/twoot.py @@ -222,6 +222,10 @@ def main(argv): ) url = 'https://nitter.net/' + twit_account + # Use different page if we need to handle replies + if tweets_and_replies: + url += '/with_replies' + # Download twitter page of user. twit_account_page = session.get(url, headers=headers) @@ -248,7 +252,7 @@ def main(argv): # Extract twitter timeline timeline = soup.find_all('div', class_='timeline-item') - logging.info('Processing ' + len(timeline) + ' tweets found in timeline') + logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline') # ********************************************************** # Process each tweets and generate dictionary @@ -292,6 +296,8 @@ def main(argv): full_status_url = 'https://twitter.com' + tweet_id # TODO Check if the tweet is a reply-to + #
Replying to @tomwarren
+ being_replied_to = status.find('div', class_='replying-to').a.get_text() # Check it the tweet is a retweet from somebody else if author_account.lower() != twit_account.lower(): From fbec4004f904b885af5bee1e73045e1e97bad48f Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 17:56:12 +0100 Subject: [PATCH 16/42] Handled reply-to --- twoot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 294ea21..225ebc0 100755 --- a/twoot.py +++ b/twoot.py @@ -295,9 +295,14 @@ def main(argv): # Extract URL of full status page (for video download) full_status_url = 'https://twitter.com' + tweet_id + # Initialize tweet text + tweet_text = '' + # TODO Check if the tweet is a reply-to #
Replying to @tomwarren
- being_replied_to = status.find('div', class_='replying-to').a.get_text() + replying_to_class = status.find('div', class_='replying-to') + if replying_to_class is not None: + tweet_text += 'Replying to ' + replying_to_class.a.get_text() # Check it the tweet is a retweet from somebody else if author_account.lower() != twit_account.lower(): From 992f91537f3ec38f82394d68ab96bbd683649911 Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 18:59:02 +0100 Subject: [PATCH 17/42] TODO done --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 225ebc0..bf4b8cc 100755 --- a/twoot.py +++ b/twoot.py @@ -298,7 +298,7 @@ def main(argv): # Initialize tweet text tweet_text = '' - # TODO Check if the tweet is a reply-to + # Add prefix if the tweet is a reply-to #
Replying to @tomwarren
replying_to_class = status.find('div', class_='replying-to') if replying_to_class is not None: From 711ec9677ad404161290735cfc35b620ca41d35d Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 21:44:32 +0100 Subject: [PATCH 18/42] Added a bunch of TODO --- twoot.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/twoot.py b/twoot.py index bf4b8cc..83dcf1d 100755 --- a/twoot.py +++ b/twoot.py @@ -47,7 +47,7 @@ logging.basicConfig(filename="twoot.log", level=logging.INFO) logging.info('*********** NEW RUN ***********') -def process_tweet_content(tt_iter, twit_account, status_id, tweet_uri, get_vids): +def process_media_body(tt_iter, twit_account, status_id, tweet_uri, get_vids): ''' Receives an iterator over all the elements contained in the tweet-text container. Processes them to remove Twitter-specific stuff and make them suitable for @@ -299,7 +299,6 @@ def main(argv): tweet_text = '' # Add prefix if the tweet is a reply-to - #
Replying to @tomwarren
replying_to_class = status.find('div', class_='replying-to') if replying_to_class is not None: tweet_text += 'Replying to ' + replying_to_class.a.get_text() @@ -311,7 +310,13 @@ def main(argv): # extract iterator over tweet text contents tt_iter = status.find('div', class_='tweet-content media-body').children - tweet_text += process_tweet_content(tt_iter, twit_account, status_id, full_status_url, get_vids) + tweet_text += process_media_body(tt_iter, twit_account, status_id, full_status_url, get_vids) + + # TODO Process quote: append link to tweet_text + + # TODO Process card : extract image or youtube link + + # TODO Process attachment: capture image or .mp4 url or download twitter video # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url From 80799142828a477939c103e1a98e2fcf9a93b34c Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 22:08:43 +0100 Subject: [PATCH 19/42] Reworked process_media_body --- twoot.py | 90 +++++++++----------------------------------------------- 1 file changed, 14 insertions(+), 76 deletions(-) diff --git a/twoot.py b/twoot.py index 83dcf1d..fed7e62 100755 --- a/twoot.py +++ b/twoot.py @@ -47,16 +47,12 @@ logging.basicConfig(filename="twoot.log", level=logging.INFO) logging.info('*********** NEW RUN ***********') -def process_media_body(tt_iter, twit_account, status_id, tweet_uri, get_vids): +def process_media_body(tt_iter): ''' Receives an iterator over all the elements contained in the tweet-text container. - Processes them to remove Twitter-specific stuff and make them suitable for - posting on Mastodon + Processes them to make them suitable for posting on Mastodon :param tt_iter: iterator over the HTML elements in the text of the tweet - :param twit_account: Used to name directory where videos are downloaded - :param status_id: Used to name directory where videos are downloaded - :param tweet_uri: Used to downloaded videos - :param get_vids: True to download embedded twitter videos and save them on the filesystem + :return: cleaned up text of the tweet ''' tweet_text = '' # Iterate elements @@ -66,75 +62,17 @@ def process_media_body(tt_iter, twit_account, status_id, tweet_uri, get_vids): tweet_text += tag.string # If it is an 'a' html tag - elif tag.name == 'a' and tag.has_attr('class'): - # If element is a #hashtag, only keep text - for tc in tag['class']: - if tc == 'twitter-hashtag': - tweet_text += tag.get_text() - - # If element is a mention of @someuser, only keep text - elif tc == 'twitter-atreply': - tweet_text += tag.get_text() - - # If element is an external link - elif tc == 'twitter_external_link': - # If element is a simple link - if tag.has_attr('data-expanded-url'): - # Add a sometimes missing space before url - if not tweet_text.endswith(' ') and not tweet_text.endswith('\n'): - tweet_text += ' ' - # Add full url - tweet_text += tag['data-expanded-url'] - if tag.has_attr('data-expanded-path'): - data_expanded_path = tag['data-expanded-path'] - if 'video' in data_expanded_path: - if get_vids: - # Download video from twitter and store in filesystem. Running as subprocess to avoid - # requirement to install ffmpeg and ffmpeg-python for those that do not want to post videos - try: - # Set output location to ./output/twit_account/status_id - dl_feedback = subprocess.run( - ["./twitterdl.py", tweet_uri, "-ooutput/" + twit_account + "/" + status_id, "-w 500"], - capture_output=True, - timeout=300 # let's try 5 minutes - ) - if dl_feedback.returncode != 0: - logging.warning('Video in tweet ' + status_id + ' from ' + twit_account + ' failed to download') - tweet_text += '\n\n[Video embedded in original tweet]' - except OSError: - logging.error("Could not execute twitterdl.py (is it there? Is it set as executable?)") - except subprocess.TimeoutExpired: - # Video download and encoding took too long - logging.error("twitterdl.py execution timed out") - tweet_text += '\n\n[Video embedded in original tweet]' - else: - tweet_text += '\n\n[Video embedded in original tweet]' - - # If element is hashflag (hashtag + icon), handle as simple hashtag - elif tag.name == 'span' and tag['class'][0] == 'twitter-hashflag-container': - tweet_text += tag.a.get_text() - - # If tag is an image - elif tag.name == 'img': - # If it is of class 'Emoji' - for tc in tag['class']: - if tc == 'Emoji': - # Get url of Emoji - src = tag["src"] - # Use regex to extract unicode characters from file name - uni_str = re.search('/([0-9A-Fa-f\-]+?).png$', src).group(1) - # build the list of hex unicode characters separated by '-' in the file name - uni_list = uni_str.split('-') - # Extract individual unicode chars and add them to the tweet - for uni_char in uni_list: - # convert string to hex value of unicode character - tweet_text += chr(int(uni_char, 16)) - - # elif tag is a geographical point of interest - elif tag.name == 'span' and tag['class'][0] == 'tweet-poi-geo-text': - # Not sure what to do - pass - + elif tag.name == 'a': + tag_text = tag.get_text() + if tag_text.starts_with('@'): + # Only keep user name + tweet_text += tag_text + elif tag_text.starts_with('#'): + # Only keep hashtag text + tweet_text += tag_text + else: + # This is a real link, keep url + tweet_text += tag.get('href') else: logging.warning("No handler for tag in twitter text: " + tag.prettify()) From 14c24fe847964976472c7d6826b69703339c358f Mon Sep 17 00:00:00 2001 From: jeancf Date: Thu, 17 Dec 2020 22:59:21 +0100 Subject: [PATCH 20/42] started process_attachments() --- twoot.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/twoot.py b/twoot.py index fed7e62..7b9c634 100755 --- a/twoot.py +++ b/twoot.py @@ -79,6 +79,22 @@ def process_media_body(tt_iter): return tweet_text +def process_card(card_container): + ''' + Extract image from card in case mastodon does not do it + :param card_container: soup of 'a' tag containing card markup + :return: list with url of image + ''' + list = [] + link = card_container.get('href') + + # Dailymotion + if link.contains('dailymotion.com'): + image_url = 'twitter.com' + card_container.div.div.img.get('src') + list.append(image_url) + + return list + def contains_class(body_classes, some_class): ''' :param body_classes: list of classes to search @@ -233,8 +249,9 @@ def main(argv): # Extract URL of full status page (for video download) full_status_url = 'https://twitter.com' + tweet_id - # Initialize tweet text + # Initialize containers tweet_text = '' + photos = [] # Add prefix if the tweet is a reply-to replying_to_class = status.find('div', class_='replying-to') @@ -248,18 +265,27 @@ def main(argv): # extract iterator over tweet text contents tt_iter = status.find('div', class_='tweet-content media-body').children - tweet_text += process_media_body(tt_iter, twit_account, status_id, full_status_url, get_vids) + # Process text of tweet + tweet_text += process_media_body(tt_iter) - # TODO Process quote: append link to tweet_text + # Process quote: append link to tweet_text + quote_div = status.find('div', class_='quote-link') + if quote_div is not None: + tweet_text += '\n twitter.com' + quote_div.get('href').strip('#m') - # TODO Process card : extract image or youtube link + # Process card : extract image if necessary + card_class = status.find('a', class_='card-container') + if card_class is not None: + photos.extend(process_card(card_class)) # TODO Process attachment: capture image or .mp4 url or download twitter video + attachments_class = status.find('a', class_='attachments') + if card_class is not None: + photos.extend(process_attachments(attachments_class)) # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url - photos = [] # The no_js version of twitter only shows one photo # Check if there are photos attached media = status.find('div', class_='media') From b4a596eff26ccb36140939e07e43893141f19e96 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 11:45:43 +0100 Subject: [PATCH 21/42] Downloaded pics attachments --- twoot.py | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/twoot.py b/twoot.py index 7b9c634..3bc7897 100755 --- a/twoot.py +++ b/twoot.py @@ -48,12 +48,12 @@ logging.info('*********** NEW RUN ***********') def process_media_body(tt_iter): - ''' + """ Receives an iterator over all the elements contained in the tweet-text container. Processes them to make them suitable for posting on Mastodon :param tt_iter: iterator over the HTML elements in the text of the tweet :return: cleaned up text of the tweet - ''' + """ tweet_text = '' # Iterate elements for tag in tt_iter: @@ -80,11 +80,11 @@ def process_media_body(tt_iter): def process_card(card_container): - ''' + """ Extract image from card in case mastodon does not do it :param card_container: soup of 'a' tag containing card markup :return: list with url of image - ''' + """ list = [] link = card_container.get('href') @@ -95,12 +95,32 @@ def process_card(card_container): return list + +def process_attachments(attachments_container): + """ + Extract images or video from attachments. Videos are downloaded on the file system. + :param card_container: soup of 'div' tag containing attachments markup + :return: list with url of images + """ + # Collect url of images + pics = [] + images = attachments_container.find_all('a', class_='still-image') + for image in images: + pics.append(image.get('href')) + + # TODO Download nitter video (converted animated GIF) + + # TODO Download twitter video + + return pics + + def contains_class(body_classes, some_class): - ''' + """ :param body_classes: list of classes to search :param some_class: class that we are interested in :return: True if found, false otherwise - ''' + """ found = False for body_class in body_classes: if body_class == some_class: @@ -280,20 +300,12 @@ def main(argv): # TODO Process attachment: capture image or .mp4 url or download twitter video attachments_class = status.find('a', class_='attachments') - if card_class is not None: + if attachments_class is not None: photos.extend(process_attachments(attachments_class)) # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url - - # Check if there are photos attached - media = status.find('div', class_='media') - if media: - # Extract photo url and add it to list - pic = str(media.img['src']).strip(':small') - photos.append(pic) - # If no media was specifically added in the tweet, try to get the first picture # with "twitter:image" meta tag in first linked page in tweet text if not photos: From efa84f85d3dbc1b996f66b06c32b0d75d9849713 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 13:26:26 +0100 Subject: [PATCH 22/42] Download nitter video --- twoot.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/twoot.py b/twoot.py index 3bc7897..21815a9 100755 --- a/twoot.py +++ b/twoot.py @@ -96,10 +96,13 @@ def process_card(card_container): return list -def process_attachments(attachments_container): +def process_attachments(attachments_container, twit_account, tweet_id, author_account): """ Extract images or video from attachments. Videos are downloaded on the file system. :param card_container: soup of 'div' tag containing attachments markup + :param twit_account: name of twitter account + :param tweet_id: id of tweet being processed + :param author_account: author of tweet with video attachment :return: list with url of images """ # Collect url of images @@ -108,10 +111,30 @@ def process_attachments(attachments_container): for image in images: pics.append(image.get('href')) - # TODO Download nitter video (converted animated GIF) + # Download nitter video (converted animated GIF) + gif_class = attachments_container.find('video', class_='gif') + if gif_class is not None: + gif_video_file = 'https://nitter.com' + gif_class.source.get('src') + + video_path = os.path.join('./output', twit_account, tweet_id, author_account, tweet_id) + os.makedirs(video_path, 0o777, exist_ok=True) + + # Open directory for writing file + vp = os.open(video_path, os.O_WRONLY) + os.fchdir(vp) + r = requests.get(gif_video_file, stream=True) + + # Download chunks and write them to file + with open('gif_video.mp4', 'wb') as f: + for chunk in r.iter_content(chunk_size=16*1024): + f.write(chunk) + + # Close directory + os.close(vp) # TODO Download twitter video + return pics @@ -301,7 +324,7 @@ def main(argv): # TODO Process attachment: capture image or .mp4 url or download twitter video attachments_class = status.find('a', class_='attachments') if attachments_class is not None: - photos.extend(process_attachments(attachments_class)) + photos.extend(process_attachments(attachments_class, twit_account, tweet_id, author_account)) # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url From 551c47d576488bfbda4dd3b14d826b7b7b558759 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 14:28:17 +0100 Subject: [PATCH 23/42] Implemented process attachment --- twoot.py | 52 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/twoot.py b/twoot.py index 21815a9..5ac682f 100755 --- a/twoot.py +++ b/twoot.py @@ -96,10 +96,11 @@ def process_card(card_container): return list -def process_attachments(attachments_container, twit_account, tweet_id, author_account): +def process_attachments(attachments_container, get_vids, twit_account, tweet_id, author_account): """ Extract images or video from attachments. Videos are downloaded on the file system. :param card_container: soup of 'div' tag containing attachments markup + :param get_vids: whether to download vids or not :param twit_account: name of twitter account :param tweet_id: id of tweet being processed :param author_account: author of tweet with video attachment @@ -116,24 +117,43 @@ def process_attachments(attachments_container, twit_account, tweet_id, author_ac if gif_class is not None: gif_video_file = 'https://nitter.com' + gif_class.source.get('src') - video_path = os.path.join('./output', twit_account, tweet_id, author_account, tweet_id) - os.makedirs(video_path, 0o777, exist_ok=True) + video_path = os.path.join('./output', twit_account, tweet_id, author_account, tweet_id) + os.makedirs(video_path, 0o777, exist_ok=True) - # Open directory for writing file - vp = os.open(video_path, os.O_WRONLY) - os.fchdir(vp) - r = requests.get(gif_video_file, stream=True) + # Open directory for writing file + vp = os.open(video_path, os.O_WRONLY) + os.fchdir(vp) + r = requests.get(gif_video_file, stream=True) - # Download chunks and write them to file - with open('gif_video.mp4', 'wb') as f: - for chunk in r.iter_content(chunk_size=16*1024): - f.write(chunk) + # Download chunks and write them to file + with open('gif_video.mp4', 'wb') as f: + for chunk in r.iter_content(chunk_size=16*1024): + f.write(chunk) - # Close directory - os.close(vp) - - # TODO Download twitter video + # Close directory + os.close(vp) + # Download twitter video + vid_class = attachments_container.find('div', class_='video-container') + if vid_class is not None: + video_file = 'https://twitter.com' + vid_class.video.get('data-url') + if get_vids: + # Download video from twitter and store in filesystem. Running as subprocess to avoid + # requirement to install ffmpeg and ffmpeg-python for those that do not want to post videos + try: + # Set output location to ./output/twit_account/status_id + dl_feedback = subprocess.run( + ["./twitterdl.py", tweet_uri, "-ooutput/" + twit_account + "/" + status_id, "-w 500"], + capture_output=True, + ) + if dl_feedback.returncode != 0: + logging.warning('Video in tweet ' + tweet_id + ' from ' + twit_account + ' failed to download') + tweet_text += '\n\n[Video embedded in original tweet]' + except OSError: + logging.fatal("Could not execute twitterdl.py (is it there? Is it set as executable?)") + sys.exit(-1) + else: + tweet_text += '\n\n[Video embedded in original tweet]' return pics @@ -324,7 +344,7 @@ def main(argv): # TODO Process attachment: capture image or .mp4 url or download twitter video attachments_class = status.find('a', class_='attachments') if attachments_class is not None: - photos.extend(process_attachments(attachments_class, twit_account, tweet_id, author_account)) + photos.extend(process_attachments(attachments_class, get_vids, twit_account, tweet_id, author_account)) # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url From f229976861d782743fccca6ae441500362b32b3a Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 14:39:13 +0100 Subject: [PATCH 24/42] Improved logging. "OMG, it's full of bugs!" --- twoot.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/twoot.py b/twoot.py index 5ac682f..b2fc4c2 100755 --- a/twoot.py +++ b/twoot.py @@ -43,7 +43,7 @@ USER_AGENTS = [ ] # Setup logging to file -logging.basicConfig(filename="twoot.log", level=logging.INFO) +logging.basicConfig(filename="twoot.log", level=logging.DEBUG) logging.info('*********** NEW RUN ***********') @@ -92,6 +92,7 @@ def process_card(card_container): if link.contains('dailymotion.com'): image_url = 'twitter.com' + card_container.div.div.img.get('src') list.append(image_url) + logging.debug('Extracted still image of dailymotion video from card') return list @@ -111,6 +112,7 @@ def process_attachments(attachments_container, get_vids, twit_account, tweet_id, images = attachments_container.find_all('a', class_='still-image') for image in images: pics.append(image.get('href')) + logging.debug('collected ' + str(len(pics)) + ' images from attachments') # Download nitter video (converted animated GIF) gif_class = attachments_container.find('video', class_='gif') @@ -130,6 +132,8 @@ def process_attachments(attachments_container, get_vids, twit_account, tweet_id, for chunk in r.iter_content(chunk_size=16*1024): f.write(chunk) + logging.debug('downloaded video of GIF animation from attachments') + # Close directory os.close(vp) @@ -149,6 +153,9 @@ def process_attachments(attachments_container, get_vids, twit_account, tweet_id, if dl_feedback.returncode != 0: logging.warning('Video in tweet ' + tweet_id + ' from ' + twit_account + ' failed to download') tweet_text += '\n\n[Video embedded in original tweet]' + else: + logging.debug('downloaded twitter video from attachments') + except OSError: logging.fatal("Could not execute twitterdl.py (is it there? Is it set as executable?)") sys.exit(-1) @@ -247,10 +254,11 @@ def main(argv): twit_account_page = session.get(url, headers=headers) # Verify that download worked - assert twit_account_page.status_code == 200,\ - 'The nitter page did not download correctly. Aborting' + if twit_account_page.status_code != 200: + logging.fatal('The Nitter page did not download correctly. Aborting') + exit(-1) - logging.info('Page downloaded successfully') + logging.info('Nitter page downloaded successfully') # DEBUG: Save page to file of = open(twit_account + '.html', 'w') @@ -341,7 +349,7 @@ def main(argv): if card_class is not None: photos.extend(process_card(card_class)) - # TODO Process attachment: capture image or .mp4 url or download twitter video + # Process attachment: capture image or .mp4 url or download twitter video attachments_class = status.find('a', class_='attachments') if attachments_class is not None: photos.extend(process_attachments(attachments_class, get_vids, twit_account, tweet_id, author_account)) From 3a88438ec2d8f286cc83b142d5e6beff1611c182 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 14:57:22 +0100 Subject: [PATCH 25/42] Some easy bugs squashed --- twoot.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/twoot.py b/twoot.py index b2fc4c2..ec5c53c 100755 --- a/twoot.py +++ b/twoot.py @@ -64,10 +64,10 @@ def process_media_body(tt_iter): # If it is an 'a' html tag elif tag.name == 'a': tag_text = tag.get_text() - if tag_text.starts_with('@'): + if tag_text.startswith('@'): # Only keep user name tweet_text += tag_text - elif tag_text.starts_with('#'): + elif tag_text.startswith('#'): # Only keep hashtag text tweet_text += tag_text else: @@ -89,7 +89,7 @@ def process_card(card_container): link = card_container.get('href') # Dailymotion - if link.contains('dailymotion.com'): + if link.find('dailymotion.com') >= 0: image_url = 'twitter.com' + card_container.div.div.img.get('src') list.append(image_url) logging.debug('Extracted still image of dailymotion video from card') @@ -112,7 +112,8 @@ def process_attachments(attachments_container, get_vids, twit_account, tweet_id, images = attachments_container.find_all('a', class_='still-image') for image in images: pics.append(image.get('href')) - logging.debug('collected ' + str(len(pics)) + ' images from attachments') + + logging.debug('collected ' + str(len(pics)) + ' images from attachments') # Download nitter video (converted animated GIF) gif_class = attachments_container.find('video', class_='gif') @@ -350,7 +351,7 @@ def main(argv): photos.extend(process_card(card_class)) # Process attachment: capture image or .mp4 url or download twitter video - attachments_class = status.find('a', class_='attachments') + attachments_class = status.find('div', class_='attachments') if attachments_class is not None: photos.extend(process_attachments(attachments_class, get_vids, twit_account, tweet_id, author_account)) From 822215fefeda36e88066cdd99a22afd3a7d413f9 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 17:06:09 +0100 Subject: [PATCH 26/42] download more images. Improved logging --- twoot.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/twoot.py b/twoot.py index ec5c53c..d2ff2dd 100755 --- a/twoot.py +++ b/twoot.py @@ -42,10 +42,6 @@ USER_AGENTS = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60', ] -# Setup logging to file -logging.basicConfig(filename="twoot.log", level=logging.DEBUG) -logging.info('*********** NEW RUN ***********') - def process_media_body(tt_iter): """ @@ -88,11 +84,10 @@ def process_card(card_container): list = [] link = card_container.get('href') - # Dailymotion - if link.find('dailymotion.com') >= 0: - image_url = 'twitter.com' + card_container.div.div.img.get('src') - list.append(image_url) - logging.debug('Extracted still image of dailymotion video from card') + # Do not extract image for youtube links + image_url = 'twitter.com' + card_container.div.div.img.get('src') + list.append(image_url) + logging.debug('Extracted still image of dailymotion video from card') return list @@ -217,7 +212,17 @@ def main(argv): max_age = float(args['a']) min_delay = float(args['d']) - logging.info('Updating ' + twit_account + ' on ' + mast_instance) + # Setup logging to file + os.remove(twit_account + '.log') + logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG) + logging.info('Running with the following parameters:') + logging.info(' -t ' + twit_account) + logging.info(' -i ' + mast_instance) + logging.info(' -m ' + mast_account) + logging.info(' -r ' + tweets_and_replies) + logging.info(' -v ' + get_vids) + logging.info(' -a ' + max_age) + logging.info(' -d ' + min_delay) # Try to open database. If it does not exist, create it sql = sqlite3.connect('twoot.db') @@ -431,7 +436,7 @@ def main(argv): ) except MastodonError as me: - print('failed to create app on ' + mast_instance) + logging.fatal('failed to create app on ' + mast_instance + '\n' + str(me)) sys.exit(1) # Log in to Mastodon instance @@ -448,7 +453,7 @@ def main(argv): ) except MastodonError as me: - logging.fatal('ERROR: Login to ' + mast_instance + ' Failed\n' + me) + logging.fatal('ERROR: Login to ' + mast_instance + ' Failed\n') sys.exit(1) # Upload tweets From 67bf87213d43258a3e12bf76a1bfceaa459c3b49 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 17:21:41 +0100 Subject: [PATCH 27/42] Correct url in image downloads --- twoot.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/twoot.py b/twoot.py index d2ff2dd..085e68a 100755 --- a/twoot.py +++ b/twoot.py @@ -85,9 +85,9 @@ def process_card(card_container): link = card_container.get('href') # Do not extract image for youtube links - image_url = 'twitter.com' + card_container.div.div.img.get('src') + image_url = 'https://nitter.net' + card_container.div.div.img.get('src') list.append(image_url) - logging.debug('Extracted still image of dailymotion video from card') + logging.debug('Extracted image from card') return list @@ -106,7 +106,7 @@ def process_attachments(attachments_container, get_vids, twit_account, tweet_id, pics = [] images = attachments_container.find_all('a', class_='still-image') for image in images: - pics.append(image.get('href')) + pics.append('https://nitter.net' + image.get('href')) logging.debug('collected ' + str(len(pics)) + ' images from attachments') @@ -212,17 +212,22 @@ def main(argv): max_age = float(args['a']) min_delay = float(args['d']) + # Remove previous log file + try: + os.remove(twit_account + '.log') + except FileNotFoundError: + pass + # Setup logging to file - os.remove(twit_account + '.log') logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG) logging.info('Running with the following parameters:') logging.info(' -t ' + twit_account) logging.info(' -i ' + mast_instance) logging.info(' -m ' + mast_account) - logging.info(' -r ' + tweets_and_replies) - logging.info(' -v ' + get_vids) - logging.info(' -a ' + max_age) - logging.info(' -d ' + min_delay) + logging.info(' -r ' + str(tweets_and_replies)) + logging.info(' -v ' + str(get_vids)) + logging.info(' -a ' + str(max_age)) + logging.info(' -d ' + str(min_delay)) # Try to open database. If it does not exist, create it sql = sqlite3.connect('twoot.db') From a0ce29f4c5ff1239244004d9df5ed370ad4c2868 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 17:35:50 +0100 Subject: [PATCH 28/42] Fine tuning --- twoot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 085e68a..cf0f485 100755 --- a/twoot.py +++ b/twoot.py @@ -338,7 +338,7 @@ def main(argv): # Add prefix if the tweet is a reply-to replying_to_class = status.find('div', class_='replying-to') if replying_to_class is not None: - tweet_text += 'Replying to ' + replying_to_class.a.get_text() + tweet_text += 'Replying to ' + replying_to_class.a.get_text() + '\n\n' # Check it the tweet is a retweet from somebody else if author_account.lower() != twit_account.lower(): @@ -456,6 +456,7 @@ def main(argv): password=mast_password, to_file=mast_account + ".secret" ) + logging.info('Logging in to ' + mast_instance) except MastodonError as me: logging.fatal('ERROR: Login to ' + mast_instance + ' Failed\n') From 62ba2f505ec0f067e0e44dbcdad9b1ffdbcecc55 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 17:55:12 +0100 Subject: [PATCH 29/42] Issues with video download --- twoot.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/twoot.py b/twoot.py index cf0f485..b0e343d 100755 --- a/twoot.py +++ b/twoot.py @@ -92,13 +92,13 @@ def process_card(card_container): return list -def process_attachments(attachments_container, get_vids, twit_account, tweet_id, author_account): +def process_attachments(attachments_container, get_vids, twit_account, status_id, author_account): """ Extract images or video from attachments. Videos are downloaded on the file system. :param card_container: soup of 'div' tag containing attachments markup :param get_vids: whether to download vids or not :param twit_account: name of twitter account - :param tweet_id: id of tweet being processed + :param status_id: id of tweet being processed :param author_account: author of tweet with video attachment :return: list with url of images """ @@ -115,12 +115,12 @@ def process_attachments(attachments_container, get_vids, twit_account, tweet_id, if gif_class is not None: gif_video_file = 'https://nitter.com' + gif_class.source.get('src') - video_path = os.path.join('./output', twit_account, tweet_id, author_account, tweet_id) - os.makedirs(video_path, 0o777, exist_ok=True) + video_path = os.path.join('output', twit_account, status_id, author_account, status_id) + os.makedirs(video_path, exist_ok=True) # Open directory for writing file - vp = os.open(video_path, os.O_WRONLY) - os.fchdir(vp) + vp = os.open(video_path, os.O_WRONLY) + os.chdir(vp) r = requests.get(gif_video_file, stream=True) # Download chunks and write them to file @@ -252,7 +252,7 @@ def main(argv): headers.update( { 'User-Agent': USER_AGENTS[random.randint(0, len(USER_AGENTS)-1)], - 'Cookie': 'replaceTwitter=; replaceYouTube=', + 'Cookie': 'replaceTwitter=; replaceYouTube=; hlsPlayback=on; proxyVideos=', } ) @@ -363,7 +363,7 @@ def main(argv): # Process attachment: capture image or .mp4 url or download twitter video attachments_class = status.find('div', class_='attachments') if attachments_class is not None: - photos.extend(process_attachments(attachments_class, get_vids, twit_account, tweet_id, author_account)) + photos.extend(process_attachments(attachments_class, get_vids, twit_account, status_id, author_account)) # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url From 986d902ccd364287310f6780246970cbe907c689 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 21:06:05 +0100 Subject: [PATCH 30/42] Fixed video download url --- twoot.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/twoot.py b/twoot.py index b0e343d..ac009c5 100755 --- a/twoot.py +++ b/twoot.py @@ -113,42 +113,43 @@ def process_attachments(attachments_container, get_vids, twit_account, status_id # Download nitter video (converted animated GIF) gif_class = attachments_container.find('video', class_='gif') if gif_class is not None: - gif_video_file = 'https://nitter.com' + gif_class.source.get('src') + gif_video_file = 'https://nitter.net' + gif_class.source.get('src') video_path = os.path.join('output', twit_account, status_id, author_account, status_id) os.makedirs(video_path, exist_ok=True) # Open directory for writing file - vp = os.open(video_path, os.O_WRONLY) - os.chdir(vp) - r = requests.get(gif_video_file, stream=True) - - # Download chunks and write them to file - with open('gif_video.mp4', 'wb') as f: - for chunk in r.iter_content(chunk_size=16*1024): - f.write(chunk) + orig_dir = os.getcwd() + os.chdir(video_path) + with requests.get(gif_video_file, stream=True) as r: + r.raise_for_status() + # Download chunks and write them to file + with open('gif_video.mp4', 'wb') as f: + for chunk in r.iter_content(chunk_size=16*1024): + f.write(chunk) logging.debug('downloaded video of GIF animation from attachments') # Close directory - os.close(vp) + os.chdir(orig_dir) # Download twitter video + vid_in_tweet = False vid_class = attachments_container.find('div', class_='video-container') if vid_class is not None: - video_file = 'https://twitter.com' + vid_class.video.get('data-url') + video_file = os.path.join('https://twitter.com', author_account, 'status', status_id) if get_vids: # Download video from twitter and store in filesystem. Running as subprocess to avoid # requirement to install ffmpeg and ffmpeg-python for those that do not want to post videos try: # Set output location to ./output/twit_account/status_id dl_feedback = subprocess.run( - ["./twitterdl.py", tweet_uri, "-ooutput/" + twit_account + "/" + status_id, "-w 500"], + ["./twitterdl.py", video_file, "-ooutput/" + twit_account + "/" + status_id, "-w 500"], capture_output=True, ) if dl_feedback.returncode != 0: - logging.warning('Video in tweet ' + tweet_id + ' from ' + twit_account + ' failed to download') - tweet_text += '\n\n[Video embedded in original tweet]' + logging.warning('Video in tweet ' + status_id + ' from ' + twit_account + ' failed to download') + vid_in_tweet = True else: logging.debug('downloaded twitter video from attachments') @@ -156,9 +157,9 @@ def process_attachments(attachments_container, get_vids, twit_account, status_id logging.fatal("Could not execute twitterdl.py (is it there? Is it set as executable?)") sys.exit(-1) else: - tweet_text += '\n\n[Video embedded in original tweet]' + vid_in_tweet = True - return pics + return pics, vid_in_tweet def contains_class(body_classes, some_class): @@ -363,7 +364,10 @@ def main(argv): # Process attachment: capture image or .mp4 url or download twitter video attachments_class = status.find('div', class_='attachments') if attachments_class is not None: - photos.extend(process_attachments(attachments_class, get_vids, twit_account, status_id, author_account)) + pics, vid_in_tweet = process_attachments(attachments_class, get_vids, twit_account, status_id, author_account) + photos.extend(pics) + if vid_in_tweet: + tweet_text += '\n\n[Video embedded in original tweet]' # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : ' + full_status_url From 33342cdfb727f25d105784ddc6f0e3f2904219d5 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 21:32:26 +0100 Subject: [PATCH 31/42] Cards can have no pic --- twoot.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/twoot.py b/twoot.py index ac009c5..8b0ccfa 100755 --- a/twoot.py +++ b/twoot.py @@ -82,12 +82,12 @@ def process_card(card_container): :return: list with url of image """ list = [] - link = card_container.get('href') - # Do not extract image for youtube links - image_url = 'https://nitter.net' + card_container.div.div.img.get('src') - list.append(image_url) - logging.debug('Extracted image from card') + img = card_container.div.div.img + if img is not None: + image_url = 'https://nitter.net' + img.get('src') + list.append(image_url) + logging.debug('Extracted image from card') return list From 1525955c5266223f2f8c30581d31980de4ac6a22 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 22:09:34 +0100 Subject: [PATCH 32/42] Added info log messages --- twoot.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/twoot.py b/twoot.py index 8b0ccfa..2476fdd 100755 --- a/twoot.py +++ b/twoot.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ - Copyright (C) 2019 Jean-Christophe Francois + Copyright (C) 2020 Jean-Christophe Francois This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -220,7 +220,7 @@ def main(argv): pass # Setup logging to file - logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG) + logging.basicConfig(filename=twit_account + '.log', level=logging.INFO) logging.info('Running with the following parameters:') logging.info(' -t ' + twit_account) logging.info(' -i ' + mast_instance) @@ -273,9 +273,9 @@ def main(argv): logging.info('Nitter page downloaded successfully') # DEBUG: Save page to file - of = open(twit_account + '.html', 'w') - of.write(twit_account_page.text) - of.close() + #of = open(twit_account + '.html', 'w') + #of.write(twit_account_page.text) + #of.close() # Make soup soup = BeautifulSoup(twit_account_page.text, 'html.parser') @@ -295,6 +295,8 @@ def main(argv): # Process each tweets and generate dictionary # with data ready to be posted on Mastodon # ********************************************************** + out_date_cnt = 0 + in_db_cnt = 0 for status in timeline: # Extract tweet ID and status ID tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m') @@ -308,6 +310,7 @@ def main(argv): # Check if time is within acceptable range if not is_time_valid(timestamp, max_age, min_delay): + out_date_cnt += 1 logging.debug("Tweet outside valid time range, skipping") continue @@ -317,6 +320,7 @@ def main(argv): tweet_in_db = db.fetchone() if tweet_in_db is not None: + in_db_cnt += 1 logging.debug("Tweet %s already in database", tweet_id) # Skip to next tweet continue @@ -425,6 +429,8 @@ def main(argv): logging.debug('Tweet %s added to list of toots to upload', tweet_id) # TODO Log summary stats: how many not in db, how many in valid timeframe + logging.info(str(out_date_cnt) + ' tweets outside of valid time range') + logging.info(str(in_db_cnt) + ' tweets already in database') # DEBUG: Print extracted tweets #for t in tweets: @@ -467,6 +473,7 @@ def main(argv): sys.exit(1) # Upload tweets + posted_cnt = 0 for tweet in reversed(tweets): logging.debug('Uploading Tweet %s', tweet["tweet_id"]) @@ -515,9 +522,10 @@ def main(argv): except MastodonError as me: logging.error('posting ' + tweet['tweet_text'] + ' to ' + mast_instance + ' Failed') logging.error(me) - sys.exit(1) - logging.debug('Tweet %s posted on %s', tweet_id, mast_account) + else: + posted_cnt += 1 + logging.debug('Tweet %s posted on %s', tweet['tweet_id'], mast_account) # Insert toot id into database if 'id' in toot: @@ -525,6 +533,8 @@ def main(argv): (twit_account, mast_instance, mast_account, tweet['tweet_id'], toot['id'])) sql.commit() + logging.info(str(posted_cnt) + ' Tweets posted to Mastodon') + # Cleanup downloaded video files try: shutil.rmtree('./output/' + twit_account) From 60f7054fac98904c7c9b1f8fba9d51898b2461a6 Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 22:16:27 +0100 Subject: [PATCH 33/42] Separate logging for exceptions --- twoot.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 2476fdd..1ef7edd 100755 --- a/twoot.py +++ b/twoot.py @@ -451,7 +451,8 @@ def main(argv): ) except MastodonError as me: - logging.fatal('failed to create app on ' + mast_instance + '\n' + str(me)) + logging.fatal('failed to create app on ' + mast_instance) + logging.fatal(me) sys.exit(1) # Log in to Mastodon instance @@ -470,6 +471,7 @@ def main(argv): except MastodonError as me: logging.fatal('ERROR: Login to ' + mast_instance + ' Failed\n') + logging.fatal(me) sys.exit(1) # Upload tweets From 066f737a617a41bbee3a0ecacdd8431f3614e18a Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 22:41:57 +0100 Subject: [PATCH 34/42] quote is an 'a' tag --- twoot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/twoot.py b/twoot.py index 1ef7edd..d4c441f 100755 --- a/twoot.py +++ b/twoot.py @@ -356,9 +356,9 @@ def main(argv): tweet_text += process_media_body(tt_iter) # Process quote: append link to tweet_text - quote_div = status.find('div', class_='quote-link') + quote_div = status.find('a', class_='quote-link') if quote_div is not None: - tweet_text += '\n twitter.com' + quote_div.get('href').strip('#m') + tweet_text += '\n\nhttps://twitter.com' + quote_div.get('href').strip('#m') # Process card : extract image if necessary card_class = status.find('a', class_='card-container') From bb52e54c0da683d70a78722454c40acbe2c5ad3f Mon Sep 17 00:00:00 2001 From: jeancf Date: Fri, 18 Dec 2020 22:43:50 +0100 Subject: [PATCH 35/42] Logging set to debug --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index d4c441f..ad587fd 100755 --- a/twoot.py +++ b/twoot.py @@ -220,7 +220,7 @@ def main(argv): pass # Setup logging to file - logging.basicConfig(filename=twit_account + '.log', level=logging.INFO) + logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG) logging.info('Running with the following parameters:') logging.info(' -t ' + twit_account) logging.info(' -i ' + mast_instance) From 43d63b1e5a18282a4a42b676206f164424d76aa5 Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Dec 2020 09:21:39 +0100 Subject: [PATCH 36/42] Added logging run time --- twoot.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index ad587fd..c43c9f3 100755 --- a/twoot.py +++ b/twoot.py @@ -26,7 +26,8 @@ import random import requests from bs4 import BeautifulSoup, element import sqlite3 -import datetime, time +import datetime +import time import re from pathlib import Path from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError @@ -189,6 +190,8 @@ def is_time_valid(timestamp, max_age, min_delay): def main(argv): + # Start stopwatch + start_time = time.time() # Build parser for command line arguments parser = argparse.ArgumentParser(description='toot tweets.') @@ -536,6 +539,7 @@ def main(argv): sql.commit() logging.info(str(posted_cnt) + ' Tweets posted to Mastodon') + logging.info('Run time : ' + str(time.time() - start_time) + ' seconds') # Cleanup downloaded video files try: From dc6c16ae16d62f450dfead58406f3c6160f9eab8 Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Dec 2020 10:09:03 +0100 Subject: [PATCH 37/42] Keep logs for now --- twoot.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/twoot.py b/twoot.py index c43c9f3..dbc32f2 100755 --- a/twoot.py +++ b/twoot.py @@ -217,10 +217,10 @@ def main(argv): min_delay = float(args['d']) # Remove previous log file - try: - os.remove(twit_account + '.log') - except FileNotFoundError: - pass + #try: + # os.remove(twit_account + '.log') + #except FileNotFoundError: + # pass # Setup logging to file logging.basicConfig(filename=twit_account + '.log', level=logging.DEBUG) @@ -539,7 +539,6 @@ def main(argv): sql.commit() logging.info(str(posted_cnt) + ' Tweets posted to Mastodon') - logging.info('Run time : ' + str(time.time() - start_time) + ' seconds') # Cleanup downloaded video files try: @@ -547,6 +546,8 @@ def main(argv): except FileNotFoundError: # The directory does not exist pass + logging.info('Run time : ' + str(time.time() - start_time) + ' seconds') + logging.info('_____________________________________________________________') if __name__ == "__main__": main(sys.argv) From 3c7693fe66bbf9c3b3e1b849bc54e4a7888bd9de Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Dec 2020 10:30:19 +0100 Subject: [PATCH 38/42] Updated README Improved decimal format in log --- README.md | 14 +++++++++----- twoot.py | 3 ++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 19f21da..674c28f 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,10 @@ Twoot is a python script that extracts tweets from a twitter feed and reposts them as toots on a Mastodon account. -I started twoot when [tootbot](https://github.com/cquest/tootbot) -stopped working. Tootbot relies on rss feeds from https://twitrss.me -that broke when Twitter refreshed their web UI in July 2019. - -Instead twoot is self contained and handles all the processing. +**UPDATE 19 DEC 2020** VERSION 2.0 Twitter's *no-javascript* version +has been retired. Twoot has been rewritten to get content from +[nitter.net](https://nitter.net) which is a javascript-free mirror of +twitter. As a bonus (or a curse?) twoot now also supports animated GIFs. **UPDATE 05 APR 2020** VERSION 1.0. Twoot can now optionally download videos from Twitter and upload them on Mastodon. @@ -91,3 +90,8 @@ ago: ``` 1-59/15 * * * * /path/to/twoot.py -t SuperDuperBot -i botsin.space -m superduperbot -p my_Sup3r-S4f3*pw -a 5 -d 15 ``` + +# Background +I started twoot when [tootbot](https://github.com/cquest/tootbot) +stopped working. Tootbot relies on rss feeds from https://twitrss.me +that broke when Twitter refreshed their web UI in July 2019. \ No newline at end of file diff --git a/twoot.py b/twoot.py index dbc32f2..bf9157d 100755 --- a/twoot.py +++ b/twoot.py @@ -546,8 +546,9 @@ def main(argv): except FileNotFoundError: # The directory does not exist pass - logging.info('Run time : ' + str(time.time() - start_time) + ' seconds') + logging.info('Run time : %2.1f seconds', str(time.time() - start_time)) logging.info('_____________________________________________________________') + if __name__ == "__main__": main(sys.argv) From 5df11dbe4b6b47006e4ad3b7dcf4579cef98687a Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Dec 2020 10:36:59 +0100 Subject: [PATCH 39/42] Fixed last logging syntax --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index bf9157d..305ac23 100755 --- a/twoot.py +++ b/twoot.py @@ -546,7 +546,7 @@ def main(argv): except FileNotFoundError: # The directory does not exist pass - logging.info('Run time : %2.1f seconds', str(time.time() - start_time)) + logging.info('Run time : %2.1f seconds' % (time.time() - start_time)) logging.info('_____________________________________________________________') From 40185ef8173ba0a30bee4b070d106a1b61ee5edd Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Dec 2020 10:48:46 +0100 Subject: [PATCH 40/42] Improved last logging syntax --- twoot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 305ac23..70de32c 100755 --- a/twoot.py +++ b/twoot.py @@ -547,7 +547,7 @@ def main(argv): pass logging.info('Run time : %2.1f seconds' % (time.time() - start_time)) - logging.info('_____________________________________________________________') + logging.info('_____________________________________________________________________________________') if __name__ == "__main__": From 1d40071b27abdc4f7bff343cbb0bf9b0d7c45a78 Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Dec 2020 10:53:11 +0100 Subject: [PATCH 41/42] Added log of twitter:image download --- twoot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/twoot.py b/twoot.py index 70de32c..5829043 100755 --- a/twoot.py +++ b/twoot.py @@ -401,6 +401,8 @@ def main(argv): requests.exceptions.TooManyRedirects, requests.exceptions.MissingSchema): pass + else: + logging.debug("downloaded twitter:image from linked page") # Check if video was downloaded video_file = None From a9109884a4d3fb6b2c9b6073bb73cdbb44732dbb Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 19 Dec 2020 10:59:23 +0100 Subject: [PATCH 42/42] More debug messages --- twoot.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/twoot.py b/twoot.py index 5829043..5a6d215 100755 --- a/twoot.py +++ b/twoot.py @@ -489,7 +489,7 @@ def main(argv): # Upload video if there is one if tweet['video'] is not None: try: - logging.debug("Uploading video") + logging.debug("Uploading video to Mastodon") media_posted = mastodon.media_post(tweet['video']) media_ids.append(media_posted['id']) except (MastodonAPIError, MastodonIllegalArgumentError, TypeError): # Media cannot be uploaded (invalid format, dead link, etc.) @@ -502,6 +502,7 @@ def main(argv): media = False # Download picture try: + logging.debug('downloading picture') media = requests.get(photo) except: # Picture cannot be downloaded for any reason pass @@ -509,6 +510,7 @@ def main(argv): # Upload picture to Mastodon instance if media: try: + logging.debug('uploading picture to Mastodon') media_posted = mastodon.media_post(media.content, mime_type=media.headers['content-type']) media_ids.append(media_posted['id']) except (MastodonAPIError, MastodonIllegalArgumentError, TypeError): # Media cannot be uploaded (invalid format, dead link, etc.)