Rewrite complete

This commit is contained in:
jeancf 2020-02-15 15:39:01 +01:00
parent 0b15b93d37
commit a5fde58615

105
twoot.py
View File

@ -25,7 +25,7 @@ import random
import requests import requests
from bs4 import BeautifulSoup, element from bs4 import BeautifulSoup, element
import sqlite3 import sqlite3
import time import datetime, time
import re import re
from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError
@ -73,6 +73,10 @@ def cleanup_tweet_text(tt_iter):
tweet_text += ' ' tweet_text += ' '
# Add full url # Add full url
tweet_text += tag['data-expanded-url'] tweet_text += tag['data-expanded-url']
if tag.has_attr('data-expanded-path'):
data_expanded_path = tag['data-expanded-path']
if 'video' in data_expanded_path:
tweet_text += '\n\n[Video embedded in original tweet]'
# If element is hashflag (hashtag + icon), handle as simple hashtag # If element is hashflag (hashtag + icon), handle as simple hashtag
elif tag.name == 'span' and tag['class'][0] == 'twitter-hashflag-container': elif tag.name == 'span' and tag['class'][0] == 'twitter-hashflag-container':
@ -105,6 +109,19 @@ def cleanup_tweet_text(tt_iter):
return tweet_text return tweet_text
def contains_class(body_classes, some_class):
    '''
    Check whether a CSS class is present in a list of classes.

    :param body_classes: list of classes to search
    :param some_class: class that we are interested in
    :return: True if found, false otherwise
    '''
    # Membership test via `in` short-circuits on first match, unlike the
    # previous manual loop which always scanned the entire list.
    return some_class in body_classes
def main(argv): def main(argv):
# Build parser for command line arguments # Build parser for command line arguments
@ -147,6 +164,10 @@ def main(argv):
# Download twitter page of user. We should get a 'no javascript' landing page and some cookies # Download twitter page of user. We should get a 'no javascript' landing page and some cookies
no_js_page = requests.get(url, headers=headers) no_js_page = requests.get(url, headers=headers)
# Verify that download worked
assert no_js_page.status_code == 200,\
'The twitter page did not download correctly. Aborting'
# DEBUG: Save page to file # DEBUG: Save page to file
of = open('no_js_page.html', 'w') of = open('no_js_page.html', 'w')
of.write(no_js_page.text) of.write(no_js_page.text)
@ -167,62 +188,72 @@ def main(argv):
twit_account_page = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=no_js_page.cookies) twit_account_page = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=no_js_page.cookies)
# Verify that download worked
assert twit_account_page.status_code == 200,\
'The twitter page did not download correctly. Aborting'
# DEBUG: Save page to file # DEBUG: Save page to file
of = open(twit_account + '.html', 'w') of = open(twit_account + '.html', 'w')
of.write(twit_account_page.text) of.write(twit_account_page.text)
of.close() of.close()
# Verify that download worked # Make soup
assert twit_account_page.status_code == 200,\ soup = BeautifulSoup(twit_account_page.text, 'html.parser')
'The twitter page did not download correctly. Aborting'
# Verify that we now have the correct twitter page # Verify that we now have the correct twitter page
soup = BeautifulSoup(twit_account_page.text, 'html.parser') body_classes = soup.body.get_attribute_list('class')
assert twit_account.lower() in str(soup.head.title.string).lower(),\ assert contains_class(body_classes, 'users-show-page'), \
'This is not the correct twitter page. Quitting' 'This is not the correct twitter page. Quitting'
# Extract twitter timeline # Extract twitter timeline
timeline = soup.find_all('table', class_='tweet') timeline = soup.find_all('table', class_='tweet')
for status in timeline: for status in timeline:
# Extract tweet id
tweet_id = str(status['href']).strip('?p=v')
# Extract url of full status page # Extract url of full status page
full_status_url = 'https://mobile.twitter.com' + status['href'] full_status_url = 'https://mobile.twitter.com' + tweet_id
# fetch full status page # fetch full status page
full_status_page = requests.get(full_status_url, cookies=twit_account_page.cookies) full_status_page = requests.get(full_status_url, cookies=twit_account_page.cookies)
# For some funny reason the command above only works if I don't provide headers # FIXME: For some funny reason the command above only works if I don't provide headers. If I do, I get the no_js page...
# If I do, I get the no_js page...
# Verify that download worked
assert twit_account_page.status_code == 200, \
'The twitter page did not download correctly. Aborting'
# DEBUG: Save page to file # DEBUG: Save page to file
of = open('full_status_page.html', 'w') of = open('full_status_page.html', 'w')
of.write(full_status_page.text) of.write(full_status_page.text)
of.close() of.close()
sys.exit(1)
# Extract tweet id # Make soup
tweet_id = str(status['href']).strip('?p=v') soup = BeautifulSoup(full_status_page.text, 'html.parser')
# Isolate tweet header # Verify that we now have the correct twitter page
sih = status.find('tr', class_='tweet-header') body_classes = soup.body.get_attribute_list('class')
assert contains_class(body_classes, 'tweets-show-page'), \
'This is not the correct twitter page. Quitting'
# Isolate table main-tweet
tmt = soup.find('table', class_='main-tweet')
# Extract avatar
author_logo_url = tmt.find('td', class_='avatar').a.img['src']
# extract author # extract author
author = sih.find('strong', class_='fullname').get_text() author = tmt.find('div', class_='fullname').a.strong.get_text()
# Extract author's logo
author_logo_url = sih.find('img', alt=author)['src']
# TODO: Extract time stamp by following link under td.timestamp
import datetime
timestamp = datetime.datetime.now().timestamp()
# Extract user name # Extract user name
author_account = str(sih.find('div', class_='username').span.next_sibling).strip('\n ') author_account = str(tmt.find('span', class_='username').span.next_sibling).strip('\n ')
# Isolate tweet text container # TODO: Extract time stamp
ttc = status.find('tr', class_='tweet-container') time_string = tmt.find('div', class_='metadata').a.get_text()
timestamp = datetime.datetime.strptime(time_string, '%I:%M %p - %d %b %Y').timestamp()
# extract iterator over tweet text contents # extract iterator over tweet text contents
tt_iter = ttc.find('div', class_='dir-ltr').children tt_iter = tmt.find('div', class_='tweet-text').div.children
tweet_text = cleanup_tweet_text(tt_iter) tweet_text = cleanup_tweet_text(tt_iter)
@ -233,21 +264,15 @@ def main(argv):
# Add footer with link to original tweet # Add footer with link to original tweet
tweet_text += '\n\nOriginal tweet : https://twitter.com/' + tweet_id tweet_text += '\n\nOriginal tweet : https://twitter.com/' + tweet_id
# Isolate attached media container # DEBUG: STOP HERE
amoc = status.find('div', class_='AdaptiveMediaOuterContainer') #sys.exit(1)
photos = [] photos = [] # The no_js version of twitter only shows one photo
if amoc:
# Extract photos media = tmt.find('div', class_='media')
photo_conts = amoc.find_all('div', class_='AdaptiveMedia-photoContainer') if media:
for p in photo_conts: pic = str(media.img['src']).strip(':small')
photos.append(p['data-image-url']) photos.append(pic)
# Extract tweet id
tweet_id = ttc.find('div', class_='tweet-text')['data-id']
# Mention presence of videos in tweet
videos = amoc.find_all('div', class_='AdaptiveMedia-videoContainer')
if len(videos) != 0:
tweet_text += '\n\n[Video embedded in original tweet]'
# If no media was specifically added in the tweet, try to get the first picture # If no media was specifically added in the tweet, try to get the first picture
# with "twitter:image" meta tag in first linked page in tweet text # with "twitter:image" meta tag in first linked page in tweet text