From 0b15b93d3747e2dd9eadeda7a3ed5ddc35668915 Mon Sep 17 00:00:00 2001
From: jeancf
Date: Fri, 14 Feb 2020 18:01:12 +0100
Subject: [PATCH] Loading full_status_page (headers not working)

---
 twoot.py | 62 +++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/twoot.py b/twoot.py
index cb7fe4e..31fcdc5 100755
--- a/twoot.py
+++ b/twoot.py
@@ -42,8 +42,8 @@ USER_AGENTS = [
 
 def cleanup_tweet_text(tt_iter):
     '''
-    Receives an iterator over all the elements contained in the tweet-text container
-    and processes them to remove Twitter-specific stuff and make them suitable for
+    Receives an iterator over all the elements contained in the tweet-text container.
+    Processes them to remove Twitter-specific stuff and make them suitable for
     posting on Mastodon
     '''
     tweet_text = ''
@@ -73,10 +73,6 @@ def cleanup_tweet_text(tt_iter):
                     tweet_text += ' '
                 # Add full url
                 tweet_text += tag['data-expanded-url']
-            # If element is a picture
-            elif tag.has_attr('data-url'):
-                # TODO handle photo
-                pass
 
         # If element is hashflag (hashtag + icon), handle as simple hashtag
         elif tag.name == 'span' and tag['class'][0] == 'twitter-hashflag-container':
@@ -100,6 +96,7 @@ def cleanup_tweet_text(tt_iter):
 
         # elif tag is a geographical point of interest
         elif tag.name == 'span' and tag['class'][0] == 'tweet-poi-geo-text':
+            # Not sure what to do
             pass
 
         else:
@@ -148,15 +145,15 @@ def main(argv):
     url = 'https://mobile.twitter.com/' + twit_account
 
     # Download twitter page of user. We should get a 'no javascript' landing page and some cookies
-    r1 = requests.get(url, headers=headers)
+    no_js_page = requests.get(url, headers=headers)
 
     # DEBUG: Save page to file
     of = open('no_js_page.html', 'w')
-    of.write(r1.text)
+    of.write(no_js_page.text)
     of.close()
 
     # Verify that this is the no_js page that we expected
-    soup = BeautifulSoup(r1.text, 'html.parser')
+    soup = BeautifulSoup(no_js_page.text, 'html.parser')
     assert 'JavaScript is disabled' in str(soup.form.p.string),\
         'this is not the no_js page we expected. Quitting'
 
@@ -168,31 +165,45 @@ def main(argv):
         }
     )
 
-    response = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=r1.cookies)
+    twit_account_page = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=no_js_page.cookies)
 
     # DEBUG: Save page to file
-    of = open('twitter.html', 'w')
-    of.write(response.text)
+    of = open(twit_account + '.html', 'w')
+    of.write(twit_account_page.text)
     of.close()
 
     # Verify that download worked
-    assert response.status_code == 200,\
+    assert twit_account_page.status_code == 200,\
         'The twitter page did not download correctly. Aborting'
 
     # Verify that we now have the correct twitter page
-    soup = BeautifulSoup(response.text, 'html.parser')
+    soup = BeautifulSoup(twit_account_page.text, 'html.parser')
     assert twit_account.lower() in str(soup.head.title.string).lower(),\
         'This is not the correct twitter page. Quitting'
 
     # Extract twitter timeline
-    results = soup.find_all('table', class_='tweet')
+    timeline = soup.find_all('table', class_='tweet')
+
+    for status in timeline:
+        # Extract url of full status page
+        full_status_url = 'https://mobile.twitter.com' + status['href']
+
+        # fetch full status page
+        full_status_page = requests.get(full_status_url, cookies=twit_account_page.cookies)
+        # For some funny reason the command above only works if I don't provide headers
+        # If I do, I get the no_js page...
+
+        # DEBUG: Save page to file
+        of = open('full_status_page.html', 'w')
+        of.write(full_status_page.text)
+        of.close()
+        sys.exit(1)
 
-    for result in results:
         # Extract tweet id
-        tweet_id = str(result['href']).strip('?p=v')
+        tweet_id = str(status['href']).strip('?p=v')
 
         # Isolate tweet header
-        sih = result.find('tr', class_='tweet-header')
+        sih = status.find('tr', class_='tweet-header')
 
         # extract author
         author = sih.find('strong', class_='fullname').get_text()
@@ -208,7 +219,7 @@ def main(argv):
         author_account = str(sih.find('div', class_='username').span.next_sibling).strip('\n ')
 
         # Isolate tweet text container
-        ttc = result.find('tr', class_='tweet-container')
+        ttc = status.find('tr', class_='tweet-container')
 
         # extract iterator over tweet text contents
         tt_iter = ttc.find('div', class_='dir-ltr').children
@@ -223,7 +234,7 @@ def main(argv):
         tweet_text += '\n\nOriginal tweet : https://twitter.com/' + tweet_id
 
         # Isolate attached media container
-        amoc = result.find('div', class_='AdaptiveMediaOuterContainer')
+        amoc = status.find('div', class_='AdaptiveMediaOuterContainer')
 
         photos = []
         if amoc:
@@ -231,11 +242,12 @@ def main(argv):
             photo_conts = amoc.find_all('div', class_='AdaptiveMedia-photoContainer')
             for p in photo_conts:
                 photos.append(p['data-image-url'])
-
-            # Mention presence of videos in tweet
-            videos = amoc.find_all('div', class_='AdaptiveMedia-videoContainer')
-            if len(videos) != 0:
-                tweet_text += '\n\n[Video embedded in original tweet]'
+            # Extract tweet id
+            tweet_id = ttc.find('div', class_='tweet-text')['data-id']
+            # Mention presence of videos in tweet
+            videos = amoc.find_all('div', class_='AdaptiveMedia-videoContainer')
+            if len(videos) != 0:
+                tweet_text += '\n\n[Video embedded in original tweet]'
 
         # If no media was specifically added in the tweet, try to get the first picture
         # with "twitter:image" meta tag in first linked page in tweet text
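
Note on the "headers not working" issue flagged in the subject line and in the inline comments:
the sketch below is a minimal, standalone version of the fetch sequence this patch relies on,
useful for reproducing the problem outside of twoot.py. It follows the same flow as the diff
(GET the no-JS landing page to collect cookies, POST to the nojs_router to reach the account
timeline, then GET an individual status URL with the accumulated cookies). The helper name
fetch_full_status and its parameters are hypothetical and only for illustration; the requests
and BeautifulSoup calls are the ones already used in the patch.

    import requests
    from bs4 import BeautifulSoup

    def fetch_full_status(twit_account, status_path, headers=None):
        # Step 1: GET the mobile profile page. Twitter answers with the
        # 'JavaScript is disabled' landing page and sets cookies needed later.
        no_js_page = requests.get('https://mobile.twitter.com/' + twit_account,
                                  headers=headers)

        # Step 2: POST to the nojs_router with those cookies to obtain the
        # account timeline page.
        twit_account_page = requests.post(
            'https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account,
            headers=headers,
            cookies=no_js_page.cookies)

        # Step 3: GET the full status page, reusing the timeline cookies.
        # Per the comment in the diff, sending the custom headers here brought
        # back the no_js page, so this sketch omits them on the final request.
        full_status_page = requests.get(
            'https://mobile.twitter.com' + status_path,
            cookies=twit_account_page.cookies)

        return BeautifulSoup(full_status_page.text, 'html.parser')

Here status_path would be the href taken from a timeline entry, i.e. the same status['href'] value
the patch uses to build full_status_url.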