From 6fb6a38732b6c3b13368f0d75a8123e7f35f40b2 Mon Sep 17 00:00:00 2001
From: jeancf
Date: Fri, 6 Mar 2020 17:40:13 +0100
Subject: [PATCH] Used session to manage cookies automatically

---
 twoot.py | 126 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 73 insertions(+), 53 deletions(-)

diff --git a/twoot.py b/twoot.py
index e7cd068..3dc9b00 100755
--- a/twoot.py
+++ b/twoot.py
@@ -32,14 +32,54 @@ from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalA
 
 # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
 USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/69.0',
+    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/73.0',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13 Safari/605.1.15',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 Edg/44.18362.329.0',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edge/44.18363.8131',
 ]
 
 #TODO log to file
 
+def handle_no_js(session, page, headers):
+    """
+    Check if the page is a "No JavaScript" page instead of the content that we wanted.
+    If it is, submit the form on the page as a POST request to get the correct page and return it.
+    :param session: current requests session
+    :param page: Response object to check
+    :param headers: HTTP headers used in initial request
+    :return: correct page (Response object)
+    """
+    # DEBUG: Save page to file
+    of = open('no_js_page.html', 'w')
+    of.write(page.text)
+    of.close()
+
+    # Set default return value
+    new_page = page
+
+    # Make soup
+    soup = BeautifulSoup(page.text, 'html.parser')
+
+    if soup.form is not None and soup.form.p is not None:
+        if 'JavaScript is disabled' in str(soup.form.p.string):
+            # Submit POST form response with cookies
+            headers.update(
+                {
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                    'Referer': page.request.url,
+                }
+            )
+
+            action = soup.form.get('action')
+
+            # Submit the form
+            new_page = session.post(action, headers=headers, cookies=page.cookies)
+
+            # Verify that download worked
+            assert (new_page.status_code == 200), 'The twitter page did not download correctly. Aborting'
+
+    return new_page
+
 def cleanup_tweet_text(tt_iter):
     '''
     Receives an iterator over all the elements contained in the tweet-text container.
@@ -150,6 +190,9 @@ def main(argv):
     # To store content of all tweets from this user
     tweets = []
 
+    # Initiate session
+    session = requests.Session()
+
     # Get a copy of the default headers that requests would use
     headers = requests.utils.default_headers()
 
@@ -162,40 +205,19 @@ def main(argv):
     url = 'https://mobile.twitter.com/' + twit_account
 
     # Download twitter page of user. We should get a 'no javascript' landing page and some cookies
-    no_js_page = requests.get(url, headers=headers)
-
-    # Verify that download worked
-    assert no_js_page.status_code == 200,\
-        'The twitter page did not download correctly. Aborting'
-
-    # DEBUG: Save page to file
-    #of = open('no_js_page.html', 'w')
-    #of.write(no_js_page.text)
-    #of.close()
-
-    # Verify that this is the no_js page that we expected
-    soup = BeautifulSoup(no_js_page.text, 'html.parser')
-    assert 'JavaScript is disabled' in str(soup.form.p.string),\
-        'this is not the no_js page we expected. Quitting'
-
-    # Submit POST form response with cookies
-    headers.update(
-        {
-            'Content-Type': 'application/x-www-form-urlencoded',
-            'Referer': url,
-        }
-    )
-
-    twit_account_page = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=no_js_page.cookies)
+    twit_account_page = session.get(url, headers=headers)
 
     # Verify that download worked
     assert twit_account_page.status_code == 200,\
         'The twitter page did not download correctly. Aborting'
 
+    # If we got a "No JavaScript" page, download the correct page
+    twit_account_page = handle_no_js(session, twit_account_page, headers)
+
     # DEBUG: Save page to file
-    #of = open(twit_account + '.html', 'w')
-    #of.write(twit_account_page.text)
-    #of.close()
+    of = open(twit_account + '.html', 'w')
+    of.write(twit_account_page.text)
+    of.close()
 
     # Make soup
     soup = BeautifulSoup(twit_account_page.text, 'html.parser')
@@ -213,20 +235,22 @@ def main(argv):
         tweet_id = str(status['href']).strip('?p=v')
 
         # Extract url of full status page
-        full_status_url = 'https://mobile.twitter.com' + tweet_id
+        full_status_url = 'https://mobile.twitter.com' + tweet_id + '?p=v'
 
         # fetch full status page
-        full_status_page = requests.get(full_status_url, cookies=twit_account_page.cookies)
-        # FIXME: For some funny reason the command above only works if I don't provide headers. If I do, I get the no_js page...
+        full_status_page = session.get(full_status_url, headers=headers)
 
         # Verify that download worked
-        assert twit_account_page.status_code == 200, \
+        assert full_status_page.status_code == 200, \
             'The twitter page did not download correctly. Aborting'
 
+        # If we got a "No JavaScript" page, download the correct page
+        full_status_page = handle_no_js(session, full_status_page, headers)
+
         # DEBUG: Save page to file
-        # of = open('full_status_page.html', 'w')
-        # of.write(full_status_page.text)
-        # of.close()
+        of = open('full_status_page.html', 'w')
+        of.write(full_status_page.text)
+        of.close()
 
         # Make soup
         soup = BeautifulSoup(full_status_page.text, 'html.parser')
@@ -236,16 +260,16 @@ def main(argv):
         assert contains_class(body_classes, 'tweets-show-page'), \
             'This is not the correct twitter page. Quitting'
 
-        # Check if tweet contains pic censored as "offensive media"
+        # Check if tweet contains a pic censored as "Sensitive material"
        if soup.find('div', class_='accept-data') is not None:
-            print('Censored pic found')
-
-            # TODO if it does, submit form to obtain uncensored tweet
+            # If it does, submit form to obtain uncensored tweet
             # Submit POST form response with cookies
             headers.update(
                 {
+                    'Origin': 'https://mobile.twitter.com',
+                    'Host': 'mobile.twitter.com',
                     'Content-Type': 'application/x-www-form-urlencoded',
-                    'Referer': full_status_url + '?p=v',
+                    'Referer': full_status_url,
                 }
             )
 
@@ -253,23 +277,19 @@ def main(argv):
             authenticity_token = soup.find('input', {'name': 'authenticity_token'}).get('value')
             form_input = {'show_media': 1, 'authenticity_token': authenticity_token, 'commit': 'Display media'}
 
-            print(full_status_url)
-            print(headers)
-            print(form_input)
-            print(full_status_page.cookies)
-
-            full_status_page = requests.post(full_status_url + '?p=v', data=form_input, headers=headers, cookies=full_status_page.cookies)
+            full_status_page = session.post(full_status_url, data=form_input, headers=headers)
 
             # Verify that download worked
-            assert twit_account_page.status_code == 200, \
+            assert full_status_page.status_code == 200, \
                 'The twitter page did not download correctly. Aborting'
 
             # DEBUG: Save page to file
-            of = open('full_status_page.html', 'w')
+            of = open('full_status_page_uncensored.html', 'w')
             of.write(full_status_page.text)
             of.close()
 
-            sys.exit(-1)
+            # Remake soup
+            soup = BeautifulSoup(full_status_page.text, 'html.parser')
 
         # Isolate table main-tweet
         tmt = soup.find('table', class_='main-tweet')
@@ -341,8 +361,8 @@ def main(argv):
         tweets.append(tweet)
 
         # DEBUG: Print extracted tweets
-        # for t in tweets:
-        #     print(t)
+        for t in tweets:
+            print(t)
 
     # **********************************************************
     # Iterate tweets. Check if the tweet has already been posted
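
A minimal standalone sketch of the session-based "No JavaScript" handling this patch introduces, for trying the flow outside twoot.py. One requests.Session carries the cookies from the first GET into the form resubmission, which is the whole point of the change. The account name 'example' is a placeholder, and the sketch assumes mobile.twitter.com still serves the legacy no-JS interstitial:

    # Standalone sketch of the session-based "No JavaScript" handling.
    # Assumptions: mobile.twitter.com still serves the legacy no-JS form;
    # 'example' is a placeholder account name.
    import requests
    from bs4 import BeautifulSoup
    from requests.compat import urljoin

    session = requests.Session()  # keeps and resends cookies automatically
    headers = requests.utils.default_headers()
    headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})

    page = session.get('https://mobile.twitter.com/example', headers=headers)
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    form = soup.form
    if form is not None and form.p is not None \
            and 'JavaScript is disabled' in str(form.p.string):
        # This is the interstitial: POST its form back to the action URL.
        # The cookies set by the first response ride along in the session.
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': page.request.url,
        })
        action = urljoin(page.url, form.get('action'))  # handle relative actions
        page = session.post(action, headers=headers)
        page.raise_for_status()

With a Session in place, the explicit cookies=page.cookies argument kept in handle_no_js is redundant: the session's cookie jar already holds those cookies, so passing them again is harmless but could be dropped.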
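
The sensitive-media branch follows the same shape. Below is a hedged sketch of that step as a helper function; session, headers, full_status_url (assumed, as in the patch, to already carry the '?p=v' query), and the fetched page HTML are taken as inputs, and the form field names mirror the patch:

    # Sketch of the "Display media" form resubmission for a status whose
    # picture is hidden as sensitive. The field names ('show_media',
    # 'authenticity_token', 'commit') mirror the patch; everything else
    # is a placeholder.
    from bs4 import BeautifulSoup

    def show_sensitive_media(session, headers, full_status_url, page_html):
        """Resubmit the 'Display media' form if present; return the page HTML."""
        soup = BeautifulSoup(page_html, 'html.parser')
        if soup.find('div', class_='accept-data') is None:
            return page_html  # nothing is censored on this page

        token = soup.find('input', {'name': 'authenticity_token'})
        if token is None:
            return page_html  # no form token; give up rather than crash

        form_input = {
            'show_media': 1,
            'authenticity_token': token.get('value'),
            'commit': 'Display media',
        }
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': full_status_url,
        })
        # The session resends the cookies obtained when the status page was fetched
        response = session.post(full_status_url, data=form_input, headers=headers)
        response.raise_for_status()
        return response.text

One caveat worth flagging: headers.update() mutates the shared dict, so the Content-Type and Referer set here leak into every later request made with the same headers; copying the dict per request would avoid that.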