From 6fb6a38732b6c3b13368f0d75a8123e7f35f40b2 Mon Sep 17 00:00:00 2001
From: jeancf
Date: Fri, 6 Mar 2020 17:40:13 +0100
Subject: [PATCH] Used session to manage cookies automatically

---
 twoot.py | 126 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 73 insertions(+), 53 deletions(-)

diff --git a/twoot.py b/twoot.py
index e7cd068..3dc9b00 100755
--- a/twoot.py
+++ b/twoot.py
@@ -32,14 +32,54 @@ from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalA
 
 # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
 USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/69.0',
+    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/73.0',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13 Safari/605.1.15',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 Edg/44.18362.329.0',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edge/44.18363.8131',
 ]
 
 #TODO log to file
 
+def handle_no_js(session, page, headers):
+    """
+    Check if the page is a "No JavaScript" page instead of the content that we wanted.
+    If it is, submit the form on the page as a POST request to get the correct page and return it.
+    :param session: current requests session
+    :param page: Response object to check
+    :param headers: HTTP headers used in initial request
+    :return: correct page (Response object)
+    """
+    # DEBUG: Save page to file
+    of = open('no_js_page.html', 'w')
+    of.write(page.text)
+    of.close()
+
+    # Set default return value
+    new_page = page
+
+    # Make soup
+    soup = BeautifulSoup(page.text, 'html.parser')
+
+    if soup.form is not None and soup.form.p is not None:
+        if 'JavaScript is disabled' in str(soup.form.p.string):
+            # Submit POST form response with cookies
+            headers.update(
+                {
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                    'Referer': page.request.url,
+                }
+            )
+
+            action = soup.form.get('action')
+
+            # Submit the form
+            new_page = session.post(action, headers=headers, cookies=page.cookies)
+
+            # Verify that download worked
+            assert (new_page.status_code == 200), 'The twitter page did not download correctly. Aborting'
+
+    return new_page
+
 def cleanup_tweet_text(tt_iter):
     '''
     Receives an iterator over all the elements contained in the tweet-text container.
@@ -150,6 +190,9 @@ def main(argv):
     # To store content of all tweets from this user
     tweets = []
 
+    # Initiate session
+    session = requests.Session()
+
     # Get a copy of the default headers that requests would use
     headers = requests.utils.default_headers()
 
@@ -162,40 +205,19 @@ def main(argv):
     url = 'https://mobile.twitter.com/' + twit_account
 
     # Download twitter page of user. We should get a 'no javascript' landing page and some cookies
-    no_js_page = requests.get(url, headers=headers)
-
-    # Verify that download worked
-    assert no_js_page.status_code == 200,\
-        'The twitter page did not download correctly. Aborting'
-
-    # DEBUG: Save page to file
-    #of = open('no_js_page.html', 'w')
-    #of.write(no_js_page.text)
-    #of.close()
-
-    # Verify that this is the no_js page that we expected
-    soup = BeautifulSoup(no_js_page.text, 'html.parser')
-    assert 'JavaScript is disabled' in str(soup.form.p.string),\
-        'this is not the no_js page we expected. Quitting'
-
-    # Submit POST form response with cookies
-    headers.update(
-        {
-            'Content-Type': 'application/x-www-form-urlencoded',
-            'Referer': url,
-        }
-    )
-
-    twit_account_page = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=no_js_page.cookies)
+    twit_account_page = session.get(url, headers=headers)
 
     # Verify that download worked
     assert twit_account_page.status_code == 200,\
         'The twitter page did not download correctly. Aborting'
 
+    # If we got a "No JavaScript" page, download the correct page
+    twit_account_page = handle_no_js(session, twit_account_page, headers)
+
     # DEBUG: Save page to file
-    #of = open(twit_account + '.html', 'w')
-    #of.write(twit_account_page.text)
-    #of.close()
+    of = open(twit_account + '.html', 'w')
+    of.write(twit_account_page.text)
+    of.close()
 
     # Make soup
     soup = BeautifulSoup(twit_account_page.text, 'html.parser')
@@ -213,20 +235,22 @@ def main(argv):
         tweet_id = str(status['href']).strip('?p=v')
 
         # Extract url of full status page
-        full_status_url = 'https://mobile.twitter.com' + tweet_id
+        full_status_url = 'https://mobile.twitter.com' + tweet_id + '?p=v'
 
         # fetch full status page
-        full_status_page = requests.get(full_status_url, cookies=twit_account_page.cookies)
-        # FIXME: For some funny reason the command above only works if I don't provide headers. If I do, I get the no_js page...
+        full_status_page = session.get(full_status_url, headers=headers)
 
         # Verify that download worked
-        assert twit_account_page.status_code == 200, \
+        assert full_status_page.status_code == 200, \
             'The twitter page did not download correctly. Aborting'
 
+        # If we got a "No JavaScript" page, download the correct page
+        full_status_page = handle_no_js(session, full_status_page, headers)
+
         # DEBUG: Save page to file
-        # of = open('full_status_page.html', 'w')
-        # of.write(full_status_page.text)
-        # of.close()
+        of = open('full_status_page.html', 'w')
+        of.write(full_status_page.text)
+        of.close()
 
         # Make soup
         soup = BeautifulSoup(full_status_page.text, 'html.parser')
@@ -236,16 +260,16 @@ def main(argv):
         assert contains_class(body_classes, 'tweets-show-page'), \
             'This is not the correct twitter page. Quitting'
 
-        # Check if tweet contains pic censored as "offensive media"
+        # Check if tweet contains a pic censored as "Sensitive material"
        if soup.find('div', class_='accept-data') is not None:
-            print('Censored pic found')
-
-            # TODO if it does, submit form to obtain uncensored tweet
+            # If it does, submit form to obtain uncensored tweet
             # Submit POST form response with cookies
             headers.update(
                 {
+                    'Origin': 'https://mobile.twitter.com',
+                    'Host': 'mobile.twitter.com',
                     'Content-Type': 'application/x-www-form-urlencoded',
-                    'Referer': full_status_url + '?p=v',
+                    'Referer': full_status_url,
                 }
             )
 
@@ -253,23 +277,19 @@ def main(argv):
             authenticity_token = soup.find('input', {'name': 'authenticity_token'}).get('value')
             form_input = {'show_media': 1, 'authenticity_token': authenticity_token, 'commit': 'Display media'}
 
-            print(full_status_url)
-            print(headers)
-            print(form_input)
-            print(full_status_page.cookies)
-
-            full_status_page = requests.post(full_status_url + '?p=v', data=form_input, headers=headers, cookies=full_status_page.cookies)
+            full_status_page = session.post(full_status_url, data=form_input, headers=headers)
 
             # Verify that download worked
-            assert twit_account_page.status_code == 200, \
+            assert full_status_page.status_code == 200, \
                 'The twitter page did not download correctly. Aborting'
 
             # DEBUG: Save page to file
-            of = open('full_status_page.html', 'w')
+            of = open('full_status_page_uncensored.html', 'w')
             of.write(full_status_page.text)
             of.close()
 
-            sys.exit(-1)
+            # Remake soup
+            soup = BeautifulSoup(full_status_page.text, 'html.parser')
 
         # Isolate table main-tweet
         tmt = soup.find('table', class_='main-tweet')
@@ -341,8 +361,8 @@ def main(argv):
         tweets.append(tweet)
 
         # DEBUG: Print extracted tweets
-        # for t in tweets:
-        #     print(t)
+        for t in tweets:
+            print(t)
 
     # **********************************************************
     # Iterate tweets. Check if the tweet has already been posted
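
A minimal standalone sketch of the session-based "No JavaScript" handling this patch introduces, for trying the flow outside twoot.py. One requests.Session carries the cookies from the first GET into the form resubmission, which is the whole point of the change. The account name 'example' is a placeholder, and the sketch assumes mobile.twitter.com still serves the legacy no-JS interstitial:

    # Standalone sketch of the session-based "No JavaScript" handling.
    # Assumptions: mobile.twitter.com still serves the legacy no-JS form;
    # 'example' is a placeholder account name.
    import requests
    from bs4 import BeautifulSoup
    from requests.compat import urljoin

    session = requests.Session()  # keeps and resends cookies automatically
    headers = requests.utils.default_headers()
    headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})

    page = session.get('https://mobile.twitter.com/example', headers=headers)
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    form = soup.form
    if form is not None and form.p is not None \
            and 'JavaScript is disabled' in str(form.p.string):
        # This is the interstitial: POST its form back to the action URL.
        # The cookies set by the first response ride along in the session.
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': page.request.url,
        })
        action = urljoin(page.url, form.get('action'))  # handle relative actions
        page = session.post(action, headers=headers)
        page.raise_for_status()

With a Session in place, the explicit cookies=page.cookies argument kept in handle_no_js is redundant: the session's cookie jar already holds those cookies, so passing them again is harmless but could be dropped.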
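
The sensitive-media branch follows the same shape. Below is a hedged sketch of that step as a helper function; session, headers, full_status_url (assumed, as in the patch, to already carry the '?p=v' query), and the fetched page HTML are taken as inputs, and the form field names mirror the patch:

    # Sketch of the "Display media" form resubmission for a status whose
    # picture is hidden as sensitive. The field names ('show_media',
    # 'authenticity_token', 'commit') mirror the patch; everything else
    # is a placeholder.
    from bs4 import BeautifulSoup

    def show_sensitive_media(session, headers, full_status_url, page_html):
        """Resubmit the 'Display media' form if present; return the page HTML."""
        soup = BeautifulSoup(page_html, 'html.parser')
        if soup.find('div', class_='accept-data') is None:
            return page_html  # nothing is censored on this page

        token = soup.find('input', {'name': 'authenticity_token'})
        if token is None:
            return page_html  # no form token; give up rather than crash

        form_input = {
            'show_media': 1,
            'authenticity_token': token.get('value'),
            'commit': 'Display media',
        }
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': full_status_url,
        })
        # The session resends the cookies obtained when the status page was fetched
        response = session.post(full_status_url, data=form_input, headers=headers)
        response.raise_for_status()
        return response.text

One caveat worth flagging: headers.update() mutates the shared dict, so the Content-Type and Referer set here leak into every later request made with the same headers; copying the dict per request would avoid that.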