Used session to manage cookies automatically
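Note on the change: requests.Session keeps the cookies set by earlier responses and resends them on later requests, so the cookies= argument no longer has to be threaded through every call by hand. A minimal sketch of the difference (the URLs are illustrative, not from twoot.py):

import requests

# Without a session: cookies must be captured and forwarded manually
landing = requests.get('https://example.com/landing')
content = requests.get('https://example.com/content', cookies=landing.cookies)

# With a session: the cookie jar is filled and reused automatically
session = requests.Session()
session.get('https://example.com/landing')   # Set-Cookie values stored in session.cookies
session.get('https://example.com/content')   # stored cookies sent along automatically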

jeancf 2020-03-06 17:40:13 +01:00
parent fd9130c053
commit 6fb6a38732

twoot.py

@@ -32,14 +32,54 @@ from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalA
 # Update from https://www.whatismybrowser.com/guides/the-latest-user-agent/
 USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/69.0',
+    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/73.0',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13 Safari/605.1.15',
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 Edg/44.18362.329.0',
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edge/44.18363.8131',
 ]
 
 #TODO log to file
 
+
+def handle_no_js(session, page, headers):
+    """
+    Check if page is a "No Javascript" page instead of the content that we wanted
+    If it is, submit the form on the page as POST request to get the correct page and return it
+    :param session: current requests session
+    :param page: Response object to check
+    :param headers: HTTP headers used in initial request
+    :return: correct page (Response object)
+    """
+    # DEBUG: Save page to file
+    of = open('no_js_page.html', 'w')
+    of.write(page.text)
+    of.close()
+
+    # Set default return value
+    new_page = page
+
+    # Make soup
+    soup = BeautifulSoup(page.text, 'html.parser')
+
+    if soup.form.p is not None:
+        if 'JavaScript is disabled' in str(soup.form.p.string):
+            # Submit POST form response with cookies
+            headers.update(
+                {
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                    'Referer': page.request.url,
+                }
+            )
+
+            action = soup.form.get('action')
+
+            # Submit the form
+            new_page = session.post(action, headers=headers, cookies=page.cookies)
+
+            # Verify that download worked
+            assert (new_page.status_code == 200), 'The twitter page did not download correctly. Aborting'
+
+    return new_page
+
+
 def cleanup_tweet_text(tt_iter):
     '''
     Receives an iterator over all the elements contained in the tweet-text container.
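Note on the new handle_no_js() helper above: since the session already replays stored cookies, the explicit cookies=page.cookies argument inside it is belt-and-braces rather than strictly required. A hypothetical call sequence (the account name in the URL is illustrative):

import requests

session = requests.Session()
headers = requests.utils.default_headers()
page = session.get('https://mobile.twitter.com/example', headers=headers)
# Returns the real page if the no-JS form was submitted, or the input page unchanged
page = handle_no_js(session, page, headers)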
@@ -150,6 +190,9 @@ def main(argv):
     # To store content of all tweets from this user
     tweets = []
 
+    # Initiate session
+    session = requests.Session()
+
     # Get a copy of the default headers that requests would use
     headers = requests.utils.default_headers()
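Note: requests.utils.default_headers() returns a mutable CaseInsensitiveDict, so individual headers can be overridden before the first request is made. A sketch of how one of the USER_AGENTS entries defined at the top of the file could be applied (the random choice is illustrative, not part of this hunk):

import random
import requests

headers = requests.utils.default_headers()
# Replace the default 'python-requests/x.y.z' agent with a browser-like one
headers.update({'User-Agent': random.choice(USER_AGENTS)})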
@@ -162,40 +205,19 @@ def main(argv):
     url = 'https://mobile.twitter.com/' + twit_account
 
     # Download twitter page of user. We should get a 'no javascript' landing page and some cookies
-    no_js_page = requests.get(url, headers=headers)
-
-    # Verify that download worked
-    assert no_js_page.status_code == 200,\
-        'The twitter page did not download correctly. Aborting'
-
-    # DEBUG: Save page to file
-    #of = open('no_js_page.html', 'w')
-    #of.write(no_js_page.text)
-    #of.close()
-
-    # Verify that this is the no_js page that we expected
-    soup = BeautifulSoup(no_js_page.text, 'html.parser')
-    assert 'JavaScript is disabled' in str(soup.form.p.string),\
-        'this is not the no_js page we expected. Quitting'
-
-    # Submit POST form response with cookies
-    headers.update(
-        {
-            'Content-Type': 'application/x-www-form-urlencoded',
-            'Referer': url,
-        }
-    )
-
-    twit_account_page = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=no_js_page.cookies)
+    twit_account_page = session.get(url, headers=headers)
 
     # Verify that download worked
     assert twit_account_page.status_code == 200,\
         'The twitter page did not download correctly. Aborting'
 
+    # If we got a No Javascript page, download the correct page
+    twit_account_page = handle_no_js(session, twit_account_page, headers)
+
     # DEBUG: Save page to file
-    #of = open(twit_account + '.html', 'w')
-    #of.write(twit_account_page.text)
-    #of.close()
+    of = open(twit_account + '.html', 'w')
+    of.write(twit_account_page.text)
+    of.close()
 
     # Make soup
     soup = BeautifulSoup(twit_account_page.text, 'html.parser')
@@ -213,20 +235,22 @@ def main(argv):
         tweet_id = str(status['href']).strip('?p=v')
 
         # Extract url of full status page
-        full_status_url = 'https://mobile.twitter.com' + tweet_id
+        full_status_url = 'https://mobile.twitter.com' + tweet_id + '?p=v'
 
         # fetch full status page
-        full_status_page = requests.get(full_status_url, cookies=twit_account_page.cookies)
-        # FIXME: For some funny reason the command above only works if I don't provide headers. If I do, I get the no_js page...
+        full_status_page = session.get(full_status_url, headers=headers)
 
         # Verify that download worked
-        assert twit_account_page.status_code == 200, \
+        assert full_status_page.status_code == 200, \
             'The twitter page did not download correctly. Aborting'
 
+        # If we got a No Javascript page, download the correct page
+        full_status_page = handle_no_js(session, full_status_page, headers)
+
         # DEBUG: Save page to file
-        # of = open('full_status_page.html', 'w')
-        # of.write(full_status_page.text)
-        # of.close()
+        of = open('full_status_page.html', 'w')
+        of.write(full_status_page.text)
+        of.close()
 
         # Make soup
         soup = BeautifulSoup(full_status_page.text, 'html.parser')
@@ -236,16 +260,16 @@ def main(argv):
         assert contains_class(body_classes, 'tweets-show-page'), \
             'This is not the correct twitter page. Quitting'
 
-        # Check if tweet contains pic censored as "offensive media"
+        # Check if tweet contains pic censored as "Sensitive material"
         if soup.find('div', class_='accept-data') is not None:
             print('Censored pic found')
-            # TODO if it does, submit form to obtain uncensored tweet
+            # If it does, submit form to obtain uncensored tweet
             # Submit POST form response with cookies
             headers.update(
                 {
                     'Origin': 'https://mobile.twitter.com',
                     'Host': 'mobile.twitter.com',
                     'Content-Type': 'application/x-www-form-urlencoded',
-                    'Referer': full_status_url + '?p=v',
+                    'Referer': full_status_url,
                 }
             )
@@ -253,23 +277,19 @@ def main(argv):
             authenticity_token = soup.find('input', {'name': 'authenticity_token'}).get('value')
 
             form_input = {'show_media': 1, 'authenticity_token': authenticity_token, 'commit': 'Display media'}
 
-            print(full_status_url)
-            print(headers)
-            print(form_input)
-            print(full_status_page.cookies)
-
-            full_status_page = requests.post(full_status_url + '?p=v', data=form_input, headers=headers, cookies=full_status_page.cookies)
+            full_status_page = session.post(full_status_url + '?p=v', data=form_input, headers=headers)
 
             # Verify that download worked
-            assert twit_account_page.status_code == 200, \
+            assert full_status_page.status_code == 200, \
                 'The twitter page did not download correctly. Aborting'
 
             # DEBUG: Save page to file
-            of = open('full_status_page.html', 'w')
+            of = open('full_status_page_uncensored.html', 'w')
             of.write(full_status_page.text)
             of.close()
 
-            sys.exit(-1)
-
             # Remake soup
             soup = BeautifulSoup(full_status_page.text, 'html.parser')
 
         # Isolate table main-tweet
         tmt = soup.find('table', class_='main-tweet')
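Note on the uncensoring step above: it is a plain HTML form replay. The hidden authenticity_token input is read from the censored page and posted back together with show_media=1, and the session resends the cookies received with the GET. A condensed sketch of the round trip (the URL is illustrative; field names are as in the diff):

from bs4 import BeautifulSoup
import requests

session = requests.Session()
page = session.get('https://mobile.twitter.com/example/status/1?p=v')  # illustrative URL
soup = BeautifulSoup(page.text, 'html.parser')

token = soup.find('input', {'name': 'authenticity_token'})
if token is not None:
    form_input = {
        'show_media': 1,
        'authenticity_token': token.get('value'),
        'commit': 'Display media',
    }
    # Cookies from the GET above are resent automatically by the session
    page = session.post(page.url, data=form_input)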
@@ -341,8 +361,8 @@ def main(argv):
         tweets.append(tweet)
 
     # DEBUG: Print extracted tweets
-    # for t in tweets:
-    #    print(t)
+    for t in tweets:
+        print(t)
 
     # **********************************************************
     # Iterate tweets. Check if the tweet has already been posted
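Note on the repeated status checks: the assert page.status_code == 200 pattern silently disappears when Python runs with -O, because asserts are stripped in optimized mode. Response.raise_for_status() is an always-on alternative that raises requests.exceptions.HTTPError for 4xx/5xx responses; a sketch:

import requests

session = requests.Session()
resp = session.get('https://mobile.twitter.com/example')  # illustrative URL
resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx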