Added handling of no_js landing page

2025-05-07 03:53:56 +00:00 · 2020-02-13 18:01:45 +01:00 · 2020-02-13 18:01:45 +01:00 · 446f39f173
commit 446f39f173
parent fdab0a0836
1 changed files with 32 additions and 10 deletions
--- a/twoot.py
+++ b/twoot.py
@@ -142,21 +142,43 @@ def main(argv):
        }
    )

-    # Download twitter page of user
-    response = requests.get('https://twitter.com/' + twit_account, headers=headers)
+    url = 'https://mobile.twitter.com/' + twit_account
+    # Download twitter page of user. We should get a 'no javascript' landing page and some cookies
+    r1 = requests.get(url, headers=headers)

-    ## DEBUG: Save page to file
-    #of = open('twitter.html', 'w')
-    #of.write(response.text)
-    #of.close()
+    # DEBUG: Save page to file
+    of = open('no_js_page.html', 'w')
+    of.write(r1.text)
+    of.close()
+
+    # Verify that this is the no_js page that we expected
+    soup = BeautifulSoup(r1.text, 'html.parser')
+    assert (str(soup.form.p.string).find('JavaScript is disabled') != -1),\
+        'this is not the no_js page we expected. Quitting'
+
+    # Submit POST form response with cookies
+    headers.update(
+        {
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Referer': url,
+        }
+    )
+
+    response = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=r1.cookies)
+
+    # DEBUG: Save page to file
+    of = open('twitter.html', 'w')
+    of.write(response.text)
+    of.close()

    # Verify that download worked
-    if response.status_code != 200:
-        print("Could not download twitter timeline. Aborting.")
-        exit(-1)
+    assert response.status_code == 200,\
+        'The twitter page did not download correctly. Aborting'

-    # Build tree of html elements for processing
+    # Verify that we now have the correct twitter page
    soup = BeautifulSoup(response.text, 'html.parser')
+    assert (str(soup.head.title.string).find(twit_account) != -1),\
+        'This is not the correct twitter page. Quitting'

    # Extract twitter timeline
    results = soup.find_all('div', class_='content')