Download page from nitter.net

2025-04-26 22:53:35 +00:00 · 2020-12-16 19:43:17 +01:00 · 2020-12-16 19:43:17 +01:00 · 894c13d551
commit 894c13d551
parent 9fc76b9981
1 changed file with 13 additions and 49 deletions
--- a/twoot.py
+++ b/twoot.py
@@ -44,48 +44,8 @@ USER_AGENTS = [
    ]
 # Setup logging to file
-logging.basicConfig(filename="twoot.log", level=logging.WARNING)
+logging.basicConfig(filename="twoot.log", level=logging.INFO)
-logging.debug('*********** NEW RUN ***********')
+logging.info('*********** NEW RUN ***********')
 def handle_no_js(session, page, headers):
    """
    Return the real content page when the server answered with a "No Javascript" page.

    The response body is parsed; if its first form contains a paragraph whose
    text includes 'JavaScript is disabled', the form is submitted as a POST
    request and the response to that request is returned instead. In every
    other case (including a page that has no form at all) the response that
    was passed in is returned unchanged.

    Note: when the form is submitted, `headers` is updated in place with the
    'Content-Type' and 'Referer' values used for the POST.

    :param session: current requests session
    :param page: Response object to check
    :param headers: HTTP headers used in initial request
    :return: correct page (Response object)
    :raises AssertionError: if the POST response status code is not 200
    """
    # Make soup
    soup = BeautifulSoup(page.text, 'html.parser')

    # soup.form is None when the page has no <form>; guard before
    # dereferencing it so a regular content page is passed through
    # instead of raising AttributeError.
    form = soup.form
    if form is None or form.p is None:
        return page
    if 'JavaScript is disabled' not in str(form.p.string):
        return page

    # Submit POST form response with cookies
    headers.update(
        {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': page.request.url,
        }
    )
    new_page = session.post(form.get('action'), headers=headers, cookies=page.cookies)

    # Verify that download worked. Raised explicitly (same exception type as
    # before) so the check is still performed when Python runs with -O,
    # which strips assert statements.
    if new_page.status_code != 200:
        raise AssertionError('The twitter page did not download correctly. Aborting')
    return new_page
 def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids):
@@ -195,6 +155,7 @@ def contains_class(body_classes, some_class):
    return found
 def main(argv):
    # Build parser for command line arguments
@@ -220,6 +181,8 @@ def main(argv):
    max_age = float(args['a'])
    min_delay = float(args['d'])
    logging.info('Updating ' + twit_account + ' on ' + mast_instance)
    # Try to open database. If it does not exist, create it
    sql = sqlite3.connect('twoot.db')
    db = sql.cursor()
@@ -246,21 +209,22 @@ def main(argv):
        }
    )
-    url = 'https://mobile.twitter.com/' + twit_account
+    url = 'https://nitter.net/' + twit_account
-    # Download twitter page of user. We should get a 'no javascript' landing page and some cookies
+    # Download twitter page of user.
    twit_account_page = session.get(url, headers=headers)
    # Verify that download worked
    assert twit_account_page.status_code == 200,\
        'The twitter page did not download correctly. Aborting'
-    # If we got a No Javascript page, download the correct page
+    logging.info('Page downloaded successfully')
    twit_account_page = handle_no_js(session, twit_account_page, headers)
    # DEBUG: Save page to file
-    #of = open(twit_account + '.html', 'w')
+    of = open(twit_account + '.html', 'w')
-    #of.write(twit_account_page.text)
+    of.write(twit_account_page.text)
-    #of.close()
+    of.close()
    exit(0)
    # Make soup
    soup = BeautifulSoup(twit_account_page.text, 'html.parser')