Download page from nitter.net

This commit is contained in:
jeancf 2020-12-16 19:43:17 +01:00
parent 9fc76b9981
commit 894c13d551

View File

@ -44,48 +44,8 @@ USER_AGENTS = [
] ]
# Setup logging to file # Setup logging to file
logging.basicConfig(filename="twoot.log", level=logging.WARNING) logging.basicConfig(filename="twoot.log", level=logging.INFO)
logging.debug('*********** NEW RUN ***********') logging.info('*********** NEW RUN ***********')
def handle_no_js(session, page, headers):
    """
    Check if page is a "No Javascript" page instead of the content that we wanted.
    If it is, submit the form on the page as a POST request to get the correct
    page and return it.

    :param session: current requests session
    :param page: Response object to check
    :param headers: HTTP headers used in initial request (mutated in place:
                    Content-Type and Referer are added when the form is submitted)
    :return: correct page (Response object); the original page when it is not
             a "No Javascript" page
    :raises AssertionError: if the form POST does not return HTTP 200
    """
    # Set default return value: hand back the original page unchanged unless
    # we detect and resolve the "No Javascript" interstitial below.
    new_page = page

    # Make soup
    soup = BeautifulSoup(page.text, 'html.parser')

    # Guard against pages with no <form> at all: soup.form is None there,
    # and dereferencing soup.form.p would raise AttributeError.
    if soup.form is not None and soup.form.p is not None:
        if 'JavaScript is disabled' in str(soup.form.p.string):
            # Submit POST form response with cookies
            headers.update(
                {
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Referer': page.request.url,
                }
            )

            action = soup.form.get('action')

            # Submit the form, forwarding the cookies set on the landing page
            new_page = session.post(action, headers=headers, cookies=page.cookies)

            # Verify that download worked
            assert (new_page.status_code == 200), 'The twitter page did not download correctly. Aborting'

    return new_page
def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids): def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids):
@ -195,6 +155,7 @@ def contains_class(body_classes, some_class):
return found return found
def main(argv): def main(argv):
# Build parser for command line arguments # Build parser for command line arguments
@ -220,6 +181,8 @@ def main(argv):
max_age = float(args['a']) max_age = float(args['a'])
min_delay = float(args['d']) min_delay = float(args['d'])
logging.info('Updating ' + twit_account + ' on ' + mast_instance)
# Try to open database. If it does not exist, create it # Try to open database. If it does not exist, create it
sql = sqlite3.connect('twoot.db') sql = sqlite3.connect('twoot.db')
db = sql.cursor() db = sql.cursor()
@ -246,21 +209,22 @@ def main(argv):
} }
) )
url = 'https://mobile.twitter.com/' + twit_account url = 'https://nitter.net/' + twit_account
# Download twitter page of user. We should get a 'no javascript' landing page and some cookies # Download twitter page of user.
twit_account_page = session.get(url, headers=headers) twit_account_page = session.get(url, headers=headers)
# Verify that download worked # Verify that download worked
assert twit_account_page.status_code == 200,\ assert twit_account_page.status_code == 200,\
'The twitter page did not download correctly. Aborting' 'The twitter page did not download correctly. Aborting'
# If we got a No Javascript page, download the correct page logging.info('Page downloaded successfully')
twit_account_page = handle_no_js(session, twit_account_page, headers)
# DEBUG: Save page to file # DEBUG: Save page to file
#of = open(twit_account + '.html', 'w') of = open(twit_account + '.html', 'w')
#of.write(twit_account_page.text) of.write(twit_account_page.text)
#of.close() of.close()
exit(0)
# Make soup # Make soup
soup = BeautifulSoup(twit_account_page.text, 'html.parser') soup = BeautifulSoup(twit_account_page.text, 'html.parser')