mirror of
https://gitlab.com/jeancf/twoot.git
synced 2024-12-18 08:10:21 +00:00
Download page from nitter.net
This commit is contained in:
parent
9fc76b9981
commit
894c13d551
62
twoot.py
62
twoot.py
|
@ -44,48 +44,8 @@ USER_AGENTS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
# Setup logging to file
|
# Setup logging to file
|
||||||
logging.basicConfig(filename="twoot.log", level=logging.WARNING)
|
logging.basicConfig(filename="twoot.log", level=logging.INFO)
|
||||||
logging.debug('*********** NEW RUN ***********')
|
logging.info('*********** NEW RUN ***********')
|
||||||
|
|
||||||
def handle_no_js(session, page, headers):
|
|
||||||
"""
|
|
||||||
Check if page is a "No Javascript" page instead of the content that we wanted
|
|
||||||
If it is, submit the form on the page as POST request to get the correct page and return it
|
|
||||||
:param session: current requests session
|
|
||||||
:param page: Response object to check
|
|
||||||
:param headers: HTTP headers used in initial request
|
|
||||||
:return: correct page (Response object)
|
|
||||||
"""
|
|
||||||
# DEBUG: Save page to file
|
|
||||||
#of = open('no_js_page.html', 'w')
|
|
||||||
#of.write(page.text)
|
|
||||||
#of.close()
|
|
||||||
|
|
||||||
# Set default return value
|
|
||||||
new_page = page
|
|
||||||
|
|
||||||
# Make soup
|
|
||||||
soup = BeautifulSoup(page.text, 'html.parser')
|
|
||||||
|
|
||||||
if soup.form.p is not None:
|
|
||||||
if 'JavaScript is disabled' in str(soup.form.p.string):
|
|
||||||
# Submit POST form response with cookies
|
|
||||||
headers.update(
|
|
||||||
{
|
|
||||||
'Content-Type': 'application/x-www-form-urlencoded',
|
|
||||||
'Referer': page.request.url,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
action = soup.form.get('action')
|
|
||||||
|
|
||||||
# Submit the form
|
|
||||||
new_page = session.post(action, headers=headers, cookies=page.cookies)
|
|
||||||
|
|
||||||
# Verify that download worked
|
|
||||||
assert (new_page.status_code == 200), 'The twitter page did not download correctly. Aborting'
|
|
||||||
|
|
||||||
return new_page
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids):
|
def cleanup_tweet_text(tt_iter, twit_account, status_id, tweet_uri, get_vids):
|
||||||
|
@ -195,6 +155,7 @@ def contains_class(body_classes, some_class):
|
||||||
|
|
||||||
return found
|
return found
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
|
|
||||||
# Build parser for command line arguments
|
# Build parser for command line arguments
|
||||||
|
@ -220,6 +181,8 @@ def main(argv):
|
||||||
max_age = float(args['a'])
|
max_age = float(args['a'])
|
||||||
min_delay = float(args['d'])
|
min_delay = float(args['d'])
|
||||||
|
|
||||||
|
logging.info('Updating ' + twit_account + ' on ' + mast_instance)
|
||||||
|
|
||||||
# Try to open database. If it does not exist, create it
|
# Try to open database. If it does not exist, create it
|
||||||
sql = sqlite3.connect('twoot.db')
|
sql = sqlite3.connect('twoot.db')
|
||||||
db = sql.cursor()
|
db = sql.cursor()
|
||||||
|
@ -246,21 +209,22 @@ def main(argv):
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
url = 'https://mobile.twitter.com/' + twit_account
|
url = 'https://nitter.net/' + twit_account
|
||||||
# Download twitter page of user. We should get a 'no javascript' landing page and some cookies
|
# Download twitter page of user.
|
||||||
twit_account_page = session.get(url, headers=headers)
|
twit_account_page = session.get(url, headers=headers)
|
||||||
|
|
||||||
# Verify that download worked
|
# Verify that download worked
|
||||||
assert twit_account_page.status_code == 200,\
|
assert twit_account_page.status_code == 200,\
|
||||||
'The twitter page did not download correctly. Aborting'
|
'The twitter page did not download correctly. Aborting'
|
||||||
|
|
||||||
# If we got a No Javascript page, download the correct page
|
logging.info('Page downloaded successfully')
|
||||||
twit_account_page = handle_no_js(session, twit_account_page, headers)
|
|
||||||
|
|
||||||
# DEBUG: Save page to file
|
# DEBUG: Save page to file
|
||||||
#of = open(twit_account + '.html', 'w')
|
of = open(twit_account + '.html', 'w')
|
||||||
#of.write(twit_account_page.text)
|
of.write(twit_account_page.text)
|
||||||
#of.close()
|
of.close()
|
||||||
|
|
||||||
|
exit(0)
|
||||||
|
|
||||||
# Make soup
|
# Make soup
|
||||||
soup = BeautifulSoup(twit_account_page.text, 'html.parser')
|
soup = BeautifulSoup(twit_account_page.text, 'html.parser')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user