diff --git a/twoot.py b/twoot.py index 56242a9..14a7503 100755 --- a/twoot.py +++ b/twoot.py @@ -192,7 +192,58 @@ def build_config(args): exit(-1) -def get_timeline(url): +""" +Download page with full thread of tweets and extract all the replied-to tweets referenced by url. +Only used by `get_timeline()`. +:param session: Existing HTTP session with Nitter instance +:param headers: HTTP headers to use +:param url: url of the thread page to download +:return: List of tweets from the thread +""" +def _get_rest_of_thread(session, headers, url): + # Download page with thread + try: + thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT) + except requests.exceptions.ConnectionError: + logging.fatal('Host did not respond when trying to download ' + url) + shutdown(-1) + except requests.exceptions.Timeout: + logging.fatal(url + ' took too long to respond') + shutdown(-1) + + # Verify that download worked + if thread_page.status_code != 200: + logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str( + thread_page.status_code) + '). Aborting') + shutdown(-1) + + logging.debug('Nitter page downloaded successfully from ' + url) + + # DEBUG: Save page to file + # of = open('thread_page_debug.html', 'w') + # of.write(thread_page.text) + # of.close() + + # Make soup + soup = BeautifulSoup(thread_page.text, 'html.parser') + + # Get all items in thread + timeline = soup.find_all('div', class_='timeline-item') + return timeline + +""" +Download the timeline page of the twitter account and extract its tweets, including those of threads. 
+:param nitter_url: url of the Nitter instance to download the timeline from +:return: Soup of the downloaded page and list of tweets from the timeline +""" +def get_timeline(nitter_url): + # Define url to use + url = nitter_url + '/' + TOML['config']['twitter_account'] + + # Use different page if we need to handle replies + if TOML['options']['post_reply_to']: + url += '/with_replies' + # Initiate session session = requests.Session() @@ -226,27 +277,37 @@ def get_timeline(url): logging.debug('Nitter page downloaded successfully from ' + url) # DEBUG: Save page to file - # of = open(TOML['config']['twitter_account'] + '.html', 'w') + # of = open('user_page_debug.html', 'w') # of.write(twit_account_page.text) # of.close() # Make soup soup = BeautifulSoup(twit_account_page.text, 'html.parser') - # Extract twitter timeline + # Get the div containing tweets + tl = soup.find('div', class_='timeline') + + # Get the list of direct children of timeline + list = tl.find_all('div', recursive=False) + timeline = [] - - # Get all the items from the timeline - list = soup.find_all('div', class_='timeline-item') - for item in list: classes = item['class'] - if 'more-replies-thread' in classes: - logging.debug('found a more-replies-thread item') - else: + if 'timeline-item' in classes: timeline.append(item) + elif 'thread-line' in classes: + # Get the first item of thread + first_item = item.find('div', class_='timeline-item') + timeline.append(first_item) - + # Get the rest of the items of the thread + thread_link_tag = item.find('a', class_='tweet-link') + if thread_link_tag is not None: + thread_url = thread_link_tag.get('href') + timeline.extend(_get_rest_of_thread(session, headers, nitter_url + thread_url)) + else: + # Ignore other classes + continue return soup, timeline @@ -899,12 +960,7 @@ def main(argv): # To store content of all tweets from this user tweets = [] - url = nitter_url + '/' + TOML['config']['twitter_account'] - # Use different page if we need to handle replies - if TOML['options']['post_reply_to']: - url += 
'/with_replies' - - soup, timeline = get_timeline(url) + soup, timeline = get_timeline(nitter_url) logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')