diff --git a/twoot.py b/twoot.py index 579c682..2807bde 100755 --- a/twoot.py +++ b/twoot.py @@ -171,7 +171,7 @@ Only used by `get_timeline()`. :param headers: HTTP headers to use :param nitter url: url of the nitter instance to use :param thread_url: url of the first tweet in thread -:return: List of tweets from the thread +:return: list of tuples with url of tweet replied-to (or None) and content of tweet """ def _get_rest_of_thread(session, headers, nitter_url, thread_url): logging.debug("Downloading tweets in thread from separate page") @@ -203,14 +203,25 @@ def _get_rest_of_thread(session, headers, nitter_url, thread_url): # Get all items in thread after main tweet after_tweet = soup.find('div', 'after-tweet') + list = after_tweet.find_all('div', class_='timeline-item') + + # Build timeline of tuples + timeline = [] + previous_tweet_url = thread_url + for item in list: + timeline.append((previous_tweet_url, item)) + # Get the url of the tweet + previous_tweet_url = item.find('a', class_='tweet-link') + if previous_tweet_url is None: + logging.error('Thread tweet is missing link tag') - timeline = after_tweet.find_all('div', class_='timeline-item') return timeline + """ -Dowload page with full thread of tweets. Only used by `get_timeline()`. -:param url: url of the thread page to download -:return: List of tweets from the thread +Download timeline of twitter account +:param url: url of the account page to download +:return: list of tuples with url of tweet replied-to (or None) and content of tweet """ def get_timeline(nitter_url): # Define url to use @@ -270,16 +281,19 @@ def get_timeline(nitter_url): for item in list: classes = item['class'] if 'timeline-item' in classes: # Individual tweet - timeline.append(item) + timeline.append((None, item)) elif 'thread-line' in classes: # First tweet of a thread # Get the first item of thread first_item = item.find('div', class_='timeline-item') - timeline.append(first_item) - # Get the rest of the items of the thread + # Get the url of the tweet thread_link_tag = item.find('a', class_='tweet-link') if thread_link_tag is not None: thread_url = thread_link_tag.get('href') + + timeline.append((thread_url, first_item)) + + # Get the rest of the items of the thread timeline.extend(_get_rest_of_thread(session, headers, nitter_url, thread_url)) else: # Ignore other classes