get_timeline and _get_rest_of_thread return lists of tuples

2025-05-09 04:53:56 +00:00 · 2023-07-16 11:36:05 +02:00 · 2023-07-16 11:36:05 +02:00 · 3fab787738
commit 3fab787738
parent 9bdcccf713
1 changed file with 22 additions and 8 deletions
--- a/twoot.py
+++ b/twoot.py
@ -171,7 +171,7 @@ Only used by `get_timeline()`.
 :param headers: HTTP headers to use
 :param nitter_url: url of the nitter instance to use
 :param thread_url: url of the first tweet in thread
-:return: List of tweets from the thread
+:return: list of tuples with url of tweet replied-to (or None) and content of tweet
 """
 def _get_rest_of_thread(session, headers, nitter_url, thread_url):
    logging.debug("Downloading tweets in thread from separate page")
@ -203,14 +203,25 @@ def _get_rest_of_thread(session, headers, nitter_url, thread_url):

    # Get all items in thread after main tweet
    after_tweet = soup.find('div', 'after-tweet')
+    list = after_tweet.find_all('div', class_='timeline-item')
+
+    # Build timeline of tuples
+    timeline = []
+    previous_tweet_url = thread_url
+    for item in list:
+            timeline.append((previous_tweet_url, item))
+            # Get the url of the tweet
+            previous_tweet_url = item.find('a', class_='tweet-link')
+            if previous_tweet_url is  None:
+                logging.error('Thread tweet is missing link tag')

-    timeline = after_tweet.find_all('div', class_='timeline-item')
    return timeline

+
 """
-Dowload page with full thread of tweets. Only used by `get_timeline()`.
-:param url: url of the thread page to download
-:return: List of tweets from the thread
+Download timeline of twitter account
+:param nitter_url: url of the nitter instance to use
+:return: list of tuples with url of tweet replied-to (or None) and content of tweet
 """
 def get_timeline(nitter_url):
    # Define url to use
@ -270,16 +281,19 @@ def get_timeline(nitter_url):
    for item in list:
        classes = item['class']
        if 'timeline-item' in classes:  # Individual tweet
-            timeline.append(item)
+            timeline.append((None, item))
        elif 'thread-line' in classes:  # First tweet of a thread
            # Get the first item of thread
            first_item = item.find('div', class_='timeline-item')
-            timeline.append(first_item)

-            # Get the rest of the items of the thread
+            # Get the url of the tweet
            thread_link_tag = item.find('a', class_='tweet-link')
            if thread_link_tag is not None:
                thread_url = thread_link_tag.get('href')
+
+            timeline.append((thread_url, first_item))
+
+            # Get the rest of the items of the thread
            timeline.extend(_get_rest_of_thread(session, headers, nitter_url, thread_url))
        else:
            # Ignore other classes