Compare commits

...

5 Commits

Author SHA1 Message Date
jeancf
497d9f3a20 Removed lost print() statement 2023-07-16 11:52:09 +02:00
jeancf
4cd49a1de7 Add reply_to key to tweet dictionary 2023-07-16 11:43:26 +02:00
jeancf
9b9c7702f1 Modify toot processing for loop 2023-07-16 11:37:32 +02:00
jeancf
3fab787738 get_timeline and get_rest_of_thread return tuple 2023-07-16 11:36:05 +02:00
jeancf
9bdcccf713 Modify call to get_rest_of_thread 2023-07-16 11:18:21 +02:00

View File

@ -169,12 +169,14 @@ Download page with full thread of tweets and extract all replied to tweet referen
Only used by `get_timeline()`.
:param session: Existing HTTP session with Nitter instance
:param headers: HTTP headers to use
:param url: url of the thread page to download
:return: List of tweets from the thread
:param nitter url: url of the nitter instance to use
:param thread_url: url of the first tweet in thread
:return: list of tuples with url of tweet replied-to (or None) and content of tweet
"""
def _get_rest_of_thread(session, headers, url):
def _get_rest_of_thread(session, headers, nitter_url, thread_url):
logging.debug("Downloading tweets in thread from separate page")
# Download page with thread
url = nitter_url + thread_url
try:
thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
except requests.exceptions.ConnectionError:
@ -201,14 +203,25 @@ def _get_rest_of_thread(session, headers, url):
# Get all items in thread after main tweet
after_tweet = soup.find('div', 'after-tweet')
list = after_tweet.find_all('div', class_='timeline-item')
# Build timeline of tuples
timeline = []
previous_tweet_url = thread_url
for item in list:
timeline.append((previous_tweet_url, item))
# Get the url of the tweet
previous_tweet_url = item.find('a', class_='tweet-link')
if previous_tweet_url is None:
logging.error('Thread tweet is missing link tag')
timeline = after_tweet.find_all('div', class_='timeline-item')
return timeline
"""
Download page with full thread of tweets. Only used by `get_timeline()`.
:param url: url of the thread page to download
:return: List of tweets from the thread
Download timeline of twitter account
:param url: url of the account page to download
:return: list of tuples with url of tweet replied-to (or None) and content of tweet
"""
def get_timeline(nitter_url):
# Define url to use
@ -268,17 +281,20 @@ def get_timeline(nitter_url):
for item in list:
classes = item['class']
if 'timeline-item' in classes: # Individual tweet
timeline.append(item)
timeline.append((None, item))
elif 'thread-line' in classes: # First tweet of a thread
# Get the first item of thread
first_item = item.find('div', class_='timeline-item')
timeline.append(first_item)
# Get the rest of the items of the thread
# Get the url of the tweet
thread_link_tag = item.find('a', class_='tweet-link')
if thread_link_tag is not None:
thread_url = thread_link_tag.get('href')
timeline.extend(_get_rest_of_thread(session, headers, nitter_url + thread_url))
timeline.append((thread_url, first_item))
# Get the rest of the items of the thread
timeline.extend(_get_rest_of_thread(session, headers, nitter_url, thread_url))
else:
# Ignore other classes
continue
@ -877,7 +893,6 @@ def main(argv):
log_level = logging.INFO
elif ll_str == "WARNING":
log_level = logging.WARNING
print('log level warning set')
elif ll_str == "ERROR":
log_level = logging.ERROR
elif ll_str == "CRITICAL":
@ -944,7 +959,7 @@ def main(argv):
tweets = []
out_date_cnt = 0
in_db_cnt = 0
for status in timeline:
for reply_to, status in timeline:
# Extract tweet ID and status ID
tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
status_id = tweet_id.split('/')[3]
@ -1107,6 +1122,7 @@ def main(argv):
"tweet_text": tweet_text,
"video": video_file,
"photos": photos,
"reply-to": reply_to,
}
tweets.append(tweet)