Compare commits

...

4 Commits

Author SHA1 Message Date
jeancf
a4f3934d86 Fix indentation 2023-07-13 15:44:37 +02:00
jeancf
29c7457644 Add some log messages 2023-07-13 13:32:38 +02:00
jeancf
cdbb1bb8f2 Fine tune thread download 2023-07-13 11:53:07 +02:00
jeancf
5939484160 Complete get_timeline() 2023-07-13 11:36:04 +02:00

102
twoot.py
View File

@ -192,7 +192,60 @@ def build_config(args):
exit(-1)
def get_timeline(url):
"""
Download the page with the full thread of tweets and extract all replied-to tweets referenced by url.
Only used by `get_timeline()`.
:param session: Existing HTTP session with Nitter instance
:param headers: HTTP headers to use
:param url: url of the thread page to download
:return: List of tweets from the thread
"""
def _get_rest_of_thread(session, headers, url):
    """
    Download the page containing the full thread of tweets and return the
    timeline items that come after the main tweet. Only used by `get_timeline()`.
    :param session: Existing HTTP session with Nitter instance
    :param headers: HTTP headers to use
    :param url: url of the thread page to download
    :return: List of tweet elements (bs4 Tags) from the thread
    """
    logging.debug("Downloading tweets in thread from separate page")
    # Download page with thread; a network failure is fatal for the run
    try:
        thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(url + ' took too long to respond')
        shutdown(-1)

    # Verify that download worked
    if thread_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(thread_page.status_code) + '). Aborting')
        shutdown(-1)

    logging.debug('Nitter page downloaded successfully from ' + url)

    # Make soup
    soup = BeautifulSoup(thread_page.text, 'html.parser')

    # Get all items in thread after main tweet.
    # Guard against an unexpected page layout: find() returns None when the
    # 'after-tweet' section is absent, which would otherwise crash on find_all().
    after_tweet = soup.find('div', 'after-tweet')
    if after_tweet is None:
        logging.warning('No after-tweet section found in ' + url + ', skipping thread')
        return []
    return after_tweet.find_all('div', class_='timeline-item')
"""
Download page with full thread of tweets. Only used by `get_timeline()`.
:param url: url of the thread page to download
:return: List of tweets from the thread
"""
def get_timeline(nitter_url):
# Define url to use
url = nitter_url + '/' + TOML['config']['twitter_account']
# Use different page if we need to handle replies
if TOML['options']['post_reply_to']:
url += '/with_replies'
# Initiate session
session = requests.Session()
@ -226,27 +279,37 @@ def get_timeline(url):
logging.debug('Nitter page downloaded successfully from ' + url)
# DEBUG: Save page to file
# of = open(TOML['config']['twitter_account'] + '.html', 'w')
# of = open('user_page_debug.html', 'w')
# of.write(twit_account_page.text)
# of.close()
# Make soup
soup = BeautifulSoup(twit_account_page.text, 'html.parser')
# Extract twitter timeline
# Get the div containing tweets
tl = soup.find('div', class_='timeline')
# Get the list of direct children of timeline
list = tl.find_all('div', recursive=False)
timeline = []
# Get all the items from the timeline
list = soup.find_all('div', class_='timeline-item')
for item in list:
classes = item['class']
if 'more-replies-thread' in classes:
logging.debug('found a more-replies-thread item')
else:
if 'timeline-item' in classes:
timeline.append(item)
elif 'thread-line' in classes:
# Get the first item of thread
first_item = item.find('div', class_='timeline-item')
timeline.append(first_item)
# Get the rest of the items of the thread
thread_link_tag = item.find('a', class_='tweet-link')
if thread_link_tag is not None:
thread_url = thread_link_tag.get('href')
timeline.extend(_get_rest_of_thread(session, headers, nitter_url + thread_url))
else:
# Ignore other classes
continue
return soup, timeline
@ -615,6 +678,7 @@ def process_attachments(nitter_url, attachments_container, status_id, author_acc
vid_class = attachments_container.find('div', class_='video-container')
if vid_class is not None:
if TOML['options']['upload_videos']:
logging.debug("downloading video from twitter")
import youtube_dl
video_path = f"{author_account}/status/{status_id}"
@ -899,12 +963,7 @@ def main(argv):
# To store content of all tweets from this user
tweets = []
url = nitter_url + '/' + TOML['config']['twitter_account']
# Use different page if we need to handle replies
if TOML['options']['post_reply_to']:
url += '/with_replies'
soup, timeline = get_timeline(url)
soup, timeline = get_timeline(nitter_url)
logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')
@ -916,13 +975,8 @@ def main(argv):
in_db_cnt = 0
for status in timeline:
# Extract tweet ID and status ID
try:
tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
status_id = tweet_id.split('/')[3]
except Exception as e:
logging.critical('Malformed timeline downloaded from nitter instance')
logging.debug(e)
shutdown(-1)
tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
status_id = tweet_id.split('/')[3]
logging.debug('processing tweet %s', tweet_id)