2025-02-25 09:28:43 +00:00
1 changed files with 25 additions and 79 deletions
--- a/twoot.py
+++ b/twoot.py
@ -192,60 +192,7 @@ def build_config(args):
        exit(-1)
-"""
+def get_timeline(url):
 Download page with full thread of tweets and extract all replied-to tweets referenced by url.
 Only used by `get_timeline()`.
 :param session: Existing HTTP session with Nitter instance
 :param headers: HTTP headers to use
 :param url: url of the thread page to download
 :return: List of tweets from the thread
 """
 def _get_rest_of_thread(session, headers, url):
    """
    Download a thread page and return the tweets that follow the main tweet.
    :param session: HTTP session object; only its `get()` method is used here
    :param headers: HTTP headers passed to `session.get()`
    :param url: url of the thread page to download
    :return: List of `timeline-item` divs found inside the `after-tweet` div,
             or an empty list if the page contains no `after-tweet` div
    """
    logging.debug("Downloading tweets in thread from separate page")
    # Download page with thread
    try:
        thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(url + ' took too long to respond')
        shutdown(-1)
    # Verify that download worked
    if thread_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(thread_page.status_code) + '). Aborting')
        shutdown(-1)
    logging.debug('Nitter page downloaded successfully from ' + url)
    # DEBUG: Save page to file
    # of = open('thread_page_debug.html', 'w')
    # of.write(thread_page.text)
    # of.close()
    # Make soup
    soup = BeautifulSoup(thread_page.text, 'html.parser')
    # Get all items in thread after main tweet
    after_tweet = soup.find('div', 'after-tweet')
    if after_tweet is None:
        # find() returns None when nothing matches; treat that as
        # "no tweets after the main one" instead of raising AttributeError
        logging.debug('No after-tweet section found in ' + url)
        return []
    return after_tweet.find_all('div', class_='timeline-item')
 """
 Download page with full thread of tweets. Only used by `get_timeline()`.
 :param url: url of the thread page to download
 :return: List of tweets from the thread
 """
 def get_timeline(nitter_url):
    # Define url to use
    url = nitter_url + '/' + TOML['config']['twitter_account']
    # Use different page if we need to handle replies
    if TOML['options']['post_reply_to']:
        url += '/with_replies'
    # Initiate session
    session = requests.Session()
@ -279,37 +226,27 @@ def get_timeline(nitter_url):
    logging.debug('Nitter page downloaded successfully from ' + url)
    # DEBUG: Save page to file
-    # of = open('user_page_debug.html', 'w')
+    # of = open(TOML['config']['twitter_account'] + '.html', 'w')
    # of.write(twit_account_page.text)
    # of.close()
    # Make soup
    soup = BeautifulSoup(twit_account_page.text, 'html.parser')
-    # Get the div containing tweets
+    # Extract twitter timeline
    tl = soup.find('div', class_='timeline')
    # Get the list of direct children of timeline
    list = tl.find_all('div', recursive=False)
    timeline = []
    # Get all the items from the timeline
    list = soup.find_all('div', class_='timeline-item')
    for item in list:
        classes = item['class']
-        if 'timeline-item' in classes:
+        if 'more-replies-thread' in classes:
-            timeline.append(item)
+            logging.debug('found a more-replies-thread item')
        elif 'thread-line' in classes:
            # Get the first item of thread
            first_item = item.find('div', class_='timeline-item')
            timeline.append(first_item)
            # Get the rest of the items of the thread
            thread_link_tag = item.find('a', class_='tweet-link')
            if thread_link_tag is not None:
                thread_url = thread_link_tag.get('href')
            timeline.extend(_get_rest_of_thread(session, headers, nitter_url + thread_url))
        else:
-            # Ignore other classes
+            timeline.append(item)
-            continue
+
    return soup, timeline
@ -678,7 +615,6 @@ def process_attachments(nitter_url, attachments_container, status_id, author_acc
    vid_class = attachments_container.find('div', class_='video-container')
    if vid_class is not None:
        if TOML['options']['upload_videos']:
            logging.debug("downloading video from twitter")
            import youtube_dl
            video_path = f"{author_account}/status/{status_id}"
@ -963,7 +899,12 @@ def main(argv):
    # To store content of all tweets from this user
    tweets = []
-    soup, timeline = get_timeline(nitter_url)
+    url = nitter_url + '/' + TOML['config']['twitter_account']
    # Use different page if we need to handle replies
    if TOML['options']['post_reply_to']:
        url += '/with_replies'
    soup, timeline = get_timeline(url)
    logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')
@ -975,8 +916,13 @@ def main(argv):
    in_db_cnt = 0
    for status in timeline:
        # Extract tweet ID and status ID
        try:
            tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
            status_id = tweet_id.split('/')[3]
        except Exception as e:
            logging.critical('Malformed timeline downloaded from nitter instance')
            logging.debug(e)
            shutdown(-1)
        logging.debug('processing tweet %s', tweet_id)