Compare commits

...

4 Commits

Author SHA1 Message Date
jeancf
a4f3934d86 Fix indentation 2023-07-13 15:44:37 +02:00
jeancf
29c7457644 Add some log messages 2023-07-13 13:32:38 +02:00
jeancf
cdbb1bb8f2 Fine tune thread download 2023-07-13 11:53:07 +02:00
jeancf
5939484160 Complete get_timeline() 2023-07-13 11:36:04 +02:00

102
twoot.py
View File

@ -192,7 +192,60 @@ def build_config(args):
exit(-1)
def get_timeline(url):
"""
Download the page with the full thread of tweets and extract all replied-to tweets referenced by url.
Only used by `get_timeline()`.
:param session: Existing HTTP session with Nitter instance
:param headers: HTTP headers to use
:param url: url of the thread page to download
:return: List of tweets from the thread
"""
def _get_rest_of_thread(session, headers, url):
    """
    Download the page containing the full thread of tweets and return the
    timeline items that come after the main tweet. Only used by `get_timeline()`.
    :param session: Existing HTTP session with Nitter instance
    :param headers: HTTP headers to use
    :param url: url of the thread page to download
    :return: List of tweet elements (bs4 Tags) from the thread
    """
    logging.debug("Downloading tweets in thread from separate page")
    # Download page with thread; a network failure is fatal for the run
    try:
        thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
    except requests.exceptions.ConnectionError:
        logging.fatal('Host did not respond when trying to download ' + url)
        shutdown(-1)
    except requests.exceptions.Timeout:
        logging.fatal(url + ' took too long to respond')
        shutdown(-1)

    # Verify that download worked
    if thread_page.status_code != 200:
        logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(thread_page.status_code) + '). Aborting')
        shutdown(-1)

    logging.debug('Nitter page downloaded successfully from ' + url)

    # Make soup
    soup = BeautifulSoup(thread_page.text, 'html.parser')

    # Get all items in thread after main tweet.
    # Guard against an unexpected page layout: find() returns None when the
    # 'after-tweet' section is absent, which would otherwise crash on find_all().
    after_tweet = soup.find('div', 'after-tweet')
    if after_tweet is None:
        logging.warning('No after-tweet section found in ' + url + ', skipping thread')
        return []
    return after_tweet.find_all('div', class_='timeline-item')
"""
Download page with full thread of tweets. Only used by `get_timeline()`.
:param url: url of the thread page to download
:return: List of tweets from the thread
"""
def get_timeline(nitter_url):
# Define url to use
url = nitter_url + '/' + TOML['config']['twitter_account']
# Use different page if we need to handle replies
if TOML['options']['post_reply_to']:
url += '/with_replies'
# Initiate session
session = requests.Session()
@ -226,27 +279,37 @@ def get_timeline(url):
logging.debug('Nitter page downloaded successfully from ' + url)
# DEBUG: Save page to file
# of = open(TOML['config']['twitter_account'] + '.html', 'w')
# of = open('user_page_debug.html', 'w')
# of.write(twit_account_page.text)
# of.close()
# Make soup
soup = BeautifulSoup(twit_account_page.text, 'html.parser')
# Extract twitter timeline
# Get the div containing tweets
tl = soup.find('div', class_='timeline')
# Get the list of direct children of timeline
list = tl.find_all('div', recursive=False)
timeline = []
# Get all the items from the timeline
list = soup.find_all('div', class_='timeline-item')
for item in list:
classes = item['class']
if 'more-replies-thread' in classes:
logging.debug('found a more-replies-thread item')
else:
if 'timeline-item' in classes:
timeline.append(item)
elif 'thread-line' in classes:
# Get the first item of thread
first_item = item.find('div', class_='timeline-item')
timeline.append(first_item)
# Get the rest of the items of the thread
thread_link_tag = item.find('a', class_='tweet-link')
if thread_link_tag is not None:
thread_url = thread_link_tag.get('href')
timeline.extend(_get_rest_of_thread(session, headers, nitter_url + thread_url))
else:
# Ignore other classes
continue
return soup, timeline
@ -615,6 +678,7 @@ def process_attachments(nitter_url, attachments_container, status_id, author_acc
vid_class = attachments_container.find('div', class_='video-container')
if vid_class is not None:
if TOML['options']['upload_videos']:
logging.debug("downloading video from twitter")
import youtube_dl
video_path = f"{author_account}/status/{status_id}"
@ -899,12 +963,7 @@ def main(argv):
# To store content of all tweets from this user
tweets = []
url = nitter_url + '/' + TOML['config']['twitter_account']
# Use different page if we need to handle replies
if TOML['options']['post_reply_to']:
url += '/with_replies'
soup, timeline = get_timeline(url)
soup, timeline = get_timeline(nitter_url)
logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')
@ -916,13 +975,8 @@ def main(argv):
in_db_cnt = 0
for status in timeline:
# Extract tweet ID and status ID
try:
tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
status_id = tweet_id.split('/')[3]
except Exception as e:
logging.critical('Malformed timeline downloaded from nitter instance')
logging.debug(e)
shutdown(-1)
tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
status_id = tweet_id.split('/')[3]
logging.debug('processing tweet %s', tweet_id)