Simplified process_media_body and process_attachments

This commit is contained in:
jeancf 2022-11-23 14:49:06 +01:00
parent 294bf1fae1
commit b6315f193c

View File

@ -173,13 +173,11 @@ def clean_url(dirty_url):
return cleaned_url return cleaned_url
def process_media_body(tt_iter, remove_redir, remove_trackers): def process_media_body(tt_iter):
""" """
Receives an iterator over all the elements contained in the tweet-text container. Receives an iterator over all the elements contained in the tweet-text container.
Processes them to make them suitable for posting on Mastodon Processes them to make them suitable for posting on Mastodon
:param tt_iter: iterator over the HTML elements in the text of the tweet :param tt_iter: iterator over the HTML elements in the text of the tweet
:param remove_redir: bool to indicate if redirections should be removed
:param remove_trackers: bool to indicate if trackers should be removed
:return: cleaned up text of the tweet :return: cleaned up text of the tweet
""" """
tweet_text = '' tweet_text = ''
@ -200,12 +198,12 @@ def process_media_body(tt_iter, remove_redir, remove_trackers):
tweet_text += tag_text tweet_text += tag_text
else: else:
# This is a real link # This is a real link
if remove_redir: if TOML['options']['remove_link_redirections']:
url = deredir_url(tag.get('href')) url = deredir_url(tag.get('href'))
else: else:
url = tag.get('href') url = tag.get('href')
if remove_trackers: if TOML['options']['remove_trackers_from_urls']:
tweet_text += clean_url(url) tweet_text += clean_url(url)
else: else:
tweet_text += url tweet_text += url
@ -232,12 +230,11 @@ def process_card(nitter_url, card_container):
return list return list
def process_attachments(nitter_url, attachments_container, get_vids, twit_account, status_id, author_account): def process_attachments(nitter_url, attachments_container, status_id, author_account):
""" """
Extract images or video from attachments. Videos are downloaded on the file system. Extract images or video from attachments. Videos are downloaded on the file system.
:param nitter_url: url of nitter mirror :param nitter_url: url of nitter mirror
:param attachments_container: soup of 'div' tag containing attachments markup :param attachments_container: soup of 'div' tag containing attachments markup
:param get_vids: whether to download videos or not
:param twit_account: name of twitter account :param twit_account: name of twitter account
:param status_id: id of tweet being processed :param status_id: id of tweet being processed
:param author_account: author of tweet with video attachment :param author_account: author of tweet with video attachment
@ -256,7 +253,7 @@ def process_attachments(nitter_url, attachments_container, get_vids, twit_accoun
if gif_class is not None: if gif_class is not None:
gif_video_file = nitter_url + gif_class.source.get('src') gif_video_file = nitter_url + gif_class.source.get('src')
video_path = os.path.join('output', twit_account, status_id, author_account, status_id) video_path = os.path.join('output', TOML['config']['twitter_account'], status_id, author_account, status_id)
os.makedirs(video_path, exist_ok=True) os.makedirs(video_path, exist_ok=True)
# Open directory for writing file # Open directory for writing file
@ -283,12 +280,12 @@ def process_attachments(nitter_url, attachments_container, get_vids, twit_accoun
vid_in_tweet = False vid_in_tweet = False
vid_class = attachments_container.find('div', class_='video-container') vid_class = attachments_container.find('div', class_='video-container')
if vid_class is not None: if vid_class is not None:
if get_vids: if TOML['options']['upload_videos']:
import youtube_dl import youtube_dl
video_file = os.path.join('https://twitter.com', author_account, 'status', status_id) video_file = os.path.join('https://twitter.com', author_account, 'status', status_id)
ydl_opts = { ydl_opts = {
'outtmpl': "output/" + twit_account + "/" + status_id + "/%(id)s.%(ext)s", 'outtmpl': "output/" + TOML['config']['twitter_account'] + "/" + status_id + "/%(id)s.%(ext)s",
'format': "best[width<=500]", 'format': "best[width<=500]",
'socket_timeout': 60, 'socket_timeout': 60,
'quiet': True, 'quiet': True,
@ -653,10 +650,7 @@ def main(argv):
tt_iter = status.find('div', class_='tweet-content media-body').children tt_iter = status.find('div', class_='tweet-content media-body').children
# Process text of tweet # Process text of tweet
tweet_text += process_media_body(tt_iter, tweet_text += process_media_body(tt_iter)
TOML['options']['remove_link_redirections'],
TOML['options']['remove_trackers_from_urls']
)
# Process quote: append link to tweet_text # Process quote: append link to tweet_text
quote_div = status.find('a', class_='quote-link') quote_div = status.find('a', class_='quote-link')
@ -673,8 +667,6 @@ def main(argv):
if attachments_class is not None: if attachments_class is not None:
pics, vid_in_tweet = process_attachments(nitter_url, pics, vid_in_tweet = process_attachments(nitter_url,
attachments_class, attachments_class,
TOML['options']['upload_videos'],
TOML['config']['twitter_account'],
status_id, author_account status_id, author_account
) )
photos.extend(pics) photos.extend(pics)