Refactored process_media_body()

This commit is contained in:
jeancf 2022-11-27 18:30:39 +01:00
parent 8aa9bb9ae0
commit 27ed4cf10e

View File

@ -74,6 +74,10 @@ def deredir_url(url):
:return: direct url :return: direct url
""" """
# Check if we need to do anything
if TOML['options']['remove_link_redirections'] is False:
return url
# Get a copy of the default headers that requests would use # Get a copy of the default headers that requests would use
headers = requests.utils.default_headers() headers = requests.utils.default_headers()
@ -187,7 +191,7 @@ def substitute_source(orig_url):
return dest_url return dest_url
def clean_url(dirty_url): def clean_url(orig_url):
""" """
Given a URL, return it with the UTM parameters removed from query and fragment Given a URL, return it with the UTM parameters removed from query and fragment
:param dirty_url: url to be cleaned :param orig_url: url to be cleaned
@ -196,20 +200,24 @@ def clean_url(dirty_url):
'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok' 'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
""" """
url_parsed = urlparse(dirty_url) # Check if we have to do anything
if TOML['options']['remove_trackers_from_urls'] is False:
return orig_url
cleaned_url = urlunparse([ url_parsed = urlparse(orig_url)
dest_url = urlunparse([
url_parsed.scheme, url_parsed.scheme,
_substitute_source(url_parsed.netloc), url_parsed.netloc,
url_parsed.path, url_parsed.path,
url_parsed.params, url_parsed.params,
_remove_trackers_query(url_parsed.query), _remove_trackers_query(url_parsed.query),
_remove_trackers_fragment(url_parsed.fragment) _remove_trackers_fragment(url_parsed.fragment)
]) ])
if cleaned_url != dirty_url: if dest_url != orig_url:
logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url) logging.debug('Cleaned URL from: ' + orig_url + ' to: ' + dest_url)
return cleaned_url return dest_url
def process_media_body(tt_iter): def process_media_body(tt_iter):
@ -237,14 +245,10 @@ def process_media_body(tt_iter):
tweet_text += tag_text tweet_text += tag_text
else: else:
# This is a real link # This is a real link
if TOML['options']['remove_link_redirections']:
url = deredir_url(tag.get('href')) url = deredir_url(tag.get('href'))
else: url = substitute_source(url)
url = tag.get('href') url = clean_url(url)
if TOML['options']['remove_trackers_from_urls']:
tweet_text += clean_url(url)
else:
tweet_text += url tweet_text += url
else: else:
logging.warning("No handler for tag in twitter text: " + tag.prettify()) logging.warning("No handler for tag in twitter text: " + tag.prettify())