Refactored process_media_body()

This commit is contained in:
jeancf 2022-11-27 18:30:39 +01:00
parent 8aa9bb9ae0
commit 27ed4cf10e

View File

@ -74,6 +74,10 @@ def deredir_url(url):
:return: direct url
"""
# Check if we need to do anything
if TOML['options']['remove_link_redirections'] is False:
return url
# Get a copy of the default headers that requests would use
headers = requests.utils.default_headers()
@ -187,7 +191,7 @@ def substitute_source(orig_url):
return dest_url
def clean_url(dirty_url):
def clean_url(orig_url):
"""
Given a URL, return it with the UTM parameters removed from query and fragment
:param orig_url: url to be cleaned
@ -196,20 +200,24 @@ def clean_url(dirty_url):
'https://example.com/video/this-aerial-ropeway?a=aaa&b=1#mkt_tik=tok'
"""
url_parsed = urlparse(dirty_url)
# Check if we have to do anything
if TOML['options']['remove_trackers_from_urls'] is False:
return orig_url
cleaned_url = urlunparse([
url_parsed = urlparse(orig_url)
dest_url = urlunparse([
url_parsed.scheme,
_substitute_source(url_parsed.netloc),
url_parsed.netloc,
url_parsed.path,
url_parsed.params,
_remove_trackers_query(url_parsed.query),
_remove_trackers_fragment(url_parsed.fragment)
])
if cleaned_url != dirty_url:
logging.debug('Cleaned URL from: ' + dirty_url + ' to: ' + cleaned_url)
if dest_url != orig_url:
logging.debug('Cleaned URL from: ' + orig_url + ' to: ' + dest_url)
return cleaned_url
return dest_url
def process_media_body(tt_iter):
@ -237,15 +245,11 @@ def process_media_body(tt_iter):
tweet_text += tag_text
else:
# This is a real link
if TOML['options']['remove_link_redirections']:
url = deredir_url(tag.get('href'))
else:
url = tag.get('href')
url = deredir_url(tag.get('href'))
url = substitute_source(url)
url = clean_url(url)
if TOML['options']['remove_trackers_from_urls']:
tweet_text += clean_url(url)
else:
tweet_text += url
tweet_text += url
else:
logging.warning("No handler for tag in twitter text: " + tag.prettify())