mirror of
https://gitlab.com/jeancf/twoot.git
synced 2025-02-22 16:22:12 +00:00
Compare commits
4 Commits
8930d5329f
...
9625c2128b
Author | SHA1 | Date | |
---|---|---|---|
|
9625c2128b | ||
|
e11102f4a6 | ||
|
68e4918b02 | ||
|
40d14c4d5d |
40
twoot.py
40
twoot.py
|
@ -66,6 +66,37 @@ USER_AGENTS = [
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Vivaldi/5.4.2753.51',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Vivaldi/5.4.2753.51',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def deredir_url(url):
    """
    Given a URL, return the URL that the page really downloads from.

    The URL is requested with a randomly selected user agent and any
    redirections are followed. This is best-effort: if the request fails
    for any reason, the original URL is returned unchanged.

    :param url: url to be de-redirected
    :return: direct url, or the url passed in if the request failed
    """
    # Get a copy of the default headers that requests would use
    headers = requests.utils.default_headers()

    # Update default headers with randomly selected user agent
    headers.update({'User-Agent': random.choice(USER_AGENTS)})

    try:
        # Only the final URL is needed, so stream=True defers fetching the
        # body: redirections are still followed, but a link to a large file
        # is not downloaded in full.
        ret = requests.get(url, headers=headers, timeout=5, stream=True)
    except Exception as exc:
        # If anything goes wrong keep the URL intact. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        logging.debug("Could not de-redirect %s: %s", url, exc)
        return url

    direct_url = ret.url
    # The body was never consumed, so release the connection explicitly
    ret.close()

    if direct_url != url:
        logging.debug("Removed redirection from: %s to: %s", url, direct_url)

    # Return the URL that the page was downloaded from
    return direct_url
|
||||||
|
|
||||||
|
|
||||||
def _remove_trackers_query(query_str):
|
def _remove_trackers_query(query_str):
|
||||||
"""
|
"""
|
||||||
|
@ -158,11 +189,12 @@ def process_media_body(tt_iter, remove_trackers):
|
||||||
# Only keep hashtag text
|
# Only keep hashtag text
|
||||||
tweet_text += tag_text
|
tweet_text += tag_text
|
||||||
else:
|
else:
|
||||||
# This is a real link, keep url
|
# This is a real link
|
||||||
|
url = deredir_url(tag.get('href'))
|
||||||
if remove_trackers:
|
if remove_trackers:
|
||||||
tweet_text += clean_url(tag.get('href'))
|
tweet_text += clean_url(url)
|
||||||
else:
|
else:
|
||||||
tweet_text += tag.get('href')
|
tweet_text += url
|
||||||
else:
|
else:
|
||||||
logging.warning("No handler for tag in twitter text: " + tag.prettify())
|
logging.warning("No handler for tag in twitter text: " + tag.prettify())
|
||||||
|
|
||||||
|
@ -426,7 +458,7 @@ def main(argv):
|
||||||
if tweets_and_replies:
|
if tweets_and_replies:
|
||||||
url += '/with_replies'
|
url += '/with_replies'
|
||||||
|
|
||||||
# Download twitter page of user.
|
# Download twitter page of user
|
||||||
try:
|
try:
|
||||||
twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
|
twit_account_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
|
||||||
except requests.exceptions.ConnectionError:
|
except requests.exceptions.ConnectionError:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user