Mirror of https://gitlab.com/jeancf/twoot.git (synced 2025-02-25 01:18:41 +00:00)

Compare commits: f585878d0f ... 497d9f3a20 (5 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 497d9f3a20 | |
| | 4cd49a1de7 | |
| | 9b9c7702f1 | |
| | 3fab787738 | |
| | 9bdcccf713 | |
Changed file: twoot.py (42 changes)
@@ -169,12 +169,14 @@ Dowload page with full thread of tweets and extract all replied to tweet referen
 Only used by `get_timeline()`.
 :param session: Existing HTTP session with Nitter instance
 :param headers: HTTP headers to use
-:param url: url of the thread page to download
-:return: List of tweets from the thread
+:param nitter_url: url of the nitter instance to use
+:param thread_url: url of the first tweet in thread
+:return: list of tuples with url of tweet replied-to (or None) and content of tweet
 """
-def _get_rest_of_thread(session, headers, url):
+def _get_rest_of_thread(session, headers, nitter_url, thread_url):
     logging.debug("Downloading tweets in thread from separate page")
     # Download page with thread
+    url = nitter_url + thread_url
     try:
         thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
     except requests.exceptions.ConnectionError:
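Review note: with the reworked signature the caller no longer pre-joins the URL; the function composes it from the instance URL and the thread path. A minimal sketch of the new call shape, assuming a plain requests session (the user agent, instance URL, thread path, and timeout value here are placeholders, not twoot's actual configuration):

```python
import requests

HTTPS_REQ_TIMEOUT = 10  # placeholder; twoot.py defines its own constant

session = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0'}    # placeholder UA string
nitter_url = 'https://nitter.net'          # hypothetical instance
thread_url = '/jeancf/status/1234567890'   # href as found in a tweet-link tag

# Equivalent of the new body: join instance and thread path, then fetch
url = nitter_url + thread_url
thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
```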
@@ -201,14 +203,25 @@ def _get_rest_of_thread(session, headers, url):

     # Get all items in thread after main tweet
     after_tweet = soup.find('div', 'after-tweet')
-    timeline = after_tweet.find_all('div', class_='timeline-item')
+    list = after_tweet.find_all('div', class_='timeline-item')
+
+    # Build timeline of tuples
+    timeline = []
+    previous_tweet_url = thread_url
+    for item in list:
+        timeline.append((previous_tweet_url, item))
+        # Get the url of the tweet
+        previous_tweet_url = item.find('a', class_='tweet-link')
+        if previous_tweet_url is None:
+            logging.error('Thread tweet is missing link tag')
+
     return timeline


 """
-Dowload page with full thread of tweets. Only used by `get_timeline()`.
-:param url: url of the thread page to download
-:return: List of tweets from the thread
+Download timeline of twitter account
+:param url: url of the account page to download
+:return: list of tuples with url of tweet replied-to (or None) and content of tweet
 """
 def get_timeline(nitter_url):
     # Define url to use
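Review note: the new loop pairs every thread item with the URL of the tweet that precedes it, seeded with `thread_url`. As committed, `previous_tweet_url` is reassigned to the `<a class="tweet-link">` Tag itself rather than its `href` (and `list` shadows the builtin). The sketch below shows the chaining idea on toy markup with the href extracted explicitly; the markup and the `.get('href')` step are assumptions, not the commit's code:

```python
from bs4 import BeautifulSoup

html = """
<div class="after-tweet">
  <div class="timeline-item"><a class="tweet-link" href="/u/status/2#m"></a>reply 1</div>
  <div class="timeline-item"><a class="tweet-link" href="/u/status/3#m"></a>reply 2</div>
</div>
"""  # toy markup modeled on Nitter's classes, not real output

soup = BeautifulSoup(html, 'html.parser')
after_tweet = soup.find('div', 'after-tweet')

timeline = []
previous_tweet_url = '/u/status/1'  # url of the tweet that opened the thread
for item in after_tweet.find_all('div', class_='timeline-item'):
    timeline.append((previous_tweet_url, item))
    link = item.find('a', class_='tweet-link')
    if link is None:
        continue  # the commit logs an error here instead
    previous_tweet_url = link.get('href')  # keep the href, not the Tag itself

for reply_to, item in timeline:
    print(reply_to, item.get_text(strip=True))
# /u/status/1 reply 1
# /u/status/2#m reply 2
```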
@@ -268,17 +281,20 @@ def get_timeline(nitter_url):
     for item in list:
         classes = item['class']
         if 'timeline-item' in classes:  # Individual tweet
-            timeline.append(item)
+            timeline.append((None, item))
         elif 'thread-line' in classes:  # First tweet of a thread
             # Get the first item of thread
             first_item = item.find('div', class_='timeline-item')
-            timeline.append(first_item)

-            # Get the rest of the items of the thread
+            # Get the url of the tweet
             thread_link_tag = item.find('a', class_='tweet-link')
             if thread_link_tag is not None:
                 thread_url = thread_link_tag.get('href')
-                timeline.extend(_get_rest_of_thread(session, headers, nitter_url + thread_url))
+                timeline.append((thread_url, first_item))
+
+                # Get the rest of the items of the thread
+                timeline.extend(_get_rest_of_thread(session, headers, nitter_url, thread_url))
         else:
             # Ignore other classes
             continue
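With both branches appending pairs, every entry `get_timeline()` returns now has the same shape: `(None, item)` for a standalone tweet, `(thread_url, first_item)` for a thread opener, and the chained pairs from `_get_rest_of_thread()` after it. A hedged illustration of the shape a consumer can rely on (the URLs and placeholder items are invented):

```python
# Illustrative shapes only: strings stand in for BeautifulSoup Tags here.
plain_item, first_item, reply_item = 'tweet A', 'tweet B', 'tweet C'

timeline = [
    (None, plain_item),             # standalone tweet: no reply-to reference
    ('/u/status/1#m', first_item),  # thread opener, paired with the thread url
    ('/u/status/1#m', reply_item),  # thread reply, url of the tweet it follows
]

# main() unpacks exactly this shape (see the @@ -944,7 hunk below)
for reply_to, status in timeline:
    print(reply_to, status)
```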
@@ -877,7 +893,6 @@ def main(argv):
         log_level = logging.INFO
     elif ll_str == "WARNING":
         log_level = logging.WARNING
-        print('log level warning set')
     elif ll_str == "ERROR":
         log_level = logging.ERROR
     elif ll_str == "CRITICAL":
@@ -944,7 +959,7 @@ def main(argv):
     tweets = []
     out_date_cnt = 0
     in_db_cnt = 0
-    for status in timeline:
+    for reply_to, status in timeline:
         # Extract tweet ID and status ID
         tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
         status_id = tweet_id.split('/')[3]
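Aside on the unchanged context line: `str.strip('#m')` removes any run of the characters `#` and `m` from both ends rather than the literal suffix, which is safe here only because Nitter hrefs start with `/`. A quick check of the extraction as written:

```python
href = '/jeancf/status/1234567890#m'
tweet_id = href.strip('#m')        # strips the chars '#' and 'm', not the substring
print(tweet_id)                    # /jeancf/status/1234567890
status_id = tweet_id.split('/')[3]
print(status_id)                   # 1234567890
```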
@@ -1107,6 +1122,7 @@ def main(argv):
         "tweet_text": tweet_text,
         "video": video_file,
         "photos": photos,
+        "reply-to": reply_to,
     }
     tweets.append(tweet)

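The unpacked `reply_to` value ends up as a new field in the per-tweet record. A trimmed sketch of the resulting dict, with hypothetical sample values and most of the other keys twoot builds elided:

```python
tweets = []
tweet = {
    "tweet_text": "Example tweet text",  # hypothetical sample values throughout
    "video": None,
    "photos": [],
    "reply-to": "/jeancf/status/1#m",    # None when the tweet is not part of a thread
}
tweets.append(tweet)
```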