mirror of
https://gitlab.com/jeancf/twoot.git
synced 2025-02-25 09:28:43 +00:00
Compare commits
No commits in common. "a4f3934d8654c32869525a48cbb4d0ae55c8b753" and "f8bd948b9c4eb9310fd9807d90df039990c62580" have entirely different histories.
a4f3934d86
...
f8bd948b9c
100
twoot.py
100
twoot.py
@ -192,60 +192,7 @@ def build_config(args):
|
|||||||
exit(-1)
|
exit(-1)
|
||||||
|
|
||||||
|
|
||||||
"""
|
def get_timeline(url):
|
||||||
Dowload page with full thread of tweets and extract all replied to tweet reference by url.
|
|
||||||
Only used by `get_timeline()`.
|
|
||||||
:param session: Existing HTTP session with Nitter instance
|
|
||||||
:param headers: HTTP headers to use
|
|
||||||
:param url: url of the thread page to download
|
|
||||||
:return: List of tweets from the thread
|
|
||||||
"""
|
|
||||||
def _get_rest_of_thread(session, headers, url):
|
|
||||||
logging.debug("Downloading tweets in thread from separate page")
|
|
||||||
# Download page with thread
|
|
||||||
try:
|
|
||||||
thread_page = session.get(url, headers=headers, timeout=HTTPS_REQ_TIMEOUT)
|
|
||||||
except requests.exceptions.ConnectionError:
|
|
||||||
logging.fatal('Host did not respond when trying to download ' + url)
|
|
||||||
shutdown(-1)
|
|
||||||
except requests.exceptions.Timeout:
|
|
||||||
logging.fatal(url + ' took too long to respond')
|
|
||||||
shutdown(-1)
|
|
||||||
|
|
||||||
# Verify that download worked
|
|
||||||
if thread_page.status_code != 200:
|
|
||||||
logging.fatal('The Nitter page did not download correctly from ' + url + ' (' + str(thread_page.status_code) + '). Aborting')
|
|
||||||
shutdown(-1)
|
|
||||||
|
|
||||||
logging.debug('Nitter page downloaded successfully from ' + url)
|
|
||||||
|
|
||||||
# DEBUG: Save page to file
|
|
||||||
# of = open('thread_page_debug.html', 'w')
|
|
||||||
# of.write(twit_account_page.text)
|
|
||||||
# of.close()
|
|
||||||
|
|
||||||
# Make soup
|
|
||||||
soup = BeautifulSoup(thread_page.text, 'html.parser')
|
|
||||||
|
|
||||||
# Get all items in thread after main tweet
|
|
||||||
after_tweet = soup.find('div', 'after-tweet')
|
|
||||||
|
|
||||||
timeline = after_tweet.find_all('div', class_='timeline-item')
|
|
||||||
return timeline
|
|
||||||
|
|
||||||
"""
|
|
||||||
Dowload page with full thread of tweets. Only used by `get_timeline()`.
|
|
||||||
:param url: url of the thread page to download
|
|
||||||
:return: List of tweets from the thread
|
|
||||||
"""
|
|
||||||
def get_timeline(nitter_url):
|
|
||||||
# Define url to use
|
|
||||||
url = nitter_url + '/' + TOML['config']['twitter_account']
|
|
||||||
|
|
||||||
# Use different page if we need to handle replies
|
|
||||||
if TOML['options']['post_reply_to']:
|
|
||||||
url += '/with_replies'
|
|
||||||
|
|
||||||
# Initiate session
|
# Initiate session
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
|
|
||||||
@ -279,37 +226,27 @@ def get_timeline(nitter_url):
|
|||||||
logging.debug('Nitter page downloaded successfully from ' + url)
|
logging.debug('Nitter page downloaded successfully from ' + url)
|
||||||
|
|
||||||
# DEBUG: Save page to file
|
# DEBUG: Save page to file
|
||||||
# of = open('user_page_debug.html', 'w')
|
# of = open(TOML['config']['twitter_account'] + '.html', 'w')
|
||||||
# of.write(twit_account_page.text)
|
# of.write(twit_account_page.text)
|
||||||
# of.close()
|
# of.close()
|
||||||
|
|
||||||
# Make soup
|
# Make soup
|
||||||
soup = BeautifulSoup(twit_account_page.text, 'html.parser')
|
soup = BeautifulSoup(twit_account_page.text, 'html.parser')
|
||||||
|
|
||||||
# Get the div containing tweets
|
# Extract twitter timeline
|
||||||
tl = soup.find('div', class_='timeline')
|
|
||||||
|
|
||||||
# Get the list of direct children of timeline
|
|
||||||
list = tl.find_all('div', recursive=False)
|
|
||||||
|
|
||||||
timeline = []
|
timeline = []
|
||||||
|
|
||||||
|
# Get all the items from the timeline
|
||||||
|
list = soup.find_all('div', class_='timeline-item')
|
||||||
|
|
||||||
for item in list:
|
for item in list:
|
||||||
classes = item['class']
|
classes = item['class']
|
||||||
if 'timeline-item' in classes:
|
if 'more-replies-thread' in classes:
|
||||||
timeline.append(item)
|
logging.debug('found a more-replies-thread item')
|
||||||
elif 'thread-line' in classes:
|
|
||||||
# Get the first item of thread
|
|
||||||
first_item = item.find('div', class_='timeline-item')
|
|
||||||
timeline.append(first_item)
|
|
||||||
|
|
||||||
# Get the rest of the items of the thread
|
|
||||||
thread_link_tag = item.find('a', class_='tweet-link')
|
|
||||||
if thread_link_tag is not None:
|
|
||||||
thread_url = thread_link_tag.get('href')
|
|
||||||
timeline.extend(_get_rest_of_thread(session, headers, nitter_url + thread_url))
|
|
||||||
else:
|
else:
|
||||||
# Ignore other classes
|
timeline.append(item)
|
||||||
continue
|
|
||||||
|
|
||||||
return soup, timeline
|
return soup, timeline
|
||||||
|
|
||||||
|
|
||||||
@ -678,7 +615,6 @@ def process_attachments(nitter_url, attachments_container, status_id, author_acc
|
|||||||
vid_class = attachments_container.find('div', class_='video-container')
|
vid_class = attachments_container.find('div', class_='video-container')
|
||||||
if vid_class is not None:
|
if vid_class is not None:
|
||||||
if TOML['options']['upload_videos']:
|
if TOML['options']['upload_videos']:
|
||||||
logging.debug("downloading video from twitter")
|
|
||||||
import youtube_dl
|
import youtube_dl
|
||||||
|
|
||||||
video_path = f"{author_account}/status/{status_id}"
|
video_path = f"{author_account}/status/{status_id}"
|
||||||
@ -963,7 +899,12 @@ def main(argv):
|
|||||||
# To store content of all tweets from this user
|
# To store content of all tweets from this user
|
||||||
tweets = []
|
tweets = []
|
||||||
|
|
||||||
soup, timeline = get_timeline(nitter_url)
|
url = nitter_url + '/' + TOML['config']['twitter_account']
|
||||||
|
# Use different page if we need to handle replies
|
||||||
|
if TOML['options']['post_reply_to']:
|
||||||
|
url += '/with_replies'
|
||||||
|
|
||||||
|
soup, timeline = get_timeline(url)
|
||||||
|
|
||||||
logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')
|
logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')
|
||||||
|
|
||||||
@ -975,8 +916,13 @@ def main(argv):
|
|||||||
in_db_cnt = 0
|
in_db_cnt = 0
|
||||||
for status in timeline:
|
for status in timeline:
|
||||||
# Extract tweet ID and status ID
|
# Extract tweet ID and status ID
|
||||||
|
try:
|
||||||
tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
|
tweet_id = status.find('a', class_='tweet-link').get('href').strip('#m')
|
||||||
status_id = tweet_id.split('/')[3]
|
status_id = tweet_id.split('/')[3]
|
||||||
|
except Exception as e:
|
||||||
|
logging.critical('Malformed timeline downloaded from nitter instance')
|
||||||
|
logging.debug(e)
|
||||||
|
shutdown(-1)
|
||||||
|
|
||||||
logging.debug('processing tweet %s', tweet_id)
|
logging.debug('processing tweet %s', tweet_id)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user