Add Exclusion of thread tweets

This commit is contained in:
jeancf 2023-07-12 14:51:04 +02:00
parent ea12cea20f
commit 530953f48b

View File

@ -72,6 +72,22 @@ USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Vivaldi/6.1.3035.84', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Vivaldi/6.1.3035.84',
] ]
"""
Temporary mitigation for the inability to parse threads: skip tweets that are part of a thread.
"""
def has_class_timeline_item_but_not_thread(tag):
    """Tell whether *tag* is a timeline item that is not part of a thread.

    Written as a filter predicate: it receives one parsed element at a time
    and answers whether that element should be kept.

    :param tag: element exposing ``has_attr(name)`` and ``tag['class']``
                (the collection of CSS class names on the element)
    :return: True only if the class names include 'timeline-item' and do
             not include 'thread'; False in every other case
    """
    # Elements without a class attribute can never be timeline items
    if not tag.has_attr('class'):
        return False

    classes = tag['class']
    if 'timeline-item' not in classes:
        return False

    if 'thread' in classes:
        # Thread items are deliberately dropped; log it so the omission is visible
        logging.warning('Tweet is part of a thread which are a new nitter feature that is not handled yet. Skipping')
        return False

    return True
def build_config(args): def build_config(args):
""" """
@ -873,7 +889,7 @@ def main(argv):
soup = BeautifulSoup(twit_account_page.text, 'html.parser') soup = BeautifulSoup(twit_account_page.text, 'html.parser')
# Extract twitter timeline # Extract twitter timeline
timeline = soup.find_all('div', class_='timeline-item') timeline = soup.find_all(has_class_timeline_item_but_not_thread)
logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline') logging.info('Processing ' + str(len(timeline)) + ' tweets found in timeline')