WIP twitter changes

This commit is contained in:
JC Francois 2020-02-14 07:58:39 +01:00
parent 446f39f173
commit 9dbf40bb5d

View File

@@ -64,10 +64,8 @@ def cleanup_tweet_text(tt_iter):
elif tc == 'twitter-atreply': elif tc == 'twitter-atreply':
tweet_text += tag.get_text() tweet_text += tag.get_text()
# If element is a link # If element is an external link
elif tc == 'twitter-timeline-link': elif tc == 'twitter_external_link':
# If it is not a link to some embedded content, keep raw link
if not tag.has_attr('data-pre-embedded') and tag.has_attr('data-expanded-url'):
# Add a sometimes missing space before url # Add a sometimes missing space before url
if not tweet_text.endswith(' ') and not tweet_text.endswith('\n'): if not tweet_text.endswith(' ') and not tweet_text.endswith('\n'):
tweet_text += ' ' tweet_text += ' '
@@ -181,35 +179,33 @@ def main(argv):
'This is not the correct twitter page. Quitting' 'This is not the correct twitter page. Quitting'
# Extract twitter timeline # Extract twitter timeline
results = soup.find_all('div', class_='content') results = soup.find_all('table', class_='tweet')
for result in results: for result in results:
# Isolate tweet header # Isolate tweet header
sih = result.find('div', class_='stream-item-header') sih = result.find('tr', class_='tweet-header')
# extract author # extract author
author = sih.find('strong', class_='fullname').get_text() author = sih.find('strong', class_='fullname').get_text()
# Extract author's logo # Extract author's logo
author_logo_url = sih.find('img', class_='avatar')['src'] author_logo_url = sih.find('img', alt=author)['src']
# Extract time stamp # TODO: Extract time stamp by following link under td.timestamp
try: import datetime
timestamp = sih.find('a', class_='tweet-timestamp').find('span', class_='_timestamp')['data-time'] timestamp = datetime.datetime.now().timestamp()
except AttributeError:
continue
# Extract tweet id
tweet_id = sih.find('a', class_='tweet-timestamp')['href']
# Extract user name # Extract user name
author_account = re.search('^/(.+?)/', tweet_id).group(1) author_account = str(sih.find('div', class_='username').span.next_sibling).strip('\n ')
# Isolate tweet text container # Isolate tweet text container
ttc = result.find('div', class_='js-tweet-text-container') ttc = result.find('tr', class_='tweet-container')
# Extract tweet id
tweet_id = ttc.find('div', class_='tweet-text')['data-id']
# extract iterator over tweet text contents # extract iterator over tweet text contents
tt_iter = ttc.find('p', class_='tweet-text').children tt_iter = ttc.find('div', class_='dir-ltr').children
tweet_text = cleanup_tweet_text(tt_iter) tweet_text = cleanup_tweet_text(tt_iter)