mirror of
https://gitlab.com/jeancf/twoot.git
synced 2025-01-31 13:43:46 +00:00
WIP twitter changes
This commit is contained in:
parent
446f39f173
commit
9dbf40bb5d
32
twoot.py
32
twoot.py
|
@ -64,10 +64,8 @@ def cleanup_tweet_text(tt_iter):
|
||||||
elif tc == 'twitter-atreply':
|
elif tc == 'twitter-atreply':
|
||||||
tweet_text += tag.get_text()
|
tweet_text += tag.get_text()
|
||||||
|
|
||||||
# If element is a link
|
# If element is an external link
|
||||||
elif tc == 'twitter-timeline-link':
|
elif tc == 'twitter_external_link':
|
||||||
# If it is not a link to some embedded content, keep raw link
|
|
||||||
if not tag.has_attr('data-pre-embedded') and tag.has_attr('data-expanded-url'):
|
|
||||||
# Add a sometimes missing space before url
|
# Add a sometimes missing space before url
|
||||||
if not tweet_text.endswith(' ') and not tweet_text.endswith('\n'):
|
if not tweet_text.endswith(' ') and not tweet_text.endswith('\n'):
|
||||||
tweet_text += ' '
|
tweet_text += ' '
|
||||||
|
@ -181,35 +179,33 @@ def main(argv):
|
||||||
'This is not the correct twitter page. Quitting'
|
'This is not the correct twitter page. Quitting'
|
||||||
|
|
||||||
# Extract twitter timeline
|
# Extract twitter timeline
|
||||||
results = soup.find_all('div', class_='content')
|
results = soup.find_all('table', class_='tweet')
|
||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
# Isolate tweet header
|
# Isolate tweet header
|
||||||
sih = result.find('div', class_='stream-item-header')
|
sih = result.find('tr', class_='tweet-header')
|
||||||
|
|
||||||
# extract author
|
# extract author
|
||||||
author = sih.find('strong', class_='fullname').get_text()
|
author = sih.find('strong', class_='fullname').get_text()
|
||||||
|
|
||||||
# Extract author's logo
|
# Extract author's logo
|
||||||
author_logo_url = sih.find('img', class_='avatar')['src']
|
author_logo_url = sih.find('img', alt=author)['src']
|
||||||
|
|
||||||
# Extract time stamp
|
# TODO: Extract time stamp by following link under td.timestamp
|
||||||
try:
|
import datetime
|
||||||
timestamp = sih.find('a', class_='tweet-timestamp').find('span', class_='_timestamp')['data-time']
|
timestamp = datetime.datetime.now().timestamp()
|
||||||
except AttributeError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Extract tweet id
|
|
||||||
tweet_id = sih.find('a', class_='tweet-timestamp')['href']
|
|
||||||
|
|
||||||
# Extract user name
|
# Extract user name
|
||||||
author_account = re.search('^/(.+?)/', tweet_id).group(1)
|
author_account = str(sih.find('div', class_='username').span.next_sibling).strip('\n ')
|
||||||
|
|
||||||
# Isolate tweet text container
|
# Isolate tweet text container
|
||||||
ttc = result.find('div', class_='js-tweet-text-container')
|
ttc = result.find('tr', class_='tweet-container')
|
||||||
|
|
||||||
|
# Extract tweet id
|
||||||
|
tweet_id = ttc.find('div', class_='tweet-text')['data-id']
|
||||||
|
|
||||||
# extract iterator over tweet text contents
|
# extract iterator over tweet text contents
|
||||||
tt_iter = ttc.find('p', class_='tweet-text').children
|
tt_iter = ttc.find('div', class_='dir-ltr').children
|
||||||
|
|
||||||
tweet_text = cleanup_tweet_text(tt_iter)
|
tweet_text = cleanup_tweet_text(tt_iter)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user