WIP twitter changes

2025-01-31 05:33:45 +00:00 · 2020-02-14 07:58:39 +01:00 · 2020-02-14 07:58:39 +01:00 · 9dbf40bb5d
commit 9dbf40bb5d
parent 446f39f173
1 changed file with 19 additions and 23 deletions
--- a/twoot.py
+++ b/twoot.py
@@ -64,10 +64,8 @@ def cleanup_tweet_text(tt_iter):
                elif tc == 'twitter-atreply':
                    tweet_text += tag.get_text()

-                # If element is a link
-                elif tc == 'twitter-timeline-link':
-                    # If it is not a link to some embedded content, keep raw link
-                    if not tag.has_attr('data-pre-embedded') and tag.has_attr('data-expanded-url'):
+                # If element is an external link
+                elif tc == 'twitter_external_link':
                    # Add a sometimes missing space before url
                    if not tweet_text.endswith(' ') and not tweet_text.endswith('\n'):
                        tweet_text += ' '
@@ -181,35 +179,33 @@ def main(argv):
        'This is not the correct twitter page. Quitting'

    # Extract twitter timeline
-    results = soup.find_all('div', class_='content')
+    results = soup.find_all('table', class_='tweet')

    for result in results:
        # Isolate tweet header
-        sih = result.find('div', class_='stream-item-header')
+        sih = result.find('tr', class_='tweet-header')

        # extract author
        author = sih.find('strong', class_='fullname').get_text()

        # Extract author's logo
-        author_logo_url = sih.find('img', class_='avatar')['src']
+        author_logo_url = sih.find('img', alt=author)['src']

-        # Extract time stamp
-        try:
-            timestamp = sih.find('a', class_='tweet-timestamp').find('span', class_='_timestamp')['data-time']
-        except AttributeError:
-            continue
-
-        # Extract tweet id
-        tweet_id = sih.find('a', class_='tweet-timestamp')['href']
+        # TODO: Extract time stamp by following link under td.timestamp
+        import datetime
+        timestamp = datetime.datetime.now().timestamp()

        # Extract user name
-        author_account = re.search('^/(.+?)/', tweet_id).group(1)
+        author_account = str(sih.find('div', class_='username').span.next_sibling).strip('\n ')

        # Isolate tweet text container
-        ttc = result.find('div', class_='js-tweet-text-container')
+        ttc = result.find('tr', class_='tweet-container')
+
+        # Extract tweet id
+        tweet_id = ttc.find('div', class_='tweet-text')['data-id']

        # extract iterator over tweet text contents
-        tt_iter = ttc.find('p', class_='tweet-text').children
+        tt_iter = ttc.find('div', class_='dir-ltr').children

        tweet_text = cleanup_tweet_text(tt_iter)