From a5fde5861583800a6428bcd5a2e57ca700a1240c Mon Sep 17 00:00:00 2001 From: jeancf Date: Sat, 15 Feb 2020 15:39:01 +0100 Subject: [PATCH] Rewrite complete --- twoot.py | 105 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 40 deletions(-) diff --git a/twoot.py b/twoot.py index 31fcdc5..41dc251 100755 --- a/twoot.py +++ b/twoot.py @@ -25,7 +25,7 @@ import random import requests from bs4 import BeautifulSoup, element import sqlite3 -import time +import datetime, time import re from mastodon import Mastodon, MastodonError, MastodonAPIError, MastodonIllegalArgumentError @@ -73,6 +73,10 @@ def cleanup_tweet_text(tt_iter): tweet_text += ' ' # Add full url tweet_text += tag['data-expanded-url'] + if tag.has_attr('data-expanded-path'): + data_expanded_path = tag['data-expanded-path'] + if 'video' in data_expanded_path: + tweet_text += '\n\n[Video embedded in original tweet]' # If element is hashflag (hashtag + icon), handle as simple hashtag elif tag.name == 'span' and tag['class'][0] == 'twitter-hashflag-container': @@ -105,6 +109,19 @@ def cleanup_tweet_text(tt_iter): return tweet_text +def contains_class(body_classes, some_class): + ''' + :param body_classes: list of classes to search + :param some_class: class that we are interested in + :return: True if found, false otherwise + ''' + found = False + for body_class in body_classes: + if body_class == some_class: + found = True + + return found + def main(argv): # Build parser for command line arguments @@ -147,6 +164,10 @@ def main(argv): # Download twitter page of user. We should get a 'no javascript' landing page and some cookies no_js_page = requests.get(url, headers=headers) + # Verify that download worked + assert no_js_page.status_code == 200,\ + 'The twitter page did not download correctly. 
Aborting' + # DEBUG: Save page to file of = open('no_js_page.html', 'w') of.write(no_js_page.text) @@ -167,62 +188,72 @@ def main(argv): twit_account_page = requests.post('https://mobile.twitter.com/i/nojs_router?path=%2F' + twit_account, headers=headers, cookies=no_js_page.cookies) + # Verify that download worked + assert twit_account_page.status_code == 200,\ + 'The twitter page did not download correctly. Aborting' + # DEBUG: Save page to file of = open(twit_account + '.html', 'w') of.write(twit_account_page.text) of.close() - # Verify that download worked - assert twit_account_page.status_code == 200,\ - 'The twitter page did not download correctly. Aborting' + # Make soup + soup = BeautifulSoup(twit_account_page.text, 'html.parser') # Verify that we now have the correct twitter page - soup = BeautifulSoup(twit_account_page.text, 'html.parser') - assert twit_account.lower() in str(soup.head.title.string).lower(),\ + body_classes = soup.body.get_attribute_list('class') + assert contains_class(body_classes, 'users-show-page'), \ 'This is not the correct twitter page. Quitting' # Extract twitter timeline timeline = soup.find_all('table', class_='tweet') for status in timeline: + # Extract tweet id + tweet_id = str(status['href']).strip('?p=v') + # Extract url of full status page - full_status_url = 'https://mobile.twitter.com' + status['href'] + full_status_url = 'https://mobile.twitter.com' + tweet_id # fetch full status page full_status_page = requests.get(full_status_url, cookies=twit_account_page.cookies) - # For some funny reason the command above only works if I don't provide headers - # If I do, I get the no_js page... + # FIXME: For some funny reason the command above only works if I don't provide headers. If I do, I get the no_js page... + + # Verify that download worked + assert full_status_page.status_code == 200, \ + 'The twitter page did not download correctly. 
Aborting' # DEBUG: Save page to file of = open('full_status_page.html', 'w') of.write(full_status_page.text) of.close() - sys.exit(1) - # Extract tweet id - tweet_id = str(status['href']).strip('?p=v') + # Make soup + soup = BeautifulSoup(full_status_page.text, 'html.parser') - # Isolate tweet header - sih = status.find('tr', class_='tweet-header') + # Verify that we now have the correct twitter page + body_classes = soup.body.get_attribute_list('class') + assert contains_class(body_classes, 'tweets-show-page'), \ + 'This is not the correct twitter page. Quitting' + + # Isolate table main-tweet + tmt = soup.find('table', class_='main-tweet') + + # Extract avatar + author_logo_url = tmt.find('td', class_='avatar').a.img['src'] # extract author - author = sih.find('strong', class_='fullname').get_text() - - # Extract author's logo - author_logo_url = sih.find('img', alt=author)['src'] - - # TODO: Extract time stamp by following link under td.timestamp - import datetime - timestamp = datetime.datetime.now().timestamp() + author = tmt.find('div', class_='fullname').a.strong.get_text() # Extract user name - author_account = str(sih.find('div', class_='username').span.next_sibling).strip('\n ') + author_account = str(tmt.find('span', class_='username').span.next_sibling).strip('\n ') - # Isolate tweet text container - ttc = status.find('tr', class_='tweet-container') + # TODO: Extract time stamp + time_string = tmt.find('div', class_='metadata').a.get_text() + timestamp = datetime.datetime.strptime(time_string, '%I:%M %p - %d %b %Y').timestamp() # extract iterator over tweet text contents - tt_iter = ttc.find('div', class_='dir-ltr').children + tt_iter = tmt.find('div', class_='tweet-text').div.children tweet_text = cleanup_tweet_text(tt_iter) @@ -233,21 +264,15 @@ def main(argv): # Add footer with link to original tweet tweet_text += '\n\nOriginal tweet : https://twitter.com/' + tweet_id - # Isolate attached media container - amoc = status.find('div', 
class_='AdaptiveMediaOuterContainer') + # DEBUG: STOP HERE + #sys.exit(1) - photos = [] - if amoc: - # Extract photos - photo_conts = amoc.find_all('div', class_='AdaptiveMedia-photoContainer') - for p in photo_conts: - photos.append(p['data-image-url']) - # Extract tweet id - tweet_id = ttc.find('div', class_='tweet-text')['data-id'] - # Mention presence of videos in tweet - videos = amoc.find_all('div', class_='AdaptiveMedia-videoContainer') - if len(videos) != 0: - tweet_text += '\n\n[Video embedded in original tweet]' + photos = [] # The no_js version of twitter only shows one photo + + media = tmt.find('div', class_='media') + if media: + pic = str(media.img['src']).strip(':small') + photos.append(pic) # If no media was specifically added in the tweet, try to get the first picture # with "twitter:image" meta tag in first linked page in tweet text