Added option to scrape linked page if no pic is provided in tweet

2025-05-07 03:53:56 +00:00 · 2019-08-16 15:27:55 +02:00 · 2019-08-16 15:27:55 +02:00 · 5b23c66b6b
commit 5b23c66b6b
parent 8059b062ac
1 changed files with 13 additions and 0 deletions
--- a/twoot.py
+++ b/twoot.py
@@ -213,6 +213,19 @@ def main(argv):
            if len(videos) != 0:
                tweet_text += '\n\n[Embedded video in original tweet]'

+        # If the tweet has no photos, fall back to the first image advertised by a
+        # "twitter:image" meta tag on the first page linked in the tweet text
+        if not photos:
+            m = re.search(r"http[^ \n\xa0]*", tweet_text)
+            if m is not None:
+                link_url = m.group(0)
+                r = requests.get(link_url)
+                if r.status_code == 200:
+                    # Matches the first instance of either twitter:image or twitter:image:src meta tag
+                    match = re.search(r'<meta name="twitter:image(?:|:src)" content="(.+?)".*?>', r.text)
+                    if match is not None:
+                        photos.append(match.group(1))
+
        # Add dictionary with content of tweet to list
        tweet = {
            "author": author,