Extract addresses of avatar and banner images

2025-05-02 17:43:35 +00:00 · 2023-06-13 16:31:41 +02:00 · 2023-06-13 16:31:41 +02:00 · b8bd0a12f5
commit b8bd0a12f5
parent 54c59fa676
1 changed file with 16 additions and 4 deletions
--- a/twoot.py
+++ b/twoot.py
@@ -29,7 +29,7 @@ import sqlite3
 import sys
 import time
 from pathlib import Path
-from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, urljoin
+from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, urljoin, unquote

 import requests
 from bs4 import BeautifulSoup, element
@@ -699,6 +699,8 @@ def main(argv):
               mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
    db.execute('''CREATE INDEX IF NOT EXISTS main_index ON toots (twitter_account,
               mastodon_instance, mastodon_account, tweet_id)''')
+    db.execute('''CREATE TABLE IF NOT EXISTS profiles (mastodon_account TEXT, avatar text, banner, text)''')
+    db.execute('''CREATE INDEX IF NOT EXIsTS profile_index ON profiles (mastodon_account)''')

    # Select random nitter instance to fetch updates from
    nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS) - 1)]
@@ -748,13 +750,23 @@ def main(argv):
    logging.debug('Nitter page downloaded successfully from ' + url)

    # DEBUG: Save page to file
-    # of = open(toml['config']['twitter_account'] + '.html', 'w')
-    # of.write(twit_account_page.text)
-    # of.close()
+    of = open(TOML['config']['twitter_account'] + '.html', 'w')
+    of.write(twit_account_page.text)
+    of.close()

    # Make soup
    soup = BeautifulSoup(twit_account_page.text, 'html.parser')

+    # Extract avatar picture address
+    avatar = 'https://' + unquote(soup.find('div', class_='profile-card-info').findChild('a').findChild('img').get('src').removeprefix('/pic/'))
+
+    # Extract banner picture address
+    banner = unquote(soup.find('div', class_='profile-banner').findChild('a').findChild('img').get('src').removeprefix('/pic/'))
+
+    print(avatar)
+    print(banner)
+    exit(0)
+
    # Extract twitter timeline
    timeline = soup.find_all('div', class_='timeline-item')