Extract addresses of avatar and banner images

This commit is contained in:
jeancf 2023-06-13 16:31:41 +02:00
parent 54c59fa676
commit b8bd0a12f5

View File

@ -29,7 +29,7 @@ import sqlite3
import sys
import time
from pathlib import Path
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, urljoin
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, urljoin, unquote
import requests
from bs4 import BeautifulSoup, element
@ -699,6 +699,8 @@ def main(argv):
mastodon_account TEXT, tweet_id TEXT, toot_id TEXT)''')
db.execute('''CREATE INDEX IF NOT EXISTS main_index ON toots (twitter_account,
mastodon_instance, mastodon_account, tweet_id)''')
# Table with one avatar and one banner value per mastodon account
# (presumably the image addresses extracted from the profile page -- confirm).
# The previous column list read "banner, text", which made SQLite create an
# untyped "banner" column plus a spurious column named "text".
# NOTE(review): databases already created with the old column list keep it,
# because of IF NOT EXISTS; migrate or drop the table if that matters.
db.execute('''CREATE TABLE IF NOT EXISTS profiles (mastodon_account TEXT, avatar TEXT, banner TEXT)''')
db.execute('''CREATE INDEX IF NOT EXISTS profile_index ON profiles (mastodon_account)''')
# Select random nitter instance to fetch updates from
nitter_url = NITTER_URLS[random.randint(0, len(NITTER_URLS) - 1)]
@ -748,13 +750,23 @@ def main(argv):
logging.debug('Nitter page downloaded successfully from ' + url)
# DEBUG: Save page to file
# of = open(toml['config']['twitter_account'] + '.html', 'w')
# of.write(twit_account_page.text)
# of.close()
# Dump the downloaded page next to the script for inspection. The context
# manager closes the file even if the write fails; the explicit encoding
# avoids a UnicodeEncodeError on platforms whose default encoding cannot
# represent every character of the page.
with open(TOML['config']['twitter_account'] + '.html', 'w', encoding='utf-8') as of:
    of.write(twit_account_page.text)
# Make soup
soup = BeautifulSoup(twit_account_page.text, 'html.parser')
# Extract avatar picture address.
# The src attribute is stripped of its '/pic/' prefix and percent-decoded;
# the scheme is prepended here (presumably the avatar src carries none -- confirm).
# find()/findChild()/get() return None when an element or attribute is
# missing, which surfaces as AttributeError on the next call in the chain;
# in that case the address is left as None instead of crashing.
try:
    avatar = 'https://' + unquote(
        soup.find('div', class_='profile-card-info')
        .findChild('a')
        .findChild('img')
        .get('src')
        .removeprefix('/pic/')
    )
except AttributeError:
    avatar = None

# Extract banner picture address.
# No scheme is prepended here (presumably the banner src already includes
# it -- confirm). A profile page without the banner markup yields None.
try:
    banner = unquote(
        soup.find('div', class_='profile-banner')
        .findChild('a')
        .findChild('img')
        .get('src')
        .removeprefix('/pic/')
    )
except AttributeError:
    banner = None

# Report the result through the debug log rather than print()/exit(0):
# terminating here made the timeline processing below unreachable.
logging.debug('Avatar image address: %s', avatar)
logging.debug('Banner image address: %s', banner)
# Extract twitter timeline
timeline = soup.find_all('div', class_='timeline-item')