exemple_mastodon.py/playlist2toot_v2/ytscraper.py
2022-05-23 20:36:29 +02:00

83 lines
3.0 KiB
Python
Executable File

# -*- coding: utf-8 -*-
from urllib.request import urlopen, urlretrieve
import re
import os
from datetime import timedelta
from bs4 import BeautifulSoup
"""
based on youtube-scraper (https://github.com/narfman0/youtube-scraper)
"""
class YoutubeScrape(object):
    """Metadata scraped from a parsed YouTube watch page.

    All scraping happens in the constructor; the results are exposed as
    plain attributes:

    title, yturl, thumbnail, videoid
        ``content`` attribute of the matching tags.
    artist, poster
        Text of the ``.yt-user-info`` element (both hold the same value).
    duration, durationinseconds, durationtime
        Raw duration string, its value in seconds, and that value
        rendered through ``str(timedelta(...))``.
    views
        Integer built from the digits of the ``.watch-view-count`` text.
    published
        Text of ``.watch-time-text`` without its "Published on" /
        "Uploaded on" prefix.
    """

    # Hour/minute/second components of an ISO 8601 duration ("PT#H#M#S").
    # Digit counts are unbounded so that a long duration expressed only in
    # minutes (e.g. "PT1445M30S") is parsed instead of collapsing to 0.
    _DURATION_RE = re.compile(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')

    # Seconds reported when the duration is missing or unrecognisable.
    _DEFAULT_DURATION = 30

    def __init__(self, soup):
        """Scrape every field from ``soup`` and store it on the instance.

        Args:
            soup: Parsed page exposing ``find_all(attrs=...)`` and
                ``select(css)``; ``scrape_html`` and ``scrape_url`` pass a
                ``BeautifulSoup`` object.

        Raises:
            IndexError: If an expected element is not present in the page.
            ValueError: If the view counter text contains no digits.
        """
        self.soup = soup
        self.title = self.parse_string({'name': 'title'})

        # The same element feeds both ``artist`` and ``poster``; read it once.
        uploader = self.parse_string_o('.yt-user-info')
        self.artist = uploader

        self.duration = self.parse_string({'itemprop': 'duration'})
        self.durationinseconds = self.parseISO8591(self.duration)
        self.durationtime = str(timedelta(seconds=self.durationinseconds))

        self.poster = uploader
        self.views = self.parse_int('.watch-view-count')

        published = self.parse_string_o('.watch-time-text')
        self.published = re.sub(r'(Published|Uploaded) on', '',
                                published).strip()

        self.yturl = self.parse_string({'property': "og:url"})
        self.thumbnail = self.parse_string({'property': "og:image"})
        self.videoid = self.parse_string({'itemprop': 'videoId'})

    def parse_int(self, selector):
        """Return the digits found in the selected element's text as an int.

        Every non-digit character (separators, labels) is discarded first.

        Raises:
            IndexError: If no element matches ``selector``.
            ValueError: If the text contains no digit at all.
        """
        digits = re.sub('[^0-9]', '', self.parse_string_o(selector))
        return int(digits)

    def parse_string(self, selector):
        """Return the ``content`` attribute of the first tag matching attrs.

        Args:
            selector: Dict of attribute names to values, forwarded as
                ``attrs`` to ``soup.find_all``.

        Raises:
            IndexError: If no tag matches.
        """
        matches = self.soup.find_all(attrs=selector)
        return matches[0]['content']

    def parse_string_o(self, selector):
        """Return the stripped text of the first element matching a CSS selector.

        Raises:
            IndexError: If no element matches ``selector``.
        """
        matches = self.soup.select(selector)
        return matches[0].get_text().strip()

    def parseISO8591(self, duration):
        """Convert an ISO 8601 ``PT#H#M#S`` duration string to seconds.

        The method name keeps its original "8591" spelling so existing
        callers continue to work; the format handled is ISO 8601. Only the
        hour, minute and second components are recognised.

        Args:
            duration: Duration string such as ``"PT1H2M3S"``; may be empty
                or ``None``.

        Returns:
            The duration in seconds. ``30`` is returned when ``duration`` is
            empty/``None`` or contains no ``PT`` marker; a ``PT`` marker
            without any component yields ``0``.
        """
        if not duration:
            return self._DEFAULT_DURATION

        match = self._DURATION_RE.search(duration)
        if match is None:
            return self._DEFAULT_DURATION

        # Components that are absent come back as None and count as zero.
        hours, minutes, seconds = (
            int(part) if part else 0 for part in match.groups()
        )
        return hours * 3600 + minutes * 60 + seconds
def scrape_html(html):
    """Build a ``YoutubeScrape`` from already-downloaded page markup.

    ``html`` is handed to ``BeautifulSoup`` with the built-in
    ``"html.parser"`` backend, and the resulting tree is scraped
    immediately by the ``YoutubeScrape`` constructor.
    """
    return YoutubeScrape(BeautifulSoup(html, "html.parser"))
def scrape_url(url):
    """Fetch ``url`` and return the ``YoutubeScrape`` built from its page.

    Args:
        url: Address of the page to download; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        A ``YoutubeScrape`` instance populated from the downloaded markup.

    Raises:
        Whatever ``urlopen`` raises when the request fails, plus the
        exceptions raised by ``YoutubeScrape`` when the page lacks an
        expected element.
    """
    # Read the whole body inside the ``with`` block so the response (and the
    # connection behind it) is closed even if parsing fails afterwards.
    with urlopen(url) as response:
        html = response.read()
    return YoutubeScrape(BeautifulSoup(html, "html.parser"))
def download_image(urlthumbnail, videoid, path):
    """Download a thumbnail into ``path`` and return the saved file's name.

    Args:
        urlthumbnail: URL of the image to fetch.
        videoid: Identifier embedded in the file name
            (``maxresdefault-<videoid>.jpg``).
        path: Destination directory; created, including missing parents,
            when it does not exist yet.

    Returns:
        The local file name reported by ``urlretrieve``, i.e.
        ``path + '/maxresdefault-' + videoid + '.jpg'``.
    """
    # exist_ok avoids the check-then-create race of a separate
    # os.path.exists() test when several downloads target the same folder.
    os.makedirs(path, exist_ok=True)

    # Plain '/' concatenation is kept so the returned path string has the
    # same form for callers regardless of platform.
    destination = path + '/maxresdefault-' + videoid + '.jpg'
    local_filename, _ = urlretrieve(urlthumbnail, destination)
    return local_filename