83 lines
3.0 KiB
Python
Executable File
83 lines
3.0 KiB
Python
Executable File
# -*- coding: utf-8 -*-
|
|
from urllib.request import urlopen, urlretrieve
|
|
import re
|
|
import os
|
|
from datetime import timedelta
|
|
from bs4 import BeautifulSoup
|
|
|
|
"""
|
|
based on youtube-scraper (https://github.com/narfman0/youtube-scraper)
|
|
"""
|
|
|
|
|
|
class YoutubeScrape(object):
    """Metadata scraped from a parsed YouTube watch page.

    All attributes are extracted once, at construction time, from the
    ``soup`` object handed to ``__init__``.
    """

    # ``PT#H#M#S`` duration; every component is optional.  The digit counts
    # are deliberately unbounded so values such as ``PT1000M5S`` or
    # ``PT100S`` are parsed instead of collapsing to zero seconds.
    _DURATION_RE = re.compile(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')

    # Duration (in seconds) reported when none can be parsed.
    _DEFAULT_DURATION = 30

    def __init__(self, soup):
        """Scrape every supported field out of ``soup``.

        Args:
            soup: Parsed page exposing ``find_all(attrs=...)`` and
                ``select(css_selector)`` (e.g. a BeautifulSoup document).

        Raises:
            IndexError: If an expected element is not present in the page.
            ValueError: If the view counter contains no digits.
        """
        self.soup = soup
        self.title = self.parse_string({'name': 'title'})
        self.artist = self.parse_string_o('.yt-user-info')
        self.duration = self.parse_string({'itemprop': 'duration'})
        self.durationinseconds = self.parseISO8591(self.duration)
        self.durationtime = str(timedelta(seconds=self.durationinseconds))
        # The poster is read from the same element as the artist.
        self.poster = self.parse_string_o('.yt-user-info')
        self.views = self.parse_int('.watch-view-count')
        # Drop the "Published on" / "Uploaded on" prefix, keep the date text.
        published = self.parse_string_o('.watch-time-text')
        self.published = re.sub(r'(Published|Uploaded) on', '',
                                published).strip()
        # The '#watch-like' / '#watch-dislike' counters are currently not
        # scraped.
        self.yturl = self.parse_string({'property': "og:url"})
        self.thumbnail = self.parse_string({'property': "og:image"})
        self.videoid = self.parse_string({'itemprop': 'videoId'})

    def parse_int(self, selector):
        """Return the integer formed by the digits of the selected text.

        Every non-digit character (thousands separators, labels, ...) is
        removed from the text of the first element matching the CSS
        ``selector`` before conversion.

        Raises:
            IndexError: If nothing matches ``selector``.
            ValueError: If the matched text contains no digits.
        """
        return int(re.sub('[^0-9]', '', self.parse_string_o(selector)))

    def parse_string(self, selector):
        """Return the ``content`` attribute of the first matching tag.

        Args:
            selector: Attribute filter passed to ``find_all(attrs=...)``,
                e.g. ``{'property': 'og:url'}``.

        Raises:
            IndexError: If no tag matches ``selector``.
        """
        return self.soup.find_all(attrs=selector)[0]['content']

    def parse_string_o(self, selector):
        """Return the stripped text of the first element matching ``selector``.

        Args:
            selector: CSS selector passed to ``select()``.

        Raises:
            IndexError: If nothing matches ``selector``.
        """
        return self.soup.select(selector)[0].get_text().strip()

    def parseISO8591(self, duration):
        """Convert an ISO 8601 ``PT#H#M#S`` duration string to seconds.

        The method name keeps its original "8591" spelling so existing
        callers keep working.

        Args:
            duration: Duration string such as ``'PT1H2M3S'``; may be empty
                or ``None``.

        Returns:
            Total number of seconds as an ``int``.  Falls back to 30 when
            ``duration`` is empty/``None`` or contains no ``PT`` marker.
        """
        if not duration:
            return self._DEFAULT_DURATION
        match = self._DURATION_RE.search(duration)
        if match is None:
            return self._DEFAULT_DURATION
        # Missing components come back as None and count as zero.
        hours, minutes, seconds = (
            int(part) if part else 0 for part in match.groups()
        )
        return hours * 3600 + minutes * 60 + seconds
|
|
|
|
|
|
def scrape_html(html):
    """Parse ``html`` and return the scraped video metadata.

    Args:
        html: Markup of a YouTube watch page.

    Returns:
        A ``YoutubeScrape`` built from the page parsed with "html.parser".
    """
    return YoutubeScrape(BeautifulSoup(html, "html.parser"))
|
|
|
|
|
|
def scrape_url(url):
    """Fetch ``url`` and return the scraped video metadata.

    Args:
        url: Address of the YouTube watch page to download.

    Returns:
        A ``YoutubeScrape`` built from the downloaded page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Parse while the response is open, then let the context manager close
    # the connection instead of leaking it.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, "html.parser")
    return YoutubeScrape(soup)
|
|
|
|
|
|
def download_image(urlthumbnail, videoid, path):
    """Download a video thumbnail into the directory ``path``.

    The directory is created (including parents) when it does not exist.
    The file is stored as ``maxresdefault-<videoid>.jpg``.

    Args:
        urlthumbnail: URL of the thumbnail image.
        videoid: Video identifier used in the target file name.
        path: Destination directory.

    Returns:
        The local file name the image was written to.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)
    target = os.path.join(path, 'maxresdefault-' + videoid + '.jpg')
    local_filename, _ = urlretrieve(urlthumbnail, target)
    return local_filename
|