# -*- coding: utf-8 -*-
"""
Scrape metadata about a YouTube video from the HTML of its watch page.

based on youtube-scraper (https://github.com/narfman0/youtube-scraper)
"""
from urllib.request import urlopen, urlretrieve
import re
import os
from datetime import timedelta

from bs4 import BeautifulSoup

# Duration (in seconds) reported when the page carries no parseable duration.
DEFAULT_DURATION_SECONDS = 30

# "PT#H#M#S" with every component optional; one capture group per component.
_DURATION_RE = re.compile(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')


class YoutubeScrape(object):
    """
    Scraper object to hold data.

    All attributes are extracted once, in ``__init__``, from the parsed page:

    * ``title``, ``duration``, ``yturl``, ``thumbnail``, ``videoid`` -- the
      ``content`` attribute of the matching tags.
    * ``durationinseconds`` / ``durationtime`` -- ``duration`` converted to an
      integer number of seconds and to ``str(timedelta)`` form.
    * ``artist`` / ``poster`` -- text of the first ``.yt-user-info`` element.
    * ``views`` -- digits of the first ``.watch-view-count`` element, as int.
    * ``published`` -- text of the first ``.watch-time-text`` element with a
      leading "Published on" / "Uploaded on" removed.
    """

    def __init__(self, soup):
        """
        Initialize and scrape.

        :param soup: parsed page exposing ``find_all(attrs=...)`` and
            ``select(css_selector)`` (a ``BeautifulSoup`` object).
        :raises IndexError: if any of the expected elements is missing.
        :raises ValueError: if the view counter contains no digits.
        """
        self.soup = soup
        self.title = self.parse_string({'name': 'title'})

        # artist and poster come from the same element; query it only once.
        uploader = self.parse_string_o('.yt-user-info')
        self.artist = uploader
        self.poster = uploader

        self.duration = self.parse_string({'itemprop': 'duration'})
        self.durationinseconds = self.parseISO8591(self.duration)
        self.durationtime = str(timedelta(seconds=self.durationinseconds))

        self.views = self.parse_int('.watch-view-count')

        published = self.parse_string_o('.watch-time-text')
        self.published = re.sub(
            r'(Published|Uploaded) on', '', published).strip()

        self.yturl = self.parse_string({'property': "og:url"})
        self.thumbnail = self.parse_string({'property': "og:image"})
        self.videoid = self.parse_string({'itemprop': 'videoId'})

    def parse_int(self, selector):
        """
        Extract one integer element from soup.

        Every non-digit character of the element text is discarded before
        conversion, so thousands separators and labels are ignored.

        :param selector: CSS selector passed to ``parse_string_o``.
        :raises IndexError: if nothing matches ``selector``.
        :raises ValueError: if the matched text contains no digits.
        """
        return int(re.sub(r'[^0-9]', '', self.parse_string_o(selector)))

    def parse_string(self, selector):
        """
        Return the ``content`` attribute of the first tag matching ``selector``.

        :param selector: dict of attribute name -> value, used as the
            ``attrs`` filter of ``find_all``.
        :raises IndexError: if no tag matches.
        :raises KeyError: if the first match has no ``content`` attribute.
        """
        return self.soup.find_all(attrs=selector)[0]['content']

    def parse_string_o(self, selector):
        """
        Return the stripped text of the first element matching ``selector``.

        :param selector: CSS selector passed to ``select``.
        :raises IndexError: if no element matches.
        """
        return self.soup.select(selector)[0].get_text().strip()

    def parseISO8591(self, duration):
        """
        Convert an ISO 8601 ``PT#H#M#S`` duration string to seconds.

        (The method name keeps its historical "8591" spelling so existing
        callers continue to work.)

        Each of the hour, minute and second components is optional and may
        have any number of digits. The first ``PT...`` occurrence in the
        string is used.

        :param duration: duration string, e.g. ``"PT1H2M3S"``; may be empty
            or ``None``.
        :return: total number of seconds as an int, or
            ``DEFAULT_DURATION_SECONDS`` when ``duration`` is empty or
            contains no ``PT`` marker.
        """
        if not duration:
            return DEFAULT_DURATION_SECONDS

        match = _DURATION_RE.search(duration)
        if match is None:
            return DEFAULT_DURATION_SECONDS

        # Components absent from the string come back as None -> count as 0.
        hours, minutes, seconds = (
            int(part) if part else 0 for part in match.groups())
        return hours * 3600 + minutes * 60 + seconds


def scrape_html(html):
    """
    Return meta information about a video.

    :param html: markup of the video page (anything ``BeautifulSoup`` accepts).
    :return: a populated ``YoutubeScrape``.
    """
    soup = BeautifulSoup(html, "html.parser")
    return YoutubeScrape(soup)


def scrape_url(url):
    """
    Scrape a given url for youtube information.

    :param url: address of the video page, fetched with ``urlopen``.
    :return: a populated ``YoutubeScrape``.
    """
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as response:
        return YoutubeScrape(BeautifulSoup(response, "html.parser"))


def download_image(urlthumbnail, videoid, path):
    """
    Download the thumbnail.

    The image is stored as ``<path>/maxresdefault-<videoid>.jpg``; ``path`` is
    created first when it does not exist.

    :param urlthumbnail: URL of the image to fetch.
    :param videoid: video identifier used in the file name.
    :param path: destination directory.
    :return: the local file name the image was written to.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)
    target = path + '/maxresdefault-' + videoid + '.jpg'
    local_filename, _ = urlretrieve(urlthumbnail, target)
    return local_filename