Add Slideshare-Downloader script to download SlideShare presentations

This commit is contained in:
Christopher Goes 2018-10-03 22:11:59 -06:00
parent a04df11006
commit 9792e9924b
3 changed files with 100 additions and 0 deletions

View File

@ -0,0 +1,23 @@
# Slideshare-Downloader
Download slides from slideshows shared on SlideShare (Now LinkedIn SlideShare) as a PDF.
# Usage
This was written for Python 3, but it should work with Python 2.7 as well.
## Installation
### Linux/Mac
```bash
python3 -m pip install --user -U -r requirements.txt
python3 slideshare_downloader.py --help
```
### Windows
```powershell
py -3 -m pip install --user -U -r requirements.txt
py -3 slideshare_downloader.py --help
```
## Running
```bash
slideshare_downloader.py -f some_slides -u http://www.slideshare.net/codeblue_jp/igor-skochinsky-enpub
```

View File

@ -0,0 +1,4 @@
beautifulsoup4>=4.0.0
requests>=2.0.0
img2pdf>=0.2.1
docopt>=0.6.0

View File

@ -0,0 +1,73 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Credit for base code goes to: yodiaditya
# https://github.com/yodiaditya/slideshare-downloader/blob/master/convertpdf.py
"""SlideShare Downloader.
Usage:
slideshare_downloader.py [options]
Options:
-h, --help Show this screen
-f <file> Specify output filename
-u <url> URL to download
"""
import img2pdf
from docopt import docopt
from os import walk, mkdir, chdir, getcwd
from os.path import join
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup
from requests import get
class SlideShare:
""" Download slides from SlideShare and convert them into a PDF. """
def __init__(self):
self.TOP_DIR = getcwd()
def get_slides(self, download_url=None, filename=None):
if download_url:
i_dir = self.download_images(download_url)
else:
i_dir = self.download_images(input('SlideShare full URL (including "http://"): '))
if filename:
self.create_pdf(i_dir, filename + '.pdf')
else:
self.create_pdf(i_dir, i_dir + '.pdf')
@staticmethod
def download_images(page_url):
html = urlopen(page_url).read()
soup = BeautifulSoup(html, 'html.parser')
images = soup.findAll('img', {'class': 'slide_image'}) # Parse out the slide images
image_dir = soup.title.string.strip(' \t\r\n').lower().replace(' ', '-') # Get name of the slide deck
try:
mkdir(image_dir) # Create the folder for our images
except FileExistsError:
print("The directory '%s' already exists. Assuming PDF rebuild, continuing with existing contents...\n"
"Delete the directory to re-download the slide images." % image_dir)
return image_dir
chdir(image_dir) # Change to image folder so we don't pollute starting folder
for image in images:
image_url = image.get('data-full').split('?')[0]
with open(urlparse(image_url).path.split('/')[-1], "wb") as file:
response = get(image_url)
file.write(response.content)
return image_dir
def create_pdf(self, image_dir, filename):
chdir(join(self.TOP_DIR, image_dir))
files = next(walk(join(self.TOP_DIR, image_dir)))[2]
with open(join(self.TOP_DIR, filename), "wb") as file:
img2pdf.convert(*files, title=filename, outputstream=file)
if __name__ == "__main__":
arguments = docopt(__doc__)
ss = SlideShare()
ss.get_slides(arguments['-u'], arguments['-f'])