From 323f29ffccea39b08e07d06fa07b72625eedc807 Mon Sep 17 00:00:00 2001
From: Sonal Agrawal
Date: Mon, 14 Oct 2019 18:37:19 +0530
Subject: [PATCH] added medium article downloader script

---
 README.md                                  |  1 +
 medium_article_downloader/README.md        |  3 ++
 medium_article_downloader/helpers.py       | 43 ++++++++++++++++++++++
 medium_article_downloader/medium.py        | 24 ++++++++++++
 medium_article_downloader/requirements.txt |  2 +
 5 files changed, 73 insertions(+)
 create mode 100644 medium_article_downloader/README.md
 create mode 100644 medium_article_downloader/helpers.py
 create mode 100644 medium_article_downloader/medium.py
 create mode 100644 medium_article_downloader/requirements.txt

diff --git a/README.md b/README.md
index 96e4bb8..b32b6ef 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,7 @@ So far, the following projects have been integrated to this repo:
 |[Find PhoneNumber in String](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/Find-PhoneNumber-in-String)|[Austin Zuniga](https://github.com/AustinZuniga)|
 |[IMDB TV Series Info Extractor](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/imdb_episode_ratings)|[Yash Raj Sarrof](https://github.com/yashYRS) |
 |[Yoda-speak Translator](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/speak_like_yoda)|[sonniki](https://github.com/sonniki) |
+|[Medium Article Downloader](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/medium_article_downloader)|[coolsonu39](https://github.com/coolsonu39)|
 
 ## How to use :
 
diff --git a/medium_article_downloader/README.md b/medium_article_downloader/README.md
new file mode 100644
index 0000000..30d69bf
--- /dev/null
+++ b/medium_article_downloader/README.md
@@ -0,0 +1,3 @@
+A simple Python script to download the latest articles from Medium topic-wise and save them in text files.
+
+It basically scrapes the site using the requests and bs4 modules. I made it just for fun after I read Automate the Boring Stuff with Python by Al Sweigart.
\ No newline at end of file
diff --git a/medium_article_downloader/helpers.py b/medium_article_downloader/helpers.py
new file mode 100644
index 0000000..e7d3177
--- /dev/null
+++ b/medium_article_downloader/helpers.py
@@ -0,0 +1,55 @@
+import requests, bs4
+
+
+def get_topic():
+    '''Ask the user for a Medium topic until a known one is entered.
+
+    Entering 'list' prints every available topic, one per line.
+    Returns the chosen topic as a string.
+    '''
+
+    topic_list = ['comics', 'books', 'art', 'culture', 'film', 'food', 'gaming', 'humor', 'internet-culture', 'lit', 'medium-magazine', 'music', 'photography', 'social-media', 'sports', 'style', 'true-crime', 'tv', 'writing', 'business', 'design', 'economy', 'startups', 'freelancing', 'leadership', 'marketing', 'productivity', 'work', 'artificial-intelligence', 'blockchain', 'cryptocurrency', 'cybersecurity', 'data-science', 'gadgets', 'javascript', 'machine-learning', 'math', 'neuroscience', 'programming', 'science', 'self-driving-cars', 'software-engineering', 'space', 'technology', 'visual-design', 'addiction', 'creativity', 'disability', 'family', 'health', 'mental-health', 'parenting', 'personal-finance', 'pets', 'psychedelics', 'psychology', 'relationships', 'self', 'sexuality', 'spirituality', 'travel', 'wellness', 'basic-income', 'cities', 'education', 'environment', 'equality', 'future', 'gun-control', 'history', 'justice', 'language', 'lgbtqia', 'media', 'masculinity', 'philosophy', 'politics', 'race', 'religion', 'san-francisco', 'transportation', 'women', 'world']
+    print('Welcome to Medium article downloader by @CoolSonu39!')
+    print('Which domain do you want to read today?')
+    while True:
+        print("Enter 'list' to see the list of topics.")
+        # Tolerate stray spaces around the typed topic.
+        choice = input('Enter your choice: ').strip()
+        if choice in topic_list:
+            return choice
+        if choice == 'list':
+            print()
+            print('\n'.join(topic_list))
+            print()
+        else:
+            print('\nTopic ' + choice + ' not found :(')
+
+
+def _get_soup(url):
+    '''Download url and return its HTML parsed with html5lib.
+
+    Raises a requests.RequestException subclass when the request times
+    out, cannot connect, or is answered with a 4xx/5xx status, so an
+    error page is never scraped as if it were real content.
+    '''
+
+    html_response = requests.get(url, timeout=30)
+    html_response.raise_for_status()
+    return bs4.BeautifulSoup(html_response.text, features='html5lib')
+
+
+def extract_links(url):
+    '''Return the <a> tags that are direct children of <h3> tags at url.'''
+
+    return _get_soup(url).select('h3 > a')
+
+
+def medium_text(url):
+    '''Extract text from a medium article link.
+
+    Returns the text of every <h1>, <h2> and <p> tag in document order,
+    each followed by a blank line.
+    '''
+
+    tag_list = _get_soup(url).find_all(['h1', 'p', 'h2'])
+    return ''.join(tag.getText() + '\n\n' for tag in tag_list)
diff --git a/medium_article_downloader/medium.py b/medium_article_downloader/medium.py
new file mode 100644
index 0000000..6436f65
--- /dev/null
+++ b/medium_article_downloader/medium.py
@@ -0,0 +1,49 @@
+import requests, bs4
+from helpers import extract_links, get_topic, medium_text
+
+
+def safe_file_name(heading):
+    '''Build a .txt file name from an article heading.
+
+    Drops path separators and the characters Windows forbids in file
+    names, and collapses whitespace runs (including newlines) to one space.
+    '''
+    cleaned = heading.translate(str.maketrans('', '', '\\/:*?"<>|'))
+    cleaned = ' '.join(cleaned.split())
+    # A heading made only of dropped characters still needs a name.
+    return (cleaned or 'untitled') + '.txt'
+
+
+def main():
+    '''Ask for a topic and save each of its latest articles to a text file.'''
+    choice = get_topic()
+    print('\nGetting latest article links from %s...' % (choice))
+
+    article_list = extract_links('https://medium.com/topic/' + choice)
+    print('Total articles found: ' + str(len(article_list)))
+
+    for i, article in enumerate(article_list, start=1):
+        artlink = article.get('href')
+        if not artlink:
+            # An anchor without href gives nothing to download.
+            continue
+        artlink = artlink if artlink.startswith("https://") else "https://medium.com" + artlink
+        print('Downloading article: ' + str(i))
+
+        # Download before opening the file so a failed request does not
+        # leave an empty file behind; then carry on with the next article.
+        try:
+            article_text = medium_text(artlink)
+        except requests.RequestException as err:
+            print('Skipping article ' + str(i) + ': ' + str(err))
+            continue
+
+        # utf-8 keeps non-ASCII article text writable on every platform.
+        with open(safe_file_name(article.getText()), 'w', encoding='utf-8') as file:
+            file.write(article_text)
+
+    print('Done.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/medium_article_downloader/requirements.txt b/medium_article_downloader/requirements.txt
new file mode 100644
index 0000000..1f311f5
--- /dev/null
+++ b/medium_article_downloader/requirements.txt
@@ -0,0 +1,3 @@
+requests
+bs4
+html5lib