added medium article downloader script

2024-11-23 20:11:07 +00:00 · 2019-10-14 18:37:19 +05:30 · 2019-10-14 18:37:19 +05:30 · 323f29ffcc
commit 323f29ffcc
parent 760ba8492c
5 changed files with 73 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -66,6 +66,7 @@ So far, the following projects have been integrated to this repo:
 |[Find PhoneNumber in String](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/Find-PhoneNumber-in-String)|[Austin Zuniga](https://github.com/AustinZuniga)|
 |[IMDB TV Series Info Extractor](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/imdb_episode_ratings)|[Yash Raj Sarrof](https://github.com/yashYRS) |
 |[Yoda-speak Translator](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/speak_like_yoda)|[sonniki](https://github.com/sonniki) |
 |[Medium Article Downloader](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/medium_article_downloader)|[coolsonu39](https://github.com/coolsonu39)|
 ## How to use :
--- a/medium_article_downloader/README.md
+++ b/medium_article_downloader/README.md
@ -0,0 +1,3 @@
 A simple Python script to download the latest articles from Medium, topic by topic, and save them as text files.
 It scrapes the site using the requests and bs4 modules. I made it just for fun after I read Automate the Boring Stuff with Python by Al Sweigart.
--- a/medium_article_downloader/helpers.py
+++ b/medium_article_downloader/helpers.py
@ -0,0 +1,43 @@
 import requests, bs4
 def get_topic():
     '''Ask the user for a Medium topic until a known one is entered.

     Prints a welcome banner, then repeatedly prompts on stdin. Entering
     'list' prints every known topic slug; any other unknown entry prints
     a "not found" message. Input is stripped of surrounding whitespace
     and lower-cased before it is compared against the known slugs.

     Returns the chosen topic slug (a str taken from the known list).
     '''
     topic_list = [
         'comics', 'books', 'art', 'culture', 'film', 'food', 'gaming',
         'humor', 'internet-culture', 'lit', 'medium-magazine', 'music',
         'photography', 'social-media', 'sports', 'style', 'true-crime',
         'tv', 'writing', 'business', 'design', 'economy', 'startups',
         'freelancing', 'leadership', 'marketing', 'productivity', 'work',
         'artificial-intelligence', 'blockchain', 'cryptocurrency',
         'cybersecurity', 'data-science', 'gadgets', 'javascript',
         'machine-learning', 'math', 'neuroscience', 'programming',
         'science', 'self-driving-cars', 'software-engineering', 'space',
         'technology', 'visual-design', 'addiction', 'creativity',
         'disability', 'family', 'health', 'mental-health', 'parenting',
         'personal-finance', 'pets', 'psychedelics', 'psychology',
         'relationships', 'self', 'sexuality', 'spirituality', 'travel',
         'wellness', 'basic-income', 'cities', 'education', 'environment',
         'equality', 'future', 'gun-control', 'history', 'justice',
         'language', 'lgbtqia', 'media', 'masculinity', 'philosophy',
         'politics', 'race', 'religion', 'san-francisco', 'transportation',
         'women', 'world',
     ]
     print('Welcome to Medium article downloader by @CoolSonu39!')
     print('Which domain do you want to read today?')
     while True:
         print("Enter 'list' to see the list of topics.")
         # Normalise so that ' Math ' is accepted just like 'math'.
         choice = input('Enter your choice: ').strip().lower()
         if choice in topic_list:
             return choice
         if choice == 'list':
             print()
             for topic in topic_list:
                 print(topic)
             print()
         else:
             print(f"\nTopic '{choice}' not found :(")
 def extract_links(url, timeout=30):
     '''Fetch the page at url and return the article link tags found on it.

     url -- address of the page to download.
     timeout -- seconds requests waits on the server before giving up
         (default 30). Without a timeout a stalled connection would block
         the script forever.

     Returns the result of the CSS selection 'h3 > a' on the parsed page,
     i.e. the <a> tags that are direct children of an <h3>.

     The page is parsed with the 'html5lib' parser, so that package has to
     be installed alongside requests and bs4.
     '''
     html_response = requests.get(url, timeout=timeout)
     parsed_response = bs4.BeautifulSoup(html_response.text, features='html5lib')
     return parsed_response.select('h3 > a')
 def medium_text(url, timeout=30):
     '''Download the page at url and return its headings and paragraphs as text.

     url -- address of the article to download.
     timeout -- seconds requests waits on the server before giving up
         (default 30). Without a timeout a stalled connection would block
         the script forever.

     Returns one string made of the text of every <h1>, <p> and <h2> tag
     in document order, each followed by a blank line ('\\n\\n').

     The page is parsed with the 'html5lib' parser, so that package has to
     be installed alongside requests and bs4.
     '''
     html_response = requests.get(url, timeout=timeout)
     parsed_response = bs4.BeautifulSoup(html_response.text, features='html5lib')
     tag_list = parsed_response.find_all(['h1', 'p', 'h2'])
     # A single join builds the same text as appending tag by tag.
     return ''.join(tag.getText() + '\n\n' for tag in tag_list)
--- a/medium_article_downloader/medium.py
+++ b/medium_article_downloader/medium.py
@ -0,0 +1,24 @@
 import requests, bs4
 from helpers import *
 # Ask for a topic, list its latest articles and save each one to
 # "<heading>.txt" in the current working directory.
 choice = get_topic()
 print(f'\nGetting latest article links from {choice}...')
 article_list = extract_links('https://medium.com/topic/' + choice)
 print('Total articles found: ' + str(len(article_list)))
 # Characters that are not allowed in file names on Windows and/or POSIX.
 invalid_chars = str.maketrans('', '', '\\/:*?"<>|')
 for number, article in enumerate(article_list, start=1):
     heading = article.getText().strip()
     artlink = article.get('href')
     if not artlink:
         # An anchor without an href has nothing to download.
         continue
     # Relative links are resolved against the Medium site root.
     if not artlink.startswith("https://"):
         artlink = "https://medium.com" + artlink
     print('Downloading article: ' + str(number))
     # Remove invalid characters from the filename; fall back to a numbered
     # name when nothing usable is left of the heading.
     safe_heading = heading.translate(invalid_chars) or f"article_{number}"
     file_name = f"{safe_heading}.txt"
     # Download first so a failed request does not leave an empty file behind.
     article_text = medium_text(artlink)
     # Explicit UTF-8: article text routinely contains non-ASCII characters
     # that the platform default encoding may not be able to represent.
     with open(file_name, 'w', encoding='utf-8') as file:
         file.write(article_text)
 print('Done.')
--- a/medium_article_downloader/requirements.txt
+++ b/medium_article_downloader/requirements.txt
@ -0,0 +1,2 @@
 requests
 bs4
		`@ -0,0 +1,3 @@`
							`A simple Python script to download the latest articles from Medium, topic by topic, and save them as text files.`

							`It basically scrapes the site using requests and bs4 modules. I made it just for fun after I read Automate the Boring Stuff with Python by Al Sweigart.`