diff --git a/README.md b/README.md index 91e10fd..cbd22ec 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ So far, the following projects have been integrated to this repo: |[Zip password cracker](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/zip_password_cracker)|[umar abdullahi](https://github.com/umarbrowser)| |[CLI Calculator](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/cli_calculator)|[Willian GL](https://github.com/williangl) | |[Find PhoneNumber in String](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/Find-PhoneNumber-in-String)|[Austin Zuniga](https://github.com/AustinZuniga)| +|[IMDB TV Series Info Extractor](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/imdb_episode_ratings)|[Yash Raj Sarrof](https://github.com/yashYRS) | ## How to use : diff --git a/imdb_episode_ratings/README.md b/imdb_episode_ratings/README.md new file mode 100644 index 0000000..4e0154b --- /dev/null +++ b/imdb_episode_ratings/README.md @@ -0,0 +1,19 @@ +# Get information about your favorite TV shows at once +This python script will make a excel files, with information about every episode from every season of the TV show that you searched for + +## Requirement + +Python 3.6 onwards +```bash +pip3 install requests +pip3 install xlwt +pip3 install bs4 + +``` + +#Usage +Call python following with the simple algebra problem +```bash +$ python scraper.py +``` +Then simply enter the name of the show you want to search for, and then you will find a excel file in the same directory with the name of the show you searched for \ No newline at end of file diff --git a/imdb_episode_ratings/scraper.py b/imdb_episode_ratings/scraper.py new file mode 100644 index 0000000..ace4c9b --- /dev/null +++ b/imdb_episode_ratings/scraper.py @@ -0,0 +1,107 @@ +import requests +from bs4 import BeautifulSoup as BS +import xlwt +import time + + +def get_static_html ( search_url ) : + ## create the soup object for the page + try: + r_page = requests.get ( search_url ) + except: + print("Connection refused by the server..") + time.sleep(5) + soup_object = BS( r_page.content , 'html.parser' ) + #print ( soup_object.prettify() ) + return soup_object + +def get_url () : + ## convert to query url , and get raw HTML for the page + show_name = input ( " Enter show name ") + show_name = '+'.join ( show_name.split() ) + search_url = "https://www.imdb.com/find?ref_=nv_sr_fn&q="+ show_name + "&s=all" + return search_url, show_name + + +def get_new_url ( soup_object ) : + ## list of possible search results + list_queries = soup_object.find_all('td', class_ = "result_text") + + show_final = None + ## find the first TV show listing in the relevant searches + for show in list_queries : + if "(TV Series)" in show.text : + show_final = show + break + + if show_final == None : + print( " No relevant search ") + exit() + #print ( " Show found - " , show_final ) + + ## find the link to open the new page + hyperlink = show_final.find('a') + url_change = hyperlink['href'] + + show_url = "https://www.imdb.com/" + url_change + "episodes?season=" + return show_url + + +def start() : + + search_url , show_name = get_url() + soup_object = get_static_html(search_url) + show_url = get_new_url(soup_object) + result_file = xlwt.Workbook() + + season_number = 1 + + while True : + + soup_object = get_static_html( show_url + str(season_number) ) + + ## verify if extra season exists + verify_season = soup_object.find('h3' , attrs = {'id' :'episode_top'}) + curr_season = int ( verify_season.text[6:] ) + if not season_number == curr_season : + break + + print ("Season - ", season_number , " information extracted " ) + + ## excel file + result_sheet = result_file.add_sheet( verify_season.text , cell_overwrite_ok=True) + result_sheet.write( 0 , 0 , " Name " ) + result_sheet.write( 0 , 1 , " Rating " ) + result_sheet.write( 0 , 2 , " Total votes " ) + result_sheet.write( 0 , 3 , " Summary " ) + result_sheet.col(3).width = 21000 + result_sheet.col(0).width = 10000 + + episodes_season = soup_object.find_all('div' , class_ = 'info' ) + curr_episode = 1 + for episode in episodes_season : + ## get the name of the episode + name_episode = episode.find('strong') + ## get the rating of the episode + rating_episode = episode.find('span' , class_ = 'ipl-rating-star__rating' ) + ## total votes + votes_episode = episode.find('span' , class_ = 'ipl-rating-star__total-votes' ) + ## summary + summary_episode = episode.find('div' , class_ = 'item_description' ) + + ## write to the excel file + if name_episode : + result_sheet.write( curr_episode , 0 , name_episode.text ) + if rating_episode : + result_sheet.write( curr_episode , 1 , rating_episode.text ) + if votes_episode : + result_sheet.write( curr_episode , 2 , votes_episode.text[1:-1] ) + if summary_episode : + result_sheet.write( curr_episode , 3 , summary_episode.text ) + curr_episode = curr_episode + 1 + season_number = season_number + 1 + + print ( " Finished ") + result_file.save( show_name.replace('+' , '_') + '.xls') + +start() \ No newline at end of file