diff --git a/IMDBQuerier/.gitignore b/IMDBQuerier/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/IMDBQuerier/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/IMDBQuerier/ClassFilm.py b/IMDBQuerier/ClassFilm.py new file mode 100644 index 0000000..9bd9e13 --- /dev/null +++ b/IMDBQuerier/ClassFilm.py @@ -0,0 +1,40 @@ +""" +Represents the film objects in the list. 
import html


class Film(object):
    """One title scraped from an IMDB list, with helpers that render it as HTML.

    Attributes:
        name: Title of the film or series.
        year: Release year as parsed from the list item.
        rating: IMDB score multiplied by ten (79 stands for 7.9); the parser
            stores the string 'unknown' when the list item has no score.
        genres: List of genre names.
        runtime: Runtime in minutes, or 'unknown' when it is not listed.
        storyline: Short plot description.
        type: 'film' or 'tv-series'.
        image_source: URL of the poster image.
        imdb_link: URL of the title's IMDB page.
    """

    def __init__(self, f_name, f_year, f_rating, f_genres,
                 f_runtime, f_storyline, f_type, f_img_source, f_link):
        self.name = f_name
        self.year = f_year
        self.rating = f_rating
        self.genres = f_genres
        self.runtime = f_runtime
        self.storyline = f_storyline
        self.type = f_type
        self.image_source = f_img_source
        self.imdb_link = f_link

    def print_film(self):
        """Print the main attributes to stdout (debugging aid)."""
        print("Film, ", self.name)
        print("Year ", self.year)
        print('Rating', self.rating)
        print("Genres", self.genres)
        print('Runtime', self.runtime)
        print('Storyline', self.storyline)
        print('Type,', self.type)

    def get_genres_string(self):
        """Return the genres joined into one comma-separated string."""
        return ', '.join(self.genres)

    def get_image_html(self):
        """Return the poster image wrapped in a link to the IMDB page.

        Every placeholder is filled with escaped text so that titles or URLs
        containing '&', '<' or quotes cannot break the generated page.
        """
        # NOTE(review): the markup around the three values is a
        # reconstruction (link target, alt text, image source) -- confirm
        # against the intended page layout.
        return '<a href="%s"><img alt="%s" src="%s"></a>' % (
            html.escape(str(self.imdb_link)),
            html.escape(str(self.name)),
            html.escape(str(self.image_source)),
        )

    def get_title(self):
        """Return the title as a link to the IMDB page."""
        # NOTE(review): the 'list_title' class matches the selector in
        # css/list_style.css; the exact original markup should be confirmed.
        return '<a href="%s"><span class="list_title">%s</span></a>' % (
            html.escape(str(self.imdb_link)),
            html.escape(str(self.name)),
        )

    def get_rating(self):
        """Return the rating on the 0-10 scale wrapped in a 'rating' span.

        A non-numeric rating (the 'unknown' placeholder) is shown as-is
        instead of being divided, which would raise TypeError.
        """
        if isinstance(self.rating, (int, float)):
            shown = str(self.rating / 10)
        else:
            shown = str(self.rating)
        return '<span class="rating">%s</span>' % html.escape(shown)
"""
Parse strings obtained from the html to get the film metadata.
Fix the metadata and create a film object to be used later.
"""

import re
from urllib.parse import urljoin

# Host used to absolutize scraped title links.
# NOTE(review): links on list pages are presumably site-relative
# ('/title/...'); urljoin leaves already-absolute URLs untouched.
IMDB_BASE_URL = "https://www.imdb.com"


def parse_film_year(year_text):
    """Return the first four digits found in the year label.

    '(2019)' -> '2019', '(2011–2019)' -> '2011'. Returns '' when the label
    contains no digits.
    """
    found_numbers = re.findall(r"[0-9]", year_text)
    return ''.join(found_numbers[0:4])


def parse_imdb_score(score_text):
    """Return the score scaled by ten as an int.

    '7' -> 70, '7,9' -> 79, '7.9' -> 79. Both ',' and '.' are accepted as
    the decimal separator; only the first decimal digit is used.

    Raises:
        ValueError: if the text does not start with a number.
    """
    whole, _, fraction = score_text.strip().replace('.', ',').partition(',')
    return int(whole) * 10 + int(fraction[:1] or 0)


def parse_runtime(runtime_text):
    """Return the leading token of the runtime label: '134 min' -> '134'.

    The value is returned as text (callers convert it); an empty or blank
    label yields ''.
    """
    tokens = runtime_text.split()
    return tokens[0] if tokens else ''


def obtain_all_genres(genres_text):
    """Split a comma-separated genre label into a list of clean names.

    Spaces and newline characters are removed from every name and empty
    entries are dropped.
    """
    cleaned = (genre.replace('\n', '').replace(' ', '')
               for genre in genres_text.split(','))
    return [genre for genre in cleaned if genre]


def obtain_story_line(story_text):
    """Return the storyline without newline characters or outer whitespace."""
    return story_text.replace('\n', '').strip()


def determine_film_type(year_text):
    """Classify a title from its raw year label.

    A label holding a range ('2011–2019', written with an en dash or a
    plain hyphen) belongs to a series; anything else is a film.
    """
    if '–' in year_text or '-' in year_text:
        return 'tv-series'
    return 'film'


def obtain_image_source(img_html):
    """Return the real image URL of an <img> tag.

    When the image was not loaded its src is a placeholder and the real
    source is kept in the 'loadlate' attribute, which therefore wins.
    """
    if 'loadlate' in img_html.attrs:
        return img_html['loadlate']
    return img_html['src']


def _tag_text(tag):
    """Return the text of a parsed tag, or '' when the tag was not found."""
    return tag.text if tag is not None else ''


def obtain_film_object(content, image_raw):
    """Build a Film from the parsed html block of one list item.

    Args:
        content: Parsed element holding the textual part of the list item;
            it must support find(name, class_=...).
        image_raw: Parsed <img> element of the list item.

    Returns:
        A Film. Runtime and rating are the string 'unknown' when the list
        item does not provide them or they cannot be parsed.
    """
    # Imported here rather than at module level so the pure text helpers
    # above stay importable without the rest of the project.
    from ClassFilm import Film

    # Runtime and score of a film might not be given in the list item.
    runtime = "unknown"
    point = "unknown"

    raw_name_with_link = content.find('a')
    raw_name = raw_name_with_link.text
    film_imdb_link = urljoin(IMDB_BASE_URL, raw_name_with_link['href'])
    raw_year = _tag_text(
        content.find("span", class_="lister-item-year text-muted unbold"))

    raw_runtime = content.find("span", class_="runtime")
    if raw_runtime is not None:
        try:
            runtime = int(parse_runtime(raw_runtime.text))
        except ValueError:
            pass  # unparsable label: keep 'unknown'

    raw_point = content.find("span", class_="ipl-rating-star__rating")
    if raw_point is not None:
        try:
            point = int(parse_imdb_score(raw_point.text))
        except ValueError:
            pass  # unparsable label: keep 'unknown'

    raw_genre = _tag_text(content.find("span", class_="genre"))
    raw_storyline = _tag_text(content.find("p", class_=""))

    year = parse_film_year(raw_year)
    genre_list = obtain_all_genres(raw_genre)
    storyline = obtain_story_line(raw_storyline)
    # The type must be derived from the raw label: the parsed year holds
    # digits only, so the range dash would never be found in it.
    f_type = determine_film_type(raw_year)
    image_source = obtain_image_source(image_raw)

    return Film(raw_name, year, point, genre_list, runtime, storyline,
                f_type, image_source, film_imdb_link)

"""
Create a new html file from selected films.
Save the file under the lists directory.
"""
import html
import os
import re

HTML_DIRS = 'list_htmls'


def crete_directory():
    """Create the output directory when it does not exist yet."""
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(HTML_DIRS, exist_ok=True)


def start_html(list_name):
    """Return the opening part of the page, up to the start of the film list.

    Args:
        list_name: Name of the list, shown as the page heading.
    """
    # NOTE(review): the markup is a reconstruction; the stylesheet path
    # assumes the page is opened from HTML_DIRS next to the css/ folder --
    # confirm.
    return """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="../css/list_style.css">
<title>Selected Films</title>
</head>
<body>
<h1>%s</h1>
<ul>
""" % html.escape(str(list_name).strip())


def close_html():
    """Return the closing part of the page matching start_html()."""
    return """</ul>
</body>
</html>
"""


def create_table_from_object(film_object):
    """Return the html list item describing one film.

    The image, title and rating fragments come from the film object's own
    get_*() helpers and are inserted as html; the remaining values are
    plain text and are escaped here.
    """
    # NOTE(review): table layout reconstructed around the seven values --
    # confirm against the intended design.
    return """<li>
<table>
<tr><td rowspan="4">%s</td><td>%s</td></tr>
<tr><td>Year: %s</td></tr>
<tr><td>%s mins</td><td>%s</td><td>IMDB Rating: %s</td></tr>
<tr><td>%s</td></tr>
</table>
</li>
<br>
""" % (film_object.get_image_html(), film_object.get_title(),
       html.escape(str(film_object.year)),
       html.escape(str(film_object.runtime)),
       html.escape(film_object.get_genres_string()),
       film_object.get_rating(),
       html.escape(str(film_object.storyline)))


def _safe_file_name(list_name):
    """Turn a list name into a name that is valid as a file name.

    Path separators, control characters and characters reserved on common
    file systems are replaced by '_'; an empty result falls back to
    'imdb_list'.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', str(list_name).strip())
    return cleaned.rstrip('. ') or 'imdb_list'


def create_html_file(film_objects_list, list_name):
    """Write the selected films into <HTML_DIRS>/<list_name>.html.

    Args:
        film_objects_list: Films to put on the page, in display order.
        list_name: Name of the list; used as heading and as file name.

    Returns:
        Path of the written file.
    """
    film_html_str = "".join(
        create_table_from_object(film_object)
        for film_object in film_objects_list)

    crete_directory()

    file_path = os.path.join(HTML_DIRS, _safe_file_name(list_name) + '.html')
    # The context manager guarantees the page is flushed and closed.
    with open(file_path, "w", encoding='utf-8') as html_file:
        html_file.write(start_html(list_name) + film_html_str + close_html())
    return file_path
"""
Open an IMDB list in a browser, filter its films with the configured rules
and write the selected ones to an html page.
"""
import time

# Time to wait for web page to be loaded.
TIME_FACTOR = 2

# Give the URL of the imdb list.
list_url = "https://www.imdb.com/list/ls025718406/?ref_=tt_rls_2"

# File holding the names of already watched films, one per line.
WATCHED_FILMS_PATH = 'watched_films.txt'


def get_watched_films(file_path):
    """Return the non-empty lines of the watched-films file.

    Args:
        file_path: Path of a text file with one film name per line.

    Returns:
        List of names with surrounding whitespace removed, or None when the
        file does not exist.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as watched_films_txt:
            lines = watched_films_txt.read().split('\n')
    except FileNotFoundError:
        return None
    return [name.strip() for name in lines if name.strip() != '']


def main():
    """Scrape the list at list_url and create the html page of wanted films."""
    # Third-party and project modules are imported here so that the helper
    # above can be imported without a browser stack installed.
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from film_content_parser import obtain_film_object
    from parser_config import check_film_object, watched_included
    from html_creator import create_html_file

    watched_films = None
    if not watched_included():
        watched_films = get_watched_films(WATCHED_FILMS_PATH)

    print("Opening a webdriver")
    driver = webdriver.Chrome()
    try:
        driver.get(list_url)

        print("Waiting the website to be loaded")
        # Wait browser to load the page.
        time.sleep(TIME_FACTOR)

        page_source = driver.page_source
    finally:
        # quit() ends the whole driver session, also when loading failed.
        driver.quit()

    soup = BeautifulSoup(page_source, 'lxml')

    # Obtain all films
    film_contents = soup.find_all("div", class_="lister-item mode-detail")

    header_tag = soup.find("h1", class_='header list-name')
    if header_tag is None:
        raise SystemExit("Could not find the list title; the page may not "
                         "have loaded. Try a larger TIME_FACTOR.")
    list_header = header_tag.text.strip()

    wanted_films = []

    print("Parsing and querying films")
    for all_content in film_contents:
        img_source = all_content.find(
            'div', class_='lister-item-image ribbonize').find('img')
        content = all_content.find('div', class_='lister-item-content')
        current_film = obtain_film_object(content, img_source)
        if check_film_object(current_film, watched_films):
            wanted_films.append(current_film)

    create_html_file(wanted_films, list_header)
    print("New html created with the name ", list_header)


if __name__ == "__main__":
    main()
"""
Define and check rules for a film object.
"""

# Rules for the film object
parse_options = {
    'type': 'film',
    'runtime_min': 80,
    'runtime_max': 140,
    'inlude_unkown_runtime': False,
    'score_range_min': '6.9',
    'score_range_max': '10.0',
    'include_unknown_score': False,
    'year_range_oldest': 1990,
    'year_range_newest': 2019,
    'wanted_genres': ['drama'],
    'unwanted_genres': ['romance', 'musical', 'horror', 'documentary'],
    # Whether add or remove a film
    # whose genre neither in wanted_genres nor unwanted_genres list
    'add_not_unwanted_&_not_wanted': True,
    'include_watched': False
}


def check_runtime(film_runtime):
    """Return True when the runtime satisfies the runtime rule.

    Args:
        film_runtime: Runtime in minutes, or the string 'unknown'.
    """
    if film_runtime == 'unknown':
        # The historic, misspelled key is still honoured; a correctly
        # spelled 'include_unknown_runtime' takes precedence when present.
        return parse_options.get(
            'include_unknown_runtime',
            parse_options.get('inlude_unkown_runtime', False))
    return (parse_options['runtime_min'] <= film_runtime
            <= parse_options['runtime_max'])


def check_genre(film_genre_list):
    """Return True when the genres satisfy the genre rules.

    A film with any unwanted genre is rejected. Otherwise it is accepted
    when no wanted genres are configured or when it has one of them; a film
    with neither wanted nor unwanted genres follows the
    'add_not_unwanted_&_not_wanted' option.
    """
    genres = {genre.lower() for genre in film_genre_list}
    if genres.intersection(parse_options['unwanted_genres'] or ()):
        return False
    wanted = parse_options['wanted_genres']
    if not wanted:
        return True
    if genres.intersection(wanted):
        return True
    return parse_options['add_not_unwanted_&_not_wanted']


def check_score(score_range):
    """Return True when the score satisfies the score rule.

    Args:
        score_range: Score multiplied by ten (79 for 7.9), or 'unknown'.
    """
    if score_range == 'unknown':
        return parse_options['include_unknown_score']
    # Configured bounds are on the 0-10 scale; film scores are scaled by ten.
    min_score = float(parse_options['score_range_min']) * 10
    max_score = float(parse_options['score_range_max']) * 10
    return min_score <= score_range <= max_score


def check_year(year_range):
    """Return True when the year lies inside the configured range.

    A year that cannot be read as a number (for example an empty string)
    does not match instead of raising.
    """
    try:
        year = int(year_range)
    except (TypeError, ValueError):
        return False
    return (parse_options['year_range_oldest'] <= year
            <= parse_options['year_range_newest'])


def check_type(film_type):
    """Return True when the type is wanted ('both' accepts every type)."""
    return parse_options['type'] in ('both', film_type)


def watched_included():
    """Return whether already watched films should stay in the result."""
    return parse_options['include_watched']


def check_film_object(film_object, watched_films=None):
    """Return True when the film passes every configured rule.

    Args:
        film_object: Object with runtime, genres, rating, type, year and
            name attributes.
        watched_films: Optional collection of film names to exclude.
    """
    if not check_runtime(film_object.runtime):
        return False
    if not check_genre(film_object.genres):
        return False
    if not check_score(film_object.rating):
        return False
    # The year rule is applied to films only.
    if film_object.type == 'film' and not check_year(film_object.year):
        return False
    if not check_type(film_object.type):
        return False
    if watched_films is not None and film_object.name in watched_films:
        return False
    # All of the above rules applied for the object
    return True