IMDB list querier project by Burak Bekci (#187)

Co-authored-by: Ayush Bhardwaj <classicayush@gmail.com>
This commit is contained in:
Burak Bekci 2020-10-14 14:46:08 +03:00 committed by GitHub
parent b9f4162f13
commit bd9a89afd0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 525 additions and 0 deletions

104
IMDBQuerier/.gitignore vendored Normal file
View File

@ -0,0 +1,104 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/

40
IMDBQuerier/ClassFilm.py Normal file
View File

@ -0,0 +1,40 @@
"""
Represents the film objects in the list.
"""
class Film(object):
    """A single entry of an IMDb list plus helpers that render it as HTML.

    Attributes:
        name: Title of the film as shown in the list.
        year: Release year.
        rating: Score multiplied by ten (``79`` means 7.9), or a text
            placeholder such as ``"unknown"`` when no score is available.
        genres: List of genre names.
        runtime: Runtime in minutes, or a text placeholder when unknown.
        storyline: Short plot description.
        type: Kind of entry, e.g. ``'film'`` or ``'tv-series'``.
        image_source: URL of the poster image.
        imdb_link: Link of the film, relative to ``https://www.imdb.com``.
    """

    def __init__(self, f_name, f_year, f_rating, f_genres,
                 f_runtime, f_storyline, f_type, f_img_source, f_link):
        self.name = f_name
        self.year = f_year
        self.rating = f_rating
        self.genres = f_genres
        self.runtime = f_runtime
        self.storyline = f_storyline
        self.type = f_type
        self.image_source = f_img_source
        self.imdb_link = f_link

    @staticmethod
    def _escape(value):
        """Return ``value`` as text that is safe inside HTML text and attributes."""
        from html import escape
        return escape(str(value), quote=True)

    def print_film(self):
        """Print the main attributes of the film to standard output."""
        print("Film, ", self.name)
        print("Year ", self.year)
        print('Rating', self.rating)
        print("Genres", self.genres)
        print('Runtime', self.runtime)
        print('Storyline', self.storyline)
        print('Type,', self.type)

    def get_genres_string(self):
        """Return the genres joined into one comma separated string."""
        return ', '.join(self.genres)

    def get_image_html(self):
        """Return the poster image wrapped in a link to the film's IMDb page."""
        # The values end up inside quoted attributes, so quotes in a title
        # (e.g. '"Crocodile" Dundee') must be escaped to keep the tag valid.
        return '<a href="https://www.imdb.com%s"> <img alt="%s" height="209" width="140" src="%s" > </a>' % (
            self._escape(self.imdb_link),
            self._escape(self.name),
            self._escape(self.image_source),
        )

    def get_title(self):
        """Return the title as a heading that links to the film's IMDb page."""
        return '<a href="https://www.imdb.com%s"><h4> %s </h4></a>' % (
            self._escape(self.imdb_link),
            self._escape(self.name),
        )

    def get_rating(self):
        """Return the rating on the 0-10 scale wrapped in a ``<span>``."""
        if isinstance(self.rating, (int, float)):
            shown = str(self.rating / 10)
        else:
            # A missing score is stored as text and cannot be divided.
            shown = self._escape(self.rating)
        return '<span class="rating"> %s </span>' % shown

43
IMDBQuerier/README.md Normal file
View File

@ -0,0 +1,43 @@
# IMDBQuerier
This project parses films from IMDB user lists and filters them by selected attributes. It uses Selenium and BeautifulSoup to obtain and parse the film data.
So far, the project can filter films by their:
* Runtime
* Score
* Year
* Genre
* Type (TV show or film)
Currently, one can make the exact queries on the refine section at the bottom of each user list. However, it is hard to apply your selections to all lists.
Checkout [original repo](https://github.com/Bekci/IMDBQuerier) for the latest version.
## Requirements
Selenium and BeautifulSoup modules are necessary for the project. Other than that, you will need a WebDriver. The project is using ChromeDriver but you can change it to the other supported browsers easily.
If you have changed the driver, make sure to change the below code accordingly.
```
# main.py line 16
driver = webdriver.Chrome()
```
[Here is a link for the Firefox driver.](https://github.com/mozilla/geckodriver/releases)
## Usage
First of all, change the values in the `parse_options` dictionary in the [parser_config.py](parser_config.py).
Then, change the value of `list_url` variable in the [main.py](main.py) code to the list wanted to be parsed.
Run the code; the output HTML will appear in the `list_htmls` folder.
## Common Driver Error
The used version of the browser driver can be out-dated. Always use the latest version in case of an error.
[Firefox Driver](https://github.com/mozilla/geckodriver/releases)
[Chrome Driver](https://chromedriver.chromium.org/)

Binary file not shown.

View File

@ -0,0 +1,15 @@
/* Film entries: no bullet, centred text, at most half the container width, light grey background. */
li {
list-style-type: none;
text-align: center;
max-width: 50%;
background-color: #EEEEEE;
}
/* Rating text is drawn in a gold tone. */
span.rating {
color: #D9AA00;
}
/* The list title is emphasised in bold. */
span.list_title{
font-weight: bold;
}

View File

@ -0,0 +1,113 @@
"""
Parse strings obtained from the html to get the film metadata.
Clean up the metadata and create a film object to be used later.
"""
from ClassFilm import Film
import re
def parse_film_year(year_text):
    """Return the first four digits found in the year text as a string.

    Everything that is not a digit (parentheses, dashes, ...) is dropped.
    '(2019)' -> '2019'
    """
    digits = [match.group() for match in re.finditer("[0-9]", year_text)]
    return ''.join(digits[:4])
def parse_imdb_score(score_text):
    """Convert the score text to an integer scaled by ten.

    Both the comma and the dot are accepted as decimal separator, because
    the separator depends on the language the page is served in.
    '7'   -> 70
    '7,9' -> 79
    '7.9' -> 79

    Raises:
        ValueError: if the text does not contain a number.
    """
    # Normalise the separator first so a single partition handles both forms.
    normalised = score_text.strip().replace('.', ',')
    tens_digit, _, units_digit = normalised.partition(',')
    # A score without a fractional part leaves units_digit empty.
    return int(tens_digit) * 10 + int(units_digit or 0)
def parse_runtime(runtime_text):
    """Return the part of the runtime text that precedes the first space.

    The value is returned as a string; the caller converts it to a number.
    "134 min" -> "134"
    """
    minutes, _, _ = runtime_text.partition(' ')
    return minutes
def obtain_all_genres(genres_text):
    """Split the comma separated genre text into a list of genre names.

    New line characters and spaces are removed from every name.
    """
    return [
        piece.replace('\n', '').replace(' ', '')
        for piece in genres_text.split(',')
    ]
def obtain_story_line(story_text):
    """Return the storyline text with all new line characters removed."""
    lines = story_text.split('\n')
    return ''.join(lines)
def determine_film_type(year_text):
    """Determine the film type from the raw year text.

    The year of a TV series is a range such as '(2011-2019)', so it
    contains a dash, while the year of a film does not.

    Returns:
        'tv-series' when the text contains a dash, otherwise 'film'.
    """
    # Test for real dash characters; an empty-string test would match every
    # text. The range may use a plain hyphen, an en dash or an em dash.
    range_markers = ('-', '\u2013', '\u2014')
    if any(marker in year_text for marker in range_markers):
        return 'tv-series'
    return 'film'
def obtain_image_source(img_html):
    """Return the source of the film image.

    Sometimes images cannot be loaded and the src will be a placeholder.
    For such cases the 'loadlate' attribute holds the real source, so it is
    preferred whenever it is present.
    """
    source_key = 'loadlate' if 'loadlate' in img_html.attrs else 'src'
    return img_html[source_key]
def obtain_film_object(content, image_raw):
    """Parse the html block of one list item and return a Film object.

    Args:
        content: Parsed html element holding the text content of the item
            (name, year, runtime, genre, score and storyline).
        image_raw: Parsed ``img`` element of the item.

    Returns:
        Film: the parsed film. Runtime and rating are the string "unknown"
        when the list item does not provide them.
    """
    # Runtime and score of a film might not be given in the list item.
    runtime = "unknown"
    point = "unknown"

    raw_name_with_link = content.find('a')
    raw_name = raw_name_with_link.text
    film_imdb_link = raw_name_with_link['href']

    raw_year = content.find("span", class_="lister-item-year text-muted unbold").text

    raw_runtime = content.find("span", class_="runtime")
    if raw_runtime is not None:
        runtime = int(parse_runtime(raw_runtime.text))

    # Guard against a missing genre span instead of failing on `.text`.
    raw_genre = content.find("span", class_="genre")
    genre_list = obtain_all_genres(raw_genre.text) if raw_genre is not None else []

    raw_point = content.find("span", class_="ipl-rating-star__rating")
    if raw_point is not None:
        point = parse_imdb_score(raw_point.text)

    raw_storyline = content.find("p", class_="").text

    year = parse_film_year(raw_year)
    storyline = obtain_story_line(raw_storyline)
    # The type must be derived from the raw text: the parsed year keeps only
    # digits, so the dash that marks a TV series is already gone from it.
    f_type = determine_film_type(raw_year)
    image_source = obtain_image_source(image_raw)

    return Film(raw_name, year, point, genre_list, runtime, storyline, f_type, image_source, film_imdb_link)

View File

@ -0,0 +1,68 @@
"""
Create a new html file from selected films.
Save the file under lists directory.
"""
import os
# Directory, relative to the current working directory, that receives the generated html files.
HTML_DIRS = 'list_htmls'
def crete_directory():
    """Create the output directory unless something already exists at its path."""
    if os.path.exists(HTML_DIRS):
        return
    os.mkdir(HTML_DIRS)
def start_html(list_name):
    """Return the opening part of the html page, up to the start of the list.

    Args:
        list_name: Name of the list; it is shown as the page heading.

    Returns:
        str: html text ending with the opening ``<ul>`` tag.
    """
    from html import escape

    # The name is plain text taken from a web page; escape it so characters
    # such as '&' or '<' cannot break the generated markup.
    safe_name = escape(str(list_name))
    return """
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<link rel="stylesheet" href="../css/list_style.css">
<meta charset="utf-8">
<title>Selected Films</title>
</head>
<body>
<span class="list_title"><h2> %s </h2></span>
<ul>
""" % (safe_name,)
def close_html():
    """Return the closing tags that finish the list, the body and the page."""
    closing_markup = """
</ul>
</body>
</html>
"""
    return closing_markup
def create_table_from_object(film_object):
    """Return the html list item that presents one film as a small table.

    Args:
        film_object: Film to render. Its ``get_image_html``, ``get_title``
            and ``get_rating`` methods must return ready-made html; the
            ``year``, ``runtime``, genre string and ``storyline`` are plain
            text.

    Returns:
        str: an ``<li>`` element surrounded by line breaks.
    """
    from html import escape

    def as_text(value):
        # Plain text values come from a web page and may contain '&' or '<'.
        return escape(str(value))

    return """
<br>
<li>
<table>
<tr>
<td rowspan="5">%s</td>
<td colspan="3"> %s</td>
</tr>
<tr>
<td colspan="3">Year: %s</td>
</tr>
<tr>
<td> %s mins</td>
<td>%s </td>
<td>IMDB Rating: %s </td>
</tr>
<tr>
<td colspan="3">%s</td>
</tr>
</table>
</li>
<br>
""" % (film_object.get_image_html(), film_object.get_title(), as_text(film_object.year),
       as_text(film_object.runtime), as_text(film_object.get_genres_string()),
       film_object.get_rating(), as_text(film_object.storyline))
def create_html_file(film_objects_list, list_name):
    """Write the selected films to ``<list_name>.html`` in the output directory.

    Args:
        film_objects_list: Films to include, in the order they are written.
        list_name: Name of the list; used as page heading and as file name.
    """
    # Generate html list
    film_html_str = "".join(
        create_table_from_object(film_object) for film_object in film_objects_list
    )

    crete_directory()

    output_path = os.path.join(HTML_DIRS, list_name + '.html')
    # The context manager closes the file, so the content is flushed to disk
    # even when writing fails half way.
    with open(output_path, "w", encoding='utf-8') as html_file:
        html_file.write(start_html(list_name) + film_html_str + close_html())

56
IMDBQuerier/main.py Normal file
View File

@ -0,0 +1,56 @@
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from film_content_parser import obtain_film_object
from parser_config import check_film_object, watched_included
from html_creator import create_html_file
def get_watched_films(file_path):
    """Read the names of the watched films, one name per line.

    Args:
        file_path: Path of the text file that lists the watched films.

    Returns:
        A list with the non-empty lines of the file, or None when the file
        does not exist.
    """
    try:
        # The context manager closes the file handle after reading.
        with open(file_path, 'r', encoding='utf-8') as watched_films_txt:
            watched_names = watched_films_txt.read().split('\n')
    except FileNotFoundError:
        # Without a list there is nothing to exclude.
        return None
    return [names for names in watched_names if names != '']
# The watched films are only needed when they must be left out of the result.
watched_films = None
if not watched_included():
    watched_films = get_watched_films('watched_films.txt')

# Time to wait for web page to be loaded.
TIME_FACTOR = 2

# Give the URL of the imdb list.
list_url = "https://www.imdb.com/list/ls025718406/?ref_=tt_rls_2"

print("Opening a webdriver")
driver = webdriver.Chrome()

try:
    driver.get(list_url)

    print("Waiting the website to be loaded")
    # Wait browser to load the page.
    time.sleep(TIME_FACTOR)

    content = driver.page_source.encode('utf-16').strip()
    soup = BeautifulSoup(content, 'lxml')

    # Obtain all films
    film_contents = soup.find_all("div", class_="lister-item mode-detail")

    wanted_films = []

    list_header = soup.find("h1", class_='header list-name').text

    print("Parsing and querying films")
    for all_content in film_contents:
        img_source = all_content.find('div', class_='lister-item-image ribbonize').find('img')
        content = all_content.find('div', class_='lister-item-content')
        current_film = obtain_film_object(content, img_source)
        if check_film_object(current_film, watched_films):
            wanted_films.append(current_film)

    create_html_file(wanted_films, list_header)
    print("New html created with the name ", list_header)
finally:
    # Always end the browser session, even when parsing fails. quit() is used
    # instead of close() because close() only closes the current window.
    driver.quit()

View File

@ -0,0 +1,82 @@
"""
Define and check rules for a film object.
"""
# Rules for the film object.
# The check_* functions below read these values to decide whether a film is kept.
parse_options = {
    # Wanted film type; 'both' accepts every type.
    'type': 'film',
    # Accepted runtime range; both limits are inclusive.
    'runtime_min': 80,
    'runtime_max': 140,
    # Whether films whose runtime is 'unknown' are kept.
    'inlude_unkown_runtime': False,
    # Accepted score range, given as text; the values are converted to float
    # and multiplied by ten before the comparison. Both limits are inclusive.
    'score_range_min': '6.9',
    'score_range_max': '10.0',
    # Whether films whose score is 'unknown' are kept.
    'include_unknown_score': False,
    # Accepted year range; both limits are inclusive.
    'year_range_oldest': 1990,
    'year_range_newest': 2019,
    # Genre names are compared in lower case.
    'wanted_genres': ['drama'],
    'unwanted_genres': ['romance', 'musical','horror', 'documentary'],
    # Whether add or remove a film
    # whose genre neither in wanted_genres nor unwanted_genres list
    'add_not_unwanted_&_not_wanted': True,
    # Value returned by watched_included().
    'include_watched': False
}
def check_runtime(film_runtime):
    """Tell whether the runtime lies in the configured range (inclusive).

    For a runtime of 'unknown' the 'inlude_unkown_runtime' option decides.
    """
    if film_runtime == 'unknown':
        return parse_options['inlude_unkown_runtime']

    shortest = parse_options['runtime_min']
    longest = parse_options['runtime_max']
    return shortest <= film_runtime <= longest
def check_genre(film_genre_list):
    """Tell whether the genres of a film satisfy the genre rules.

    A single unwanted genre rejects the film. Otherwise the film is accepted
    when no wanted genres are configured or when it has a wanted genre. A
    film with neither wanted nor unwanted genres follows the
    'add_not_unwanted_&_not_wanted' option.
    """
    if any(genre.lower() in parse_options['unwanted_genres'] for genre in film_genre_list):
        return False

    wanted = parse_options['wanted_genres']
    if wanted is None or len(wanted) == 0:
        return True

    if any(genre.lower() in parse_options['wanted_genres'] for genre in film_genre_list):
        return True

    return parse_options['add_not_unwanted_&_not_wanted']
def check_score(score_range):
    """Tell whether the score lies in the configured range (inclusive).

    The score is expected on the scale where the configured limits are
    multiplied by ten. For a score of 'unknown' the 'include_unknown_score'
    option decides.
    """
    if score_range == 'unknown':
        return parse_options['include_unknown_score']

    lowest = float(parse_options['score_range_min']) * 10
    highest = float(parse_options['score_range_max']) * 10
    return lowest <= score_range <= highest
def check_year(year_range):
    """Tell whether the year lies in the configured range (inclusive).

    The year is converted with int() before the comparison.
    """
    oldest = parse_options['year_range_oldest']
    newest = parse_options['year_range_newest']
    return oldest <= int(year_range) <= newest
def check_type(film_type):
    """Tell whether the film type matches the configured type.

    The configured value 'both' accepts every type.
    """
    wanted_type = parse_options['type']
    return wanted_type == 'both' or wanted_type == film_type
def watched_included():
    """Return the value of the 'include_watched' option."""
    include_watched = parse_options['include_watched']
    return include_watched
def check_film_object(film_object, watched_films=None):
    """Tell whether a film passes every configured rule.

    The rules are evaluated in a fixed order and evaluation stops at the
    first rule that fails. The year rule is applied to entries of type
    'film' only. When watched_films is given, a film whose name is in it is
    rejected.
    """
    passes_rules = (
        check_runtime(film_object.runtime)
        and check_genre(film_object.genres)
        and check_score(film_object.rating)
        and (film_object.type != 'film' or check_year(film_object.year))
        and check_type(film_object.type)
    )
    if not passes_rules:
        return False

    # All of the above rules applied for the object
    return watched_films is None or film_object.name not in watched_films

View File

@ -0,0 +1,3 @@
bs4
selenium
regex

View File

@ -160,6 +160,7 @@ So far, the following projects have been integrated to this repo:
|[Remove-Duplicate-Files](Remove-Duplicate-Files)|[Aayushi Varma](https://github.com/aayuv17)
|[PDF2text](PDF2text)|[QuangPH](https://github.com/quangph-1686a)
|[Image Watermarker (batch)](imageWatermarker)|[Remco Halman](https://github.com/remcohalman)
|[IMDBQuerier](IMDBQuerier)|[Burak Bekci](https://github.com/Bekci)
|[URL shortener](url_shortener)|[Sam Ebison](https://github.com/ebsa491)