mirror of https://github.com/hastagAB/Awesome-Python-Scripts.git
synced 2024-11-23 20:11:07 +00:00
IMDB list querier project by Burak Bekci (#187)
Co-authored-by: Ayush Bhardwaj <classicayush@gmail.com>
This commit is contained in:
parent b9f4162f13
commit bd9a89afd0
104 IMDBQuerier/.gitignore vendored Normal file
@@ -0,0 +1,104 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
40 IMDBQuerier/ClassFilm.py Normal file
@@ -0,0 +1,40 @@
"""
|
||||
Represents the film objects in the list.
|
||||
"""
|
||||
|
||||
class Film(object):
|
||||
def __init__(self, f_name, f_year, f_rating, f_genres,
|
||||
f_runtime, f_storyline, f_type, f_img_source, f_link):
|
||||
self.name = f_name
|
||||
self.year = f_year
|
||||
self.rating = f_rating
|
||||
self.genres = f_genres
|
||||
self.runtime = f_runtime
|
||||
self.storyline = f_storyline
|
||||
self.type = f_type
|
||||
self.image_source = f_img_source
|
||||
self.imdb_link = f_link
|
||||
|
||||
|
||||
def print_film(self):
|
||||
print("Film, ", self.name)
|
||||
print("Year ", self.year)
|
||||
print('Rating', self.rating)
|
||||
print("Genres", self.genres)
|
||||
print('Runtime', self.runtime)
|
||||
print('Storyline', self.storyline)
|
||||
print('Type,', self.type)
|
||||
|
||||
def get_genres_string(self):
|
||||
sep = ', '
|
||||
return sep.join(self.genres)
|
||||
|
||||
def get_image_html(self):
|
||||
return '<a href="https://www.imdb.com%s"> <img alt="%s" height="209" width="140" src="%s" > </a>' % (self.imdb_link, self.name, self.image_source)
|
||||
|
||||
def get_title(self):
|
||||
return '<a href="https://www.imdb.com%s"><h4> %s </h4></a>' % (self.imdb_link, self.name)
|
||||
|
||||
|
||||
def get_rating(self):
|
||||
return '<span class="rating"> %s </span>' % str((self.rating / 10))
|
43 IMDBQuerier/README.md Normal file
@@ -0,0 +1,43 @@
# IMDBQuerier

This project filters films from IMDB user lists based on a set of attributes. It uses Selenium and BeautifulSoup to obtain and parse the film data.

So far, the project can filter films by their:

* Runtime
* Score
* Year
* Genre
* Type (TV show or film)

You can already build such queries in the refine section at the bottom of each user list, but it is hard to apply the same selections to every list.

Check out the [original repo](https://github.com/Bekci/IMDBQuerier) for the latest version.

## Requirements

The Selenium and BeautifulSoup modules are required. You will also need a WebDriver; the project uses ChromeDriver, but you can easily switch to another supported browser.

If you change the driver, make sure to change the code below accordingly.

```
# main.py line 16
driver = webdriver.Chrome()
```

[Here is a link for the Firefox driver.](https://github.com/mozilla/geckodriver/releases)
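
For example, swapping in the Firefox driver would look roughly like this (a sketch; it assumes geckodriver is installed and available on your PATH):

```
# main.py line 16 — Firefox instead of Chrome (hypothetical variant)
driver = webdriver.Firefox()
```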

## Usage

First, change the values in the `parse_options` dictionary in [parser_config.py](parser_config.py).

Then, set the `list_url` variable in [main.py](main.py) to the URL of the list you want to parse.

Run the code; the output HTML will appear in the `list_htmls` folder.
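
For example, to keep only feature films released between 1990 and 2019, the relevant entries could look like this (an illustrative excerpt; the keys and values shown are the ones already defined in `parse_options`, and the URL is the sample list shipped in `main.py`):

```
# parser_config.py — example values inside parse_options (excerpt)
parse_options = {
    'type': 'film',                 # keep films only; 'both' also keeps TV shows
    'year_range_oldest': 1990,
    'year_range_newest': 2019,
    'wanted_genres': ['drama'],
    # ... keep the remaining keys from the shipped config unchanged
}

# main.py — the list to parse
list_url = "https://www.imdb.com/list/ls025718406/?ref_=tt_rls_2"
```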

## Common Driver Error

The installed version of the browser driver can be outdated. Always use the latest version if you hit an error.

[Firefox Driver](https://github.com/mozilla/geckodriver/releases)

[Chrome Driver](https://chromedriver.chromium.org/)
BIN IMDBQuerier/chromedriver.exe Normal file
Binary file not shown.
15 IMDBQuerier/css/list_style.css Normal file
@@ -0,0 +1,15 @@
li {
    list-style-type: none;
    text-align: center;
    max-width: 50%;
    background-color: #EEEEEE;
}

span.rating {
    color: #D9AA00;
}

span.list_title {
    font-weight: bold;
}
113 IMDBQuerier/film_content_parser.py Normal file
@@ -0,0 +1,113 @@
"""
|
||||
Parse strings obtained from the html to get the film metadata.
|
||||
Fix metadata and create an film object to be use it later.
|
||||
"""
|
||||
|
||||
from ClassFilm import Film
|
||||
import re
|
||||
|
||||
"""
|
||||
Eliminate parenthesis from the text.
|
||||
'(2019)' -> '2019'
|
||||
"""
|
||||
def parse_film_year(year_text):
|
||||
found_numbers = re.findall("[0-9]", year_text)
|
||||
return ''.join(found_numbers[0:4])
|
||||
|
||||
|
||||
"""
|
||||
Obtain decimal value of the score from its text.
|
||||
'7' -> 70
|
||||
'7,9'-> 79
|
||||
"""
|
||||
def parse_imdb_score(score_text):
|
||||
units_digit = 0
|
||||
if ',' in score_text:
|
||||
tens_digit, units_digit = score_text.split(',')
|
||||
else:
|
||||
tens_digit = score_text.split(',')[0]
|
||||
return int(tens_digit) * 10 + int(units_digit)
|
||||
|
||||
|
||||
"""
|
||||
Parse runtime in minutes from runtime text.
|
||||
"134 min" -> 134
|
||||
"""
|
||||
def parse_runtime(runtime_text):
|
||||
return runtime_text.split(' ')[0]
|
||||
|
||||
|
||||
"""
|
||||
From the string of genres, obtain the genres list.
|
||||
Remove extra spaces and new line characters.
|
||||
Return genres in a list.
|
||||
"""
|
||||
def obtain_all_genres(genres_text):
|
||||
obtained_genres = []
|
||||
for genre in genres_text.split(','):
|
||||
obtained_genres.append(genre.replace('\n', '').replace(' ', ''))
|
||||
return obtained_genres
|
||||
|
||||
|
||||
"""
|
||||
Storyline obtained as text from the html yet some characters must be deleted
|
||||
from it.
|
||||
"""
|
||||
def obtain_story_line(story_text):
|
||||
return story_text.replace('\n', '')
|
||||
|
||||
"""
|
||||
Determine the film type from the year text.
|
||||
A TV-series will include '-' but a film will not include.
|
||||
"""
|
||||
def determine_film_type(year_text):
|
||||
if '–' in year_text:
|
||||
return 'tv-series'
|
||||
return 'film'
|
||||
|
||||
"""
|
||||
Sometimes images cannot be loaded and its src will be a placeholder.
|
||||
For such cases, loadlate tag will be the real source.
|
||||
"""
|
||||
def obtain_image_source(img_html):
|
||||
if 'loadlate' in img_html.attrs:
|
||||
return img_html['loadlate']
|
||||
else:
|
||||
return img_html['src']
|
||||
|
||||
|
||||
"""
|
||||
Take a html block representing the film item
|
||||
Apply parsing and return film object
|
||||
"""
|
||||
def obtain_film_object(content, image_raw):
|
||||
# Runtime and score of a film might not given in the list item.
|
||||
runtime = "unknown"
|
||||
point = "unknown"
|
||||
|
||||
raw_name_with_link = content.find('a')
|
||||
raw_name = raw_name_with_link.text
|
||||
film_imdb_link = raw_name_with_link['href']
|
||||
raw_year = content.find("span", class_="lister-item-year text-muted unbold").text
|
||||
raw_runtime = content.find("span", class_="runtime")
|
||||
|
||||
if raw_runtime is not None:
|
||||
raw_runtime = raw_runtime.text
|
||||
runtime = int(parse_runtime(raw_runtime))
|
||||
|
||||
raw_genre = content.find("span", class_="genre").text
|
||||
raw_point = content.find("span", class_="ipl-rating-star__rating")
|
||||
|
||||
if raw_point is not None:
|
||||
raw_point = raw_point.text
|
||||
point = int(parse_imdb_score(raw_point))
|
||||
|
||||
raw_storyline = content.find("p", class_="").text
|
||||
|
||||
year = parse_film_year(raw_year)
|
||||
genre_list = obtain_all_genres(raw_genre)
|
||||
storyline = obtain_story_line(raw_storyline)
|
||||
f_type = determine_film_type(year)
|
||||
image_source = obtain_image_source(image_raw)
|
||||
|
||||
return Film(raw_name, year, point, genre_list, runtime, storyline, f_type, image_source, film_imdb_link)
|
68 IMDBQuerier/html_creator.py Normal file
@@ -0,0 +1,68 @@
"""
|
||||
Create a new html file from selected films.
|
||||
Save the file under lists directory.
|
||||
"""
|
||||
import os
|
||||
HTML_DIRS = 'list_htmls'
|
||||
|
||||
def crete_directory():
|
||||
if not os.path.exists(HTML_DIRS):
|
||||
os.mkdir(HTML_DIRS)
|
||||
|
||||
def start_html(list_name):
|
||||
return """
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" dir="ltr">
|
||||
<head>
|
||||
<link rel="stylesheet" href="../css/list_style.css">
|
||||
<meta charset="utf-8">
|
||||
<title>Selected Films</title>
|
||||
</head>
|
||||
<body>
|
||||
<span class="list_title"><h2> %s </h2></span>
|
||||
<ul>
|
||||
""" % list_name
|
||||
|
||||
def close_html():
|
||||
return """
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
def create_table_from_object(film_object):
|
||||
return """
|
||||
<br>
|
||||
<li>
|
||||
<table>
|
||||
<tr>
|
||||
<td rowspan="5">%s</td>
|
||||
<td colspan="3"> %s</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3">Year: %s</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td> %s mins</td>
|
||||
<td>%s </td>
|
||||
<td>IMDB Rating: %s </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3">%s</td>
|
||||
</tr>
|
||||
</table>
|
||||
</li>
|
||||
<br>
|
||||
""" % (film_object.get_image_html(), film_object.get_title(), film_object.year, film_object.runtime,
|
||||
film_object.get_genres_string(), film_object.get_rating(), film_object.storyline)
|
||||
|
||||
def create_html_file(film_objects_list, list_name):
|
||||
film_html_str = ""
|
||||
# Generate html list
|
||||
for film_object in film_objects_list:
|
||||
film_html_str += create_table_from_object(film_object)
|
||||
|
||||
crete_directory()
|
||||
|
||||
html_file = open(os.path.join(HTML_DIRS, list_name + '.html'), "w", encoding='utf-8')
|
||||
html_file.write(start_html(list_name) + film_html_str + close_html() )
|
56 IMDBQuerier/main.py Normal file
@@ -0,0 +1,56 @@
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from film_content_parser import obtain_film_object
from parser_config import check_film_object, watched_included
from html_creator import create_html_file


def get_watched_films(file_path):
    watched_films_txt = open(file_path, 'r')
    if watched_films_txt:
        watched_names = watched_films_txt.read().split('\n')
        return [names for names in watched_names if names != '']
    return None


watched_films = None
if not watched_included():
    watched_films = get_watched_films('watched_films.txt')

# Time to wait for the web page to load.
TIME_FACTOR = 2

# URL of the imdb list to parse.
list_url = "https://www.imdb.com/list/ls025718406/?ref_=tt_rls_2"

print("Opening a webdriver")
driver = webdriver.Chrome()

driver.get(list_url)

print("Waiting for the website to load")
# Wait for the browser to load the page.
time.sleep(TIME_FACTOR)

content = driver.page_source.encode('utf-16').strip()
soup = BeautifulSoup(content, 'lxml')

# Obtain all film items in the list
film_contents = soup.find_all("div", class_="lister-item mode-detail")

wanted_films = []

list_header = soup.find("h1", class_='header list-name').text

print("Parsing and querying films")
for all_content in film_contents:
    img_source = all_content.find('div', class_='lister-item-image ribbonize').find('img')
    content = all_content.find('div', class_='lister-item-content')
    current_film = obtain_film_object(content, img_source)
    if check_film_object(current_film, watched_films):
        wanted_films.append(current_film)

create_html_file(wanted_films, list_header)
print("New html created with the name", list_header)

driver.close()
82 IMDBQuerier/parser_config.py Normal file
@@ -0,0 +1,82 @@
"""
|
||||
Define and check rules for a film object.
|
||||
"""
|
||||
|
||||
# Rules for the film object
|
||||
parse_options = {
|
||||
'type': 'film',
|
||||
'runtime_min': 80,
|
||||
'runtime_max': 140,
|
||||
'inlude_unkown_runtime': False,
|
||||
'score_range_min': '6.9',
|
||||
'score_range_max': '10.0',
|
||||
'include_unknown_score': False,
|
||||
'year_range_oldest': 1990,
|
||||
'year_range_newest': 2019,
|
||||
'wanted_genres': ['drama'],
|
||||
'unwanted_genres': ['romance', 'musical','horror', 'documentary'],
|
||||
# Whether add or remove a film
|
||||
# whose genre neither in wanted_genres nor unwanted_genres list
|
||||
'add_not_unwanted_&_not_wanted': True,
|
||||
'include_watched': False
|
||||
}
|
||||
|
||||
def check_runtime(film_runtime):
|
||||
if film_runtime == 'unknown':
|
||||
return parse_options['inlude_unkown_runtime']
|
||||
min_runtime = parse_options['runtime_min']
|
||||
max_runtime = parse_options['runtime_max']
|
||||
|
||||
return film_runtime >= min_runtime and film_runtime <= max_runtime
|
||||
|
||||
def check_genre(film_genre_list):
|
||||
for genre in film_genre_list:
|
||||
if genre.lower() in parse_options['unwanted_genres']:
|
||||
return False
|
||||
if parse_options['wanted_genres'] is None or len(parse_options['wanted_genres']) == 0:
|
||||
return True
|
||||
for genre in film_genre_list:
|
||||
if genre.lower() in parse_options['wanted_genres']:
|
||||
return True
|
||||
return parse_options['add_not_unwanted_&_not_wanted']
|
||||
|
||||
|
||||
def check_score(score_range):
|
||||
if score_range == 'unknown':
|
||||
return parse_options['include_unknown_score']
|
||||
min_score = float(parse_options['score_range_min']) * 10
|
||||
max_score = float(parse_options['score_range_max']) * 10
|
||||
return score_range >= min_score and score_range <= max_score
|
||||
|
||||
|
||||
def check_year(year_range):
|
||||
min_year = parse_options['year_range_oldest']
|
||||
max_year = parse_options['year_range_newest']
|
||||
return int(year_range) >= min_year and int(year_range) <= max_year
|
||||
|
||||
|
||||
def check_type(film_type):
|
||||
if parse_options['type'] == 'both':
|
||||
return True
|
||||
elif parse_options['type'] == film_type:
|
||||
return True
|
||||
return False
|
||||
|
||||
def watched_included():
|
||||
return parse_options['include_watched']
|
||||
|
||||
def check_film_object(film_object, watched_films=None):
|
||||
if not check_runtime(film_object.runtime):
|
||||
return False
|
||||
if not check_genre(film_object.genres):
|
||||
return False
|
||||
if not check_score(film_object.rating):
|
||||
return False
|
||||
if film_object.type == 'film' and not check_year(film_object.year):
|
||||
return False
|
||||
if not check_type(film_object.type):
|
||||
return False
|
||||
if watched_films is not None and film_object.name in watched_films:
|
||||
return False
|
||||
# All of the above rules applied for the object
|
||||
return True
|
3 IMDBQuerier/requirements.txt Normal file
@@ -0,0 +1,3 @@
bs4
selenium
regex
@@ -160,6 +160,7 @@ So far, the following projects have been integrated to this repo:
|[Remove-Duplicate-Files](Remove-Duplicate-Files)|[Aayushi Varma](https://github.com/aayuv17)
|[PDF2text](PDF2text)|[QuangPH](https://github.com/quangph-1686a)
|[Image Watermarker (batch)](imageWatermarker)|[Remco Halman](https://github.com/remcohalman)
|[IMDBQuerier](IMDBQuerier)|[Burak Bekci](https://github.com/Bekci)
|[URL shortener](url_shortener)|[Sam Ebison](https://github.com/ebsa491)