mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-27 15:01:08 +00:00
756bb268eb
* [pre-commit.ci] pre-commit autoupdate updates: - [github.com/psf/black: 22.6.0 → 22.8.0](https://github.com/psf/black/compare/22.6.0...22.8.0) - [github.com/asottile/pyupgrade: v2.37.0 → v2.38.2](https://github.com/asottile/pyupgrade/compare/v2.37.0...v2.38.2) - https://gitlab.com/pycqa/flake8 → https://github.com/PyCQA/flake8 - [github.com/PyCQA/flake8: 3.9.2 → 5.0.4](https://github.com/PyCQA/flake8/compare/3.9.2...5.0.4) - [github.com/pre-commit/mirrors-mypy: v0.961 → v0.981](https://github.com/pre-commit/mirrors-mypy/compare/v0.961...v0.981) - [github.com/codespell-project/codespell: v2.1.0 → v2.2.1](https://github.com/codespell-project/codespell/compare/v2.1.0...v2.2.1) * Fix a long line * Update sol1.py * Update sol1.py * lambda_ * Update multi_level_feedback_queue.py * Update double_ended_queue.py * Update sequential_minimum_optimization.py * Update .pre-commit-config.yaml Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss <cclauss@me.com>
102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import urllib.request
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# HTTP request headers; the User-Agent value identifies the client as a
# desktop browser (Chrome/Edge on Windows 10).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}
|
|
|
|
|
|
def download_images_from_google_query(query: str = "dhaka", max_images: int = 5) -> int:
    """
    Search Google Images for ``query`` and download the results into a folder.

    The images are written to ``query_<query>/original_size_img_<n>.jpg``
    relative to the current working directory, where spaces in ``query`` are
    replaced by underscores and ``<n>`` counts up from 0.

    Args:
        query: The image search term to be provided by the user. Defaults to
            "dhaka".
        max_images: The maximum number of images to download. Values above 50
            are capped at 50. Defaults to 5.

    Returns:
        The number of images successfully downloaded (0 when the result page
        could not be parsed or contained no full-resolution image URLs).

    # Comment out slow (4.20s call) doctests
    # >>> download_images_from_google_query()
    5
    # >>> download_images_from_google_query("potato")
    5
    """
    max_images = min(max_images, 50)  # Prevent abuse!
    params = {
        "q": query,
        "tbm": "isch",
        "hl": "en",
        "ijn": "0",
    }

    # A timeout keeps the call from blocking forever on an unresponsive server.
    html = requests.get(
        "https://www.google.com/search", params=params, headers=headers, timeout=10
    )
    soup = BeautifulSoup(html.text, "html.parser")
    # Concatenate the argument of every AF_initDataCallback(...) call found in
    # the page's <script> tags.
    matched_images_data = "".join(
        re.findall(r"AF_initDataCallback\(([^<]+)\);", str(soup.select("script")))
    )

    # NOTE: json.dumps followed by json.loads on a str yields an equal str, so
    # this pair is effectively a pass-through.
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    matched_google_image_data = re.findall(
        r"\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",",
        matched_images_data_json,
    )
    if not matched_google_image_data:
        return 0

    # Drop the thumbnail entries (hosted on encrypted-tbn0.gstatic.com) so that
    # only the remaining image URLs are picked up below.
    removed_matched_google_images_thumbnails = re.sub(
        r"\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]",
        "",
        str(matched_google_image_data),
    )

    matched_google_full_resolution_images = re.findall(
        r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
        removed_matched_google_images_thumbnails,
    )

    # Limit the work up front; a non-positive limit selects nothing.
    image_urls = matched_google_full_resolution_images[: max(max_images, 0)]
    if not image_urls:
        return 0

    # Loop-invariant setup: install the opener and create the target folder
    # once instead of on every iteration.
    opener = urllib.request.build_opener()
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)
    path_name = f"query_{query.replace(' ', '_')}"
    os.makedirs(path_name, exist_ok=True)

    for index, escaped_image_url in enumerate(image_urls):
        # The URL was extracted from str(list), whose repr doubles every
        # backslash, so one decode undoes that layer and the second decodes the
        # remaining escape sequences (presumably \\uXXXX escapes from the page).
        partially_decoded_url = bytes(escaped_image_url, "ascii").decode(
            "unicode-escape"
        )
        original_size_img = bytes(partially_decoded_url, "ascii").decode(
            "unicode-escape"
        )
        urllib.request.urlretrieve(
            original_size_img, f"{path_name}/original_size_img_{index}.jpg"
        )
    # urlretrieve raises on failure, so reaching this point means every
    # selected URL was saved; report the count rather than the last index.
    return len(image_urls)
|
|
|
|
|
|
if __name__ == "__main__":
    # Only the argv lookup is guarded, so an IndexError raised while
    # downloading is not misreported as a missing search term.
    try:
        search_term = sys.argv[1]
    except IndexError:
        print("Please provide a search term.")
        raise
    image_count = download_images_from_google_query(search_term)
    print(f"{image_count} images were downloaded to disk.")
|