# Provenance (recovered from the git patch this file was extracted from):
#
#   From 152261765a93c2a12eb4af0abdd297652766d47d Mon Sep 17 00:00:00 2001
#   From: Appledora
#   Date: Sat, 16 Oct 2021 06:02:44 +0600
#   Subject: [PATCH] Show images from google query (#4853)
#
#   * Added new script to open the google image tab with a search query.
#   * Added new script to open the google image tab with a search query.
#   * Added new script to open the google image tab with a search query with
#     doctests.
#   * Fixed doctest error, removed print() from method, changed return type
#   * Update web_programming/show_image_tab_from_google_query.py
#     using iterators instead of lists
#     Co-authored-by: Christian Clauss
#   * Update web_programming/show_image_tab_from_google_query.py
#     Improve readability by removing one-time used variable
#     Co-authored-by: Christian Clauss
#   * Update web_programming/show_image_tab_from_google_query.py
#     Decreasing complication through standard practices.
#     Co-authored-by: Christian Clauss
#   * Update web_programming/show_image_tab_from_google_query.py
#     Exception Handling
#     Co-authored-by: Christian Clauss
#   * changed complete method to download images from google search query
#   * Update download_images_from_google_query.py
#   * Delete show_image_tab_from_google_query.py
#   Co-authored-by: Christian Clauss
#
#   New file: web_programming/download_images_from_google_query.py
#   (index 000000000..c26262788, 99 insertions)

import json
import os
import re
import sys
import urllib.request

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}


def _decode_image_url(escaped_url: str) -> str:
    """Undo two layers of backslash escaping on a URL scraped from the page."""
    # The URL is matched inside ``str(<list>)``; the list repr escapes every
    # backslash once more, so presumably one pass undoes the repr and the
    # second undoes the page's own ``\uXXXX`` escapes.
    once = bytes(escaped_url, "ascii").decode("unicode-escape")
    return bytes(once, "ascii").decode("unicode-escape")


def download_images_from_google_query(
    query: str = "dhaka", max_images: int = 5
) -> int:
    """Search Google Images for ``query`` and save the results to disk.

    Images are written to a directory named ``query_<query>`` (spaces in the
    query replaced by underscores), relative to the current working directory,
    as ``original_size_img_<n>.jpg``.

    Args:
        query: The image search term. Defaults to "dhaka".
        max_images: Upper bound on the number of images to download. Values
            above 50 are capped at 50; values of 0 or less download nothing.
            Defaults to 5.

    Returns:
        The number of images downloaded. 0 if no image data could be
        extracted from the results page.

    Raises:
        requests.RequestException: If the search request fails or times out.
        urllib.error.URLError: If downloading one of the images fails.

    >>> download_images_from_google_query()
    5
    >>> download_images_from_google_query("potato")
    5
    """
    max_images = min(max_images, 50)  # Prevent abuse!
    params = {
        "q": query,
        "tbm": "isch",
        "hl": "en",
        "ijn": "0",
    }

    # A timeout keeps the call from blocking forever on a stalled connection.
    response = requests.get(
        "https://www.google.com/search", params=params, headers=headers, timeout=10
    )
    soup = BeautifulSoup(response.text, "html.parser")
    matched_images_data = "".join(
        re.findall(r"AF_initDataCallback\(([^<]+)\);", str(soup.select("script")))
    )

    # NOTE(review): dumping a str to JSON and loading it back yields the same
    # str, so this round trip leaves the text unchanged. Kept as in the
    # original; confirm before removing.
    matched_images_data_json = json.loads(json.dumps(matched_images_data))

    matched_google_image_data = re.findall(
        r"\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",",
        matched_images_data_json,
    )
    if not matched_google_image_data:
        return 0

    # Drop the thumbnail entries so only full-resolution URLs remain.
    removed_matched_google_images_thumbnails = re.sub(
        r"\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]",
        "",
        str(matched_google_image_data),
    )

    matched_google_full_resolution_images = re.findall(
        r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
        removed_matched_google_images_thumbnails,
    )

    # Clamp at 0 so a negative limit cannot turn into a "from the end" slice.
    image_urls = matched_google_full_resolution_images[: max(max_images, 0)]
    if not image_urls:
        return 0

    # One-time setup: urlretrieve() uses the globally installed opener, so the
    # browser-like User-Agent only has to be installed once, not per image.
    opener = urllib.request.build_opener()
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)

    path_name = f"query_{query.replace(' ', '_')}"
    os.makedirs(path_name, exist_ok=True)

    for index, escaped_url in enumerate(image_urls):
        urllib.request.urlretrieve(
            _decode_image_url(escaped_url),
            f"{path_name}/original_size_img_{index}.jpg",
        )
    return len(image_urls)


if __name__ == "__main__":
    # Only the argv lookup is guarded, so an IndexError raised while
    # downloading is not misreported as a missing search term.
    try:
        search_term = sys.argv[1]
    except IndexError:
        print("Please provide a search term.")
        raise
    image_count = download_images_from_google_query(search_term)
    print(f"{image_count} images were downloaded to disk.")