Python/web_programming/download_images_from_google_query.py

import json
import os
import re
import sys
import urllib.request

import requests
from bs4 import BeautifulSoup

# HTTP headers sent with the Google search request; the User-Agent value is a
# desktop-browser identification string assembled from its individual tokens.
headers = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/70.0.3538.102",
            "Safari/537.36",
            "Edge/18.19582",
        )
    )
}


def download_images_from_google_query(query: str = "dhaka", max_images: int = 5) -> int:
    """
    Search Google Images for ``query`` and download the results into a folder.

    Images are written to ``query_<query>/original_size_img_<n>.jpg`` relative to
    the current working directory, with spaces in ``query`` replaced by
    underscores and ``<n>`` counting up from 0.

    Args:
        query: The image search term. Defaults to "dhaka".
        max_images: Upper bound on the number of images to download. Values above
            50 are capped at 50; zero or negative values download nothing.
            Defaults to 5.

    Returns:
        The number of images downloaded. 0 is returned when the expected image
        data cannot be located in the response or no image URL is found.

    Raises:
        Nothing is caught here: errors raised by ``requests`` or ``urllib``
        (including the request timeout) propagate to the caller.

    # Comment out slow (4.20s call) doctests
    # >>> download_images_from_google_query()
    5
    # >>> download_images_from_google_query("potato")
    5
    """
    max_images = min(max_images, 50)  # Prevent abuse!
    params = {
        "q": query,
        "tbm": "isch",
        "hl": "en",
        "ijn": "0",
    }

    # A timeout keeps the call from hanging forever on an unresponsive server.
    html = requests.get(
        "https://www.google.com/search", params=params, headers=headers, timeout=10
    )
    soup = BeautifulSoup(html.text, "html.parser")
    # Concatenate the argument of every AF_initDataCallback(...) call found in the
    # page's <script> tags; the image grid data is searched for in there.
    matched_images_data = "".join(
        re.findall(r"AF_initDataCallback\(([^<]+)\);", str(soup.select("script")))
    )

    # NOTE(review): dumping a str to JSON and loading it back yields the same text
    # for ordinary input, so this looks like a no-op; kept to preserve behaviour.
    matched_images_data_json = json.loads(json.dumps(matched_images_data))

    matched_google_image_data = re.findall(
        r"\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",",
        matched_images_data_json,
    )
    if not matched_google_image_data:
        return 0

    # Drop the [url, number, number] entries hosted on encrypted-tbn0.gstatic.com
    # (the thumbnails) so that only the remaining image entries are matched below.
    removed_matched_google_images_thumbnails = re.sub(
        r"\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]",
        "",
        str(matched_google_image_data),
    )

    matched_google_full_resolution_images = re.findall(
        r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
        removed_matched_google_images_thumbnails,
    )

    # Keep at most max_images URLs; max(..., 0) makes a negative limit mean "none"
    # instead of slicing from the end of the list.
    images_to_download = matched_google_full_resolution_images[: max(max_images, 0)]
    if not images_to_download:
        return 0

    # Loop-invariant setup: install the browser-like opener and create the target
    # directory once rather than once per image.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", headers["User-Agent"])]
    urllib.request.install_opener(opener)
    path_name = f"query_{query.replace(' ', '_')}"
    os.makedirs(path_name, exist_ok=True)

    for index, fixed_full_res_image in enumerate(images_to_download):
        # The captured URL text still carries backslash escapes (apparently two
        # levels deep), hence the unicode-escape decoding is applied twice.
        original_size_img_not_fixed = bytes(fixed_full_res_image, "ascii").decode(
            "unicode-escape"
        )
        original_size_img = bytes(original_size_img_not_fixed, "ascii").decode(
            "unicode-escape"
        )
        urllib.request.urlretrieve(
            original_size_img, f"{path_name}/original_size_img_{index}.jpg"
        )

    # Return the count, not the last loop index (which was one too small and was
    # unbound when no image URL had been found).
    return len(images_to_download)


if __name__ == "__main__":
    # Only the argv lookup is guarded, so an IndexError raised while downloading
    # is not misreported as a missing search term.
    try:
        search_term = sys.argv[1]
    except IndexError:
        print("Please provide a search term.")
        raise
    image_count = download_images_from_google_query(search_term)
    print(f"{image_count} images were downloaded to disk.")
Show images from google query (#4853) * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query with doctests. * Fixed doctest error, removed print() from method, changed return type * Update web_programming/show_image_tab_from_google_query.py using iterators instead of lists Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Improve readability by removing one-time used variable Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Decreasing complication through standard practices. Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Exception Handling Co-authored-by: Christian Clauss <cclauss@me.com> * changed complete method to download images from google search query * Update download_images_from_google_query.py * Delete show_image_tab_from_google_query.py Co-authored-by: Christian Clauss <cclauss@me.com> 2021-10-16 00:02:44 +00:00			`import json`
			`import os`
			`import re`
			`import sys`
			`import urllib.request`

			`import requests`
			`from bs4 import BeautifulSoup`

			`headers = {`
			`"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"`
			`" (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"`
			`}`


			`def download_images_from_google_query(query: str = "dhaka", max_images: int = 5) -> int:`
[pre-commit.ci] pre-commit autoupdate (#6629) * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/psf/black: 22.6.0 → 22.8.0](https://github.com/psf/black/compare/22.6.0...22.8.0) - [github.com/asottile/pyupgrade: v2.37.0 → v2.38.2](https://github.com/asottile/pyupgrade/compare/v2.37.0...v2.38.2) - https://gitlab.com/pycqa/flake8 → https://github.com/PyCQA/flake8 - [github.com/PyCQA/flake8: 3.9.2 → 5.0.4](https://github.com/PyCQA/flake8/compare/3.9.2...5.0.4) - [github.com/pre-commit/mirrors-mypy: v0.961 → v0.981](https://github.com/pre-commit/mirrors-mypy/compare/v0.961...v0.981) - [github.com/codespell-project/codespell: v2.1.0 → v2.2.1](https://github.com/codespell-project/codespell/compare/v2.1.0...v2.2.1) * Fix a long line * Update sol1.py * Update sol1.py * lambda_ * Update multi_level_feedback_queue.py * Update double_ended_queue.py * Update sequential_minimum_optimization.py * Update .pre-commit-config.yaml Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2022-10-03 20:00:45 +00:00			`"""`
			`Searches google using the provided query term and downloads the images in a folder.`
Show images from google query (#4853) * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query with doctests. * Fixed doctest error, removed print() from method, changed return type * Update web_programming/show_image_tab_from_google_query.py using iterators instead of lists Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Improve readability by removing one-time used variable Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Decreasing complication through standard practices. Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Exception Handling Co-authored-by: Christian Clauss <cclauss@me.com> * changed complete method to download images from google search query * Update download_images_from_google_query.py * Delete show_image_tab_from_google_query.py Co-authored-by: Christian Clauss <cclauss@me.com> 2021-10-16 00:02:44 +00:00
			`Args:`
			`query : The image search term to be provided by the user. Defaults to`
			`"dhaka".`
			`max_images : Maximum number of images to download (capped at 50). Defaults to 5.`

			`Returns:`
			`The number of images successfully downloaded.`

mandelbrot.py: Commenting out long running tests (#5558) * mandelbrot.py: Commenting out long running tests * updating DIRECTORY.md * Comment out 9 sec doctests * Update bidirectional_breadth_first_search.py * Comment out slow tests * Comment out slow (9.15 sec) pytests... * # Comment out slow (4.20s call) doctests * Comment out slow (3.45s) doctests * Update miller_rabin.py * Update miller_rabin.py Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2021-10-23 16:15:30 +00:00			`# Comment out slow (4.20s call) doctests`
			`# >>> download_images_from_google_query()`
Show images from google query (#4853) * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query with doctests. * Fixed doctest error, removed print() from method, changed return type * Update web_programming/show_image_tab_from_google_query.py using iterators instead of lists Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Improve readability by removing one-time used variable Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Decreasing complication through standard practices. Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Exception Handling Co-authored-by: Christian Clauss <cclauss@me.com> * changed complete method to download images from google search query * Update download_images_from_google_query.py * Delete show_image_tab_from_google_query.py Co-authored-by: Christian Clauss <cclauss@me.com> 2021-10-16 00:02:44 +00:00			`5`
mandelbrot.py: Commenting out long running tests (#5558) * mandelbrot.py: Commenting out long running tests * updating DIRECTORY.md * Comment out 9 sec doctests * Update bidirectional_breadth_first_search.py * Comment out slow tests * Comment out slow (9.15 sec) pytests... * # Comment out slow (4.20s call) doctests * Comment out slow (3.45s) doctests * Update miller_rabin.py * Update miller_rabin.py Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2021-10-23 16:15:30 +00:00			`# >>> download_images_from_google_query("potato")`
Show images from google query (#4853) * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query. * Added new script to open the google image tab with a search query with doctests. * Fixed doctest error, removed print() from method, changed return type * Update web_programming/show_image_tab_from_google_query.py using iterators instead of lists Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Improve readability by removing one-time used variable Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Decreasing complication through standard practices. Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/show_image_tab_from_google_query.py Exception Handling Co-authored-by: Christian Clauss <cclauss@me.com> * changed complete method to download images from google search query * Update download_images_from_google_query.py * Delete show_image_tab_from_google_query.py Co-authored-by: Christian Clauss <cclauss@me.com> 2021-10-16 00:02:44 +00:00			`5`
			`"""`
			`max_images = min(max_images, 50) # Prevent abuse!`
			`params = {`
			`"q": query,`
			`"tbm": "isch",`
			`"hl": "en",`
			`"ijn": "0",`
			`}`

			`html = requests.get("https://www.google.com/search", params=params, headers=headers)`
			`soup = BeautifulSoup(html.text, "html.parser")`
			`matched_images_data = "".join(`
			`re.findall(r"AF_initDataCallback\(([^<]+)\);", str(soup.select("script")))`
			`)`

			`matched_images_data_fix = json.dumps(matched_images_data)`
			`matched_images_data_json = json.loads(matched_images_data_fix)`

			`matched_google_image_data = re.findall(`
			`r"\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",",`
			`matched_images_data_json,`
			`)`
			`if not matched_google_image_data:`
			`return 0`

			`removed_matched_google_images_thumbnails = re.sub(`
			`r"\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]",`
			`"",`
			`str(matched_google_image_data),`
			`)`

			`matched_google_full_resolution_images = re.findall(`
			`r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",`
			`removed_matched_google_images_thumbnails,`
			`)`
			`for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):`
			`if index >= max_images:`
			`return index`
			`original_size_img_not_fixed = bytes(fixed_full_res_image, "ascii").decode(`
			`"unicode-escape"`
			`)`
			`original_size_img = bytes(original_size_img_not_fixed, "ascii").decode(`
			`"unicode-escape"`
			`)`
			`opener = urllib.request.build_opener()`
			`opener.addheaders = [`
			`(`
			`"User-Agent",`
			`"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"`
			`" (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",`
			`)`
			`]`
			`urllib.request.install_opener(opener)`
			`path_name = f"query_{query.replace(' ', '_')}"`
			`if not os.path.exists(path_name):`
			`os.makedirs(path_name)`
			`urllib.request.urlretrieve(`
			`original_size_img, f"{path_name}/original_size_img_{index}.jpg"`
			`)`
			`return index`


			`if __name__ == "__main__":`
			`try:`
			`image_count = download_images_from_google_query(sys.argv[1])`
			`print(f"{image_count} images were downloaded to disk.")`
			`except IndexError:`
			`print("Please provide a search term.")`
			`raise`