Python/web_programming/get_amazon_product_data.py

"""
This file provides a function which will take a product name as input from the user,
and fetch from Amazon information about products of this name or category.  The product
information will include title, URL, price, ratings, and the discount available.
"""


from itertools import zip_longest

import requests
from bs4 import BeautifulSoup
from pandas import DataFrame


def _rupees_to_float(amount: str) -> float:
    """Convert a rupee amount such as "₹1,299.00" to a float.

    Raises ValueError when the text is empty or not a number.
    """
    return float(amount.strip().strip("₹").replace(",", ""))


def get_amazon_product_data(product: str = "laptop") -> DataFrame:
    """
    Take a product name or category as input and return product information from Amazon
    including title, URL, price, ratings, and the discount available.

    :param product: search term sent to amazon.in as the ``k`` query parameter.
    :return: a DataFrame with one row per parsed search result and the columns
        "Product Title", "Product Link", "Current Price of the product",
        "Product Rating", "MRP of the product" and "Discount".  The index starts
        at 1.  "Discount" is a percentage of the MRP; when no MRP is listed, or
        the current price is higher than the MRP, both "MRP of the product" and
        "Discount" hold a single space.  The frame is empty when the page
        contains no recognisable search results.
    :raises requests.RequestException: on connection errors or when the server
        does not answer within the timeout.
    """
    headers = {
        # Adjacent literals keep the value on ONE line: an HTTP header value must
        # not contain line breaks, which a triple-quoted literal would embed.
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36"
        ),
        "Accept-Language": "en-US, en;q=0.5",
    }
    # ``params`` lets requests URL-encode the search term (spaces, "&", ...), and
    # the timeout prevents the call from blocking forever on a stalled server.
    # NOTE(review): the "laptop" path segment is kept from the original URL;
    # presumably the site ignores it for other search terms -- confirm.
    response = requests.get(
        "https://www.amazon.in/laptop/s",
        params={"k": product},
        headers=headers,
        timeout=10,
    )
    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    columns = [
        "Product Title",
        "Product Link",
        "Current Price of the product",
        "Product Rating",
        "MRP of the product",
        "Discount",
    ]
    rows = []
    search_results = soup.find_all(
        "div",
        attrs={"class": "s-result-item", "data-component-type": "s-search-result"},
    )
    for item in search_results:
        try:
            product_title = item.h2.text
            product_link = "https://www.amazon.in/" + item.h2.a["href"]
            product_price = item.find("span", attrs={"class": "a-offscreen"}).text
        except (AttributeError, KeyError, TypeError):
            # Without a title, link or price the entry is unusable; skip it
            # rather than recording stale values left over from the previous item.
            continue

        rating_tag = item.find("span", attrs={"class": "a-icon-alt"})
        product_rating = rating_tag.text if rating_tag is not None else "Not available"

        # Keep only the first amount that follows a rupee sign in the MRP span.
        mrp_tag = item.find("span", attrs={"class": "a-price a-text-price"})
        mrp_parts = mrp_tag.text.split("₹") if mrp_tag is not None else []
        product_mrp = ("₹" + mrp_parts[1]) if len(mrp_parts) > 1 else ""

        try:
            price_value = _rupees_to_float(product_price)
        except ValueError:
            price_value = None
        try:
            mrp_value = _rupees_to_float(product_mrp)
        except ValueError:
            mrp_value = None

        # Compare the amounts numerically: comparing the formatted strings would
        # rank "₹999" above "₹1,299".
        if mrp_value is None or (price_value is not None and price_value > mrp_value):
            # No usable MRP, or the item sells above it: there is no discount.
            product_mrp = " "
            discount = " "
        elif price_value is None or mrp_value == 0:
            discount = float("nan")
        else:
            discount = (mrp_value - price_value) / mrp_value * 100

        rows.append(
            [
                product_title,
                product_link,
                product_price,
                product_rating,
                product_mrp,
                discount,
            ]
        )

    # Build the frame once instead of growing it one row at a time.
    data_frame = DataFrame(rows, columns=columns)
    data_frame.index += 1
    return data_frame


if __name__ == "__main__":
    # Demo run: look up one sample search term and write the resulting table to
    # a CSV file (relative path, so it lands in the current working directory).
    product = "headphones"
    csv_file_name = f"Amazon Product Data for {product}.csv"
    product_table = get_amazon_product_data(product)
    product_table.to_csv(csv_file_name)
Create fetch_amazon_product_data.py (#7585) * Create fetch_amazon_product_data.py This file provides a function which will take a product name as input from the user,and fetch the necessary information about that kind of products from Amazon like the product title,link to that product,price of the product,the ratings of the product and the discount available on the product in the form of a csv file,this will help the users by improving searchability and navigability and find the right product easily and in a short period of time, it will also be beneficial for performing better analysis on products * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update fetch_amazon_product_data.py Added type hints and modified files to pass precommit test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update fetch_amazon_product_data.py Added type hints and made changes to pass the precommit * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update fetch_amazon_product_data.py Modified function to return the data in the form of Pandas Dataframe,modified type hints and added a functionality to let the user determine if they need the data in a csv file * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update fetch_amazon_product_data.py Made some bug fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update and rename fetch_amazon_product_data.py to get_amazon_product_data.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update get_amazon_product_data.py Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2022-10-28 14:33:21 +00:00			`"""`
			`This file provides a function which will take a product name as input from the user,`
			`and fetch from Amazon information about products of this name or category. The product`
			`information will include title, URL, price, ratings, and the discount available.`
			`"""`


			`from itertools import zip_longest`

			`import requests`
			`from bs4 import BeautifulSoup`
			`from pandas import DataFrame`


			`def get_amazon_product_data(product: str = "laptop") -> DataFrame:`
			`"""`
			`Take a product name or category as input and return product information from Amazon`
			`including title, URL, price, ratings, and the discount available.`
			`"""`
			`url = f"https://www.amazon.in/laptop/s?k={product}"`
			`header = {`
			`"User-Agent": """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36`
			`(KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36""",`
			`"Accept-Language": "en-US, en;q=0.5",`
			`}`
			`soup = BeautifulSoup(requests.get(url, headers=header).text)`
			`# Initialize a Pandas dataframe with the column titles`
			`data_frame = DataFrame(`
			`columns=[`
			`"Product Title",`
			`"Product Link",`
			`"Current Price of the product",`
			`"Product Rating",`
			`"MRP of the product",`
			`"Discount",`
			`]`
			`)`
			`# Loop through each entry and store them in the dataframe`
			`for item, _ in zip_longest(`
			`soup.find_all(`
			`"div",`
			`attrs={"class": "s-result-item", "data-component-type": "s-search-result"},`
			`),`
			`soup.find_all("div", attrs={"class": "a-row a-size-base a-color-base"}),`
			`):`
			`try:`
			`product_title = item.h2.text`
			`product_link = "https://www.amazon.in/" + item.h2.a["href"]`
			`product_price = item.find("span", attrs={"class": "a-offscreen"}).text`
			`try:`
			`product_rating = item.find("span", attrs={"class": "a-icon-alt"}).text`
			`except AttributeError:`
			`product_rating = "Not available"`
			`try:`
			`product_mrp = (`
			`"₹"`
			`+ item.find(`
			`"span", attrs={"class": "a-price a-text-price"}`
			`).text.split("₹")[1]`
			`)`
			`except AttributeError:`
			`product_mrp = ""`
			`try:`
			`discount = float(`
			`(`
			`(`
			`float(product_mrp.strip("₹").replace(",", ""))`
			`- float(product_price.strip("₹").replace(",", ""))`
			`)`
			`/ float(product_mrp.strip("₹").replace(",", ""))`
			`)`
			`* 100`
			`)`
			`except ValueError:`
			`discount = float("nan")`
			`except AttributeError:`
			`pass`
			`data_frame.loc[len(data_frame.index)] = [`
			`product_title,`
			`product_link,`
			`product_price,`
			`product_rating,`
			`product_mrp,`
			`discount,`
			`]`
			`data_frame.loc[`
			`data_frame["Current Price of the product"] > data_frame["MRP of the product"],`
			`"MRP of the product",`
			`] = " "`
			`data_frame.loc[`
			`data_frame["Current Price of the product"] > data_frame["MRP of the product"],`
			`"Discount",`
			`] = " "`
			`data_frame.index += 1`
			`return data_frame`


			`if __name__ == "__main__":`
			`product = "headphones"`
			`get_amazon_product_data(product).to_csv(f"Amazon Product Data for {product}.csv")`