mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-27 23:11:09 +00:00
72f6000365
* updating DIRECTORY.md * fix(get-amazon-product-data): Remove whitespace in headers * refactor(get-amazon-product-data): Don't print to_csv --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
103 lines
3.5 KiB
Python
103 lines
3.5 KiB
Python
"""
This file provides a function which will take a product name as input from the user,
and fetch from Amazon information about products of this name or category. The product
information will include title, URL, price, ratings, and the discount available.
"""
|
|
|
|
|
|
from itertools import zip_longest
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from pandas import DataFrame
|
|
|
|
|
|
def get_amazon_product_data(product: str = "laptop") -> DataFrame:
    """
    Take a product name or category as input and return product information from Amazon
    including title, URL, price, ratings, and the discount available.

    The search results page of amazon.in is downloaded and every search-result
    entry that has a title, a link and a price becomes one row of the result.

    :param product: Product name or category to search for. It is sent as the
        ``k`` query parameter and URL-encoded by ``requests``.
    :return: A DataFrame with a 1-based integer index and the columns
        "Product Title", "Product Link", "Current Price of the product",
        "Product Rating", "MRP of the product" and "Discount".
        "Product Rating" is "Not available" when no rating is found.
        "MRP of the product" and "Discount" are both " " when no MRP is
        listed or when the current price is numerically higher than the MRP.
        "Discount" is a percentage (float), or NaN when it cannot be computed.
    :raises requests.RequestException: If the page cannot be fetched, including
        when the request times out.
    """
    url = "https://www.amazon.in/laptop/s"
    header = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
            "(KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36"
        ),
        "Accept-Language": "en-US, en;q=0.5",
    }
    # Pass the search term via ``params`` so it is URL-encoded, and set a
    # timeout so a stalled connection cannot block the caller forever.
    response = requests.get(url, params={"k": product}, headers=header, timeout=10)
    soup = BeautifulSoup(response.text, features="lxml")

    columns = [
        "Product Title",
        "Product Link",
        "Current Price of the product",
        "Product Rating",
        "MRP of the product",
        "Discount",
    ]
    # Collect the rows first and build the frame once; growing a DataFrame
    # one ``.loc`` assignment at a time copies the data on every insert.
    rows = []
    for item in soup.find_all(
        "div",
        attrs={"class": "s-result-item", "data-component-type": "s-search-result"},
    ):
        # Title, link and price are mandatory: skip entries lacking any of them.
        try:
            product_title = item.h2.text
            product_link = "https://www.amazon.in/" + item.h2.a["href"].lstrip("/")
            product_price = item.find("span", attrs={"class": "a-offscreen"}).text
        except (AttributeError, TypeError, KeyError):
            continue

        rating_tag = item.find("span", attrs={"class": "a-icon-alt"})
        product_rating = rating_tag.text if rating_tag is not None else "Not available"

        # The MRP is whatever follows the first rupee sign in the struck-through
        # price element; it stays empty when the element or the sign is missing.
        mrp_tag = item.find("span", attrs={"class": "a-price a-text-price"})
        mrp_parts = mrp_tag.text.split("₹") if mrp_tag is not None else []
        product_mrp = "₹" + mrp_parts[1] if len(mrp_parts) > 1 else ""

        price_value = _rupees_to_float(product_price)
        mrp_value = _rupees_to_float(product_mrp)
        # Compare the parsed numbers, not the raw strings: a lexicographic
        # comparison would claim that "₹999" is greater than "₹1,299".
        if product_mrp == "" or price_value > mrp_value:
            product_mrp = " "
            discount = " "
        else:
            try:
                discount = (mrp_value - price_value) / mrp_value * 100
            except ZeroDivisionError:
                discount = float("nan")

        rows.append(
            [
                product_title,
                product_link,
                product_price,
                product_rating,
                product_mrp,
                discount,
            ]
        )

    data_frame = DataFrame(rows, columns=columns)
    # The default index is 0-based integers; shift it so rows count from 1.
    data_frame.index += 1
    return data_frame


def _rupees_to_float(amount: str) -> float:
    """Convert a price string such as "₹1,299" to a float, or NaN if unparsable."""
    try:
        return float(amount.strip("₹").replace(",", ""))
    except ValueError:
        return float("nan")
|
|
|
|
|
|
if __name__ == "__main__":
    # Scrape one sample category and save the resulting table next to the script.
    product = "headphones"
    csv_name = f"Amazon Product Data for {product}.csv"
    get_amazon_product_data(product).to_csv(csv_name)
|