Create crawl_hindustan_times_and_get_top_news.py

2025-05-10 06:13:57 +00:00 · 2024-10-02 00:35:28 +05:30 · 2024-10-02 00:35:28 +05:30 · 541a64a6d8
commit 541a64a6d8
parent ea533ae5b6
1 changed files with 7 additions and 6 deletions
--- a/web_programming/crawl_hindustan_times_and_get_top_news.py
+++ b/web_programming/crawl_hindustan_times_and_get_top_news.py
@@ -1,6 +1,6 @@
 """
-Fetch all the top headlines from Hindustan Times News website with title, link to the news article
-and cover image link.
+Fetch all the top headlines from Hindustan Times News website with
+title, link to the news article and cover image link.

 The following format is used while displaying the data

@@ -17,18 +17,19 @@ import requests
 from bs4 import BeautifulSoup


-def fetch_ht_news():
+def fetch_ht_news() -> dict:
+
    header = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Sec-GPC": "1",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36""",
        "sec-ch-ua": '"Not)A;Brand";v="99", "Brave";v="127", "Chromium";v="127"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
    }

    url = "https://www.hindustantimes.com/"
-    page_request = requests.get(url, headers=header)
+    page_request = requests.get(url, headers=header, timeout=10)
    data = page_request.content
    soup = BeautifulSoup(data, "html.parser")

@@ -44,7 +45,7 @@ def fetch_ht_news():
            imgtag = divtag.find("img")
            try:
                img = imgtag["data-src"]
-            except Exception:
+            except KeyError:
                img = imgtag["src"]
            news[counter] = {"title": title, "link": link, "img": img}
            counter += 1