From 43f99e56c969b46d3ea65593d0a20e9d9896ed0f Mon Sep 17 00:00:00 2001
From: kunal kumar barman
Date: Sat, 19 Oct 2019 03:00:52 +0530
Subject: [PATCH] Python program that surfs 3 sites at a time (#1389)

* Python program that surfs 3 sites at a time

Pass the search query on the command line at run time, e.g.:
python3 project1.py man

* Update project1.py

* noqa: F401 and reformat with black

* Rename project1.py to web_programming/crawl_google_results.py

* Add beautifulsoup4 to requirements.txt

* Add fake_useragent to requirements.txt

* Update crawl_google_results.py

* headers={"User-Agent": UserAgent().random}

* html.parser, not lxml

* link, not links
---
 requirements.txt                        |  2 ++
 web_programming/crawl_google_results.py | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 web_programming/crawl_google_results.py

diff --git a/requirements.txt b/requirements.txt
index 4f6ff321c..824f534a2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
+beautifulsoup4
 black
+fake_useragent
 flake8
 matplotlib
 mypy

diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py
new file mode 100644
index 000000000..c31ec1526
--- /dev/null
+++ b/web_programming/crawl_google_results.py
@@ -0,0 +1,20 @@
+import sys
+import webbrowser
+
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
+import requests
+
+print("Googling.....")
+url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:])
+res = requests.get(url, headers={"User-Agent": UserAgent().random})
+# res.raise_for_status()
+with open("project1a.html", "wb") as out_file:  # saved only to inspect the result-link class
+    for data in res.iter_content(10000):
+        out_file.write(data)
+soup = BeautifulSoup(res.text, "html.parser")
+links = soup.select(".eZt8xd")[:5]
+
+print(len(links))
+for link in links:
+    webbrowser.open(f"http://google.com{link.get('href')}")
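
For reference, the new script reads the search query from its command-line
arguments. A minimal usage sketch, assuming the beautifulsoup4 and
fake_useragent entries from requirements.txt are installed ("python
tutorials" here is only an example query, not anything the patch mandates):

    python3 web_programming/crawl_google_results.py python tutorials

The script saves the raw response to project1a.html for inspection, selects
up to five result links by Google's .eZt8xd class, and opens each one in the
default browser via webbrowser.open().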