From 43f99e56c969b46d3ea65593d0a20e9d9896ed0f Mon Sep 17 00:00:00 2001
From: kunal kumar barman
Date: Sat, 19 Oct 2019 03:00:52 +0530
Subject: [PATCH] Python program that surfs 3 sites at a time (#1389)

* Python program that surfs 3 sites at a time

Pass the search query on the command line at run time, e.g.:
python3 project1.py man

* Update project1.py

* noqa: F401 and reformat with black

* Rename project1.py to web_programming/crawl_google_results.py

* Add beautifulsoup4 to requirements.txt

* Add fake_useragent to requirements.txt

* Update crawl_google_results.py

* headers={"User-Agent": UserAgent().random}

* html.parser, not lxml

* link, not links
---
 requirements.txt                        |  2 ++
 web_programming/crawl_google_results.py | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 web_programming/crawl_google_results.py

diff --git a/requirements.txt b/requirements.txt
index 4f6ff321c..824f534a2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
+beautifulsoup4
 black
+fake_useragent
 flake8
 matplotlib
 mypy

diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py
new file mode 100644
index 000000000..c31ec1526
--- /dev/null
+++ b/web_programming/crawl_google_results.py
@@ -0,0 +1,20 @@
+import sys
+import webbrowser
+
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
+import requests
+
+print("Googling.....")
+url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:])
+res = requests.get(url, headers={"User-Agent": UserAgent().random})
+# res.raise_for_status()
+with open("project1a.html", "wb") as out_file:  # saved only to inspect the result-link class
+    for data in res.iter_content(10000):
+        out_file.write(data)
+soup = BeautifulSoup(res.text, "html.parser")
+links = soup.select(".eZt8xd")[:5]
+
+print(len(links))
+for link in links:
+    webbrowser.open(f"http://google.com{link.get('href')}")
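
For reference, the new script reads the search query from its command-line
arguments. A minimal usage sketch, assuming the beautifulsoup4 and
fake_useragent entries from requirements.txt are installed ("python
tutorials" here is only an example query, not anything the patch mandates):

    python3 web_programming/crawl_google_results.py python tutorials

The script saves the raw response to project1a.html for inspection, selects
up to five result links by Google's .eZt8xd class, and opens each one in the
default browser via webbrowser.open().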