mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-27 15:01:08 +00:00
Python program that surfs 3 sites at a time (#1389)
* Python program that surfs 3 sites at a time; pass the search terms on the command line, e.g. python3 project1.py man * Update project1.py * noqa: F401 and reformat with black * Rename project1.py to web_programming/crawl_google_results.py * Add beautifulsoup4 to requirements.txt * Add fake_useragent to requirements.txt * Update crawl_google_results.py * headers={"UserAgent": UserAgent().random} * html.parser, not lxml * link, not links
This commit is contained in:
parent
5ef5f67a51
commit
43f99e56c9
|
@ -1,4 +1,6 @@
|
|||
beautifulsoup4
|
||||
black
|
||||
fake_useragent
|
||||
flake8
|
||||
matplotlib
|
||||
mypy
|
||||
|
|
20
web_programming/crawl_google_results.py
Normal file
20
web_programming/crawl_google_results.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
import sys
|
||||
import webbrowser
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from fake_useragent import UserAgent
|
||||
import requests
|
||||
|
||||
# Search Google for the words given on the command line and open the first
# few result links in the default web browser.
#
# Usage: python3 crawl_google_results.py <search terms>

SEARCH_ENDPOINT = "https://www.google.com/search"
RESULT_SELECTOR = ".eZt8xd"  # CSS class of the elements whose href is opened
MAX_RESULTS = 5
REQUEST_TIMEOUT_SECONDS = 10


def result_url(href):
    """Return the browser URL for a result *href*, or None when it is missing.

    The href is appended to the Google host exactly as the page provides it.
    An absent or empty href yields None so the caller can skip the element
    instead of opening a bogus ``http://google.comNone`` address.
    """
    if not href:
        return None
    return f"http://google.com{href}"


def main():
    """Fetch the Google result page for ``sys.argv[1:]`` and open the top links.

    Side effects: prints progress to stdout, writes the raw response body to
    ``project1a.html`` in the current directory, and opens up to
    ``MAX_RESULTS`` browser tabs.
    """
    query = " ".join(sys.argv[1:])
    if not query:
        sys.exit("usage: python3 crawl_google_results.py <search terms>")

    print("Googling.....")
    # Pass the query through ``params`` so requests URL-encodes it, send the
    # user agent under the real ``User-Agent`` header name, and bound the wait
    # with a timeout so the script cannot hang forever.
    res = requests.get(
        SEARCH_ENDPOINT,
        params={"q": query},
        headers={"User-Agent": UserAgent().random},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # NOTE(review): the original left ``res.raise_for_status()`` disabled, so
    # HTTP error statuses are still not raised here.

    # Dump the raw page so the result CSS class can be inspected by hand.
    with open("project1a.html", "wb") as out_file:  # only for knowing the class
        out_file.write(res.content)

    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select(RESULT_SELECTOR)[:MAX_RESULTS]

    print(len(links))
    for link in links:
        target = result_url(link.get("href"))
        if target is not None:
            webbrowser.open(target)


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user