diff --git a/scripts/Crawl Google Results/README.md b/scripts/Crawl Google Results/README.md
new file mode 100644
index 0000000..17fe464
--- /dev/null
+++ b/scripts/Crawl Google Results/README.md
@@ -0,0 +1,9 @@
+# Crawl Google results
+This is a simple script that lets you collect results provided by Google.
+
+## Usage
+
+* 3 packages required: requests, BeautifulSoup and fake_useragent
+* Use `pip install requests`, `pip install bs4` and `pip install fake_useragent`
+* Add path to your csv file and output excel file WITH EXTENSION `.csv` and `.xlsx`
+* Run `python main.py "query search"`
diff --git a/scripts/Crawl Google Results/main.py b/scripts/Crawl Google Results/main.py
new file mode 100644
index 0000000..8937182
--- /dev/null
+++ b/scripts/Crawl Google Results/main.py
@@ -0,0 +1,34 @@
+"""Search Google for a query and open the top results in the default browser.
+
+Usage: python main.py "search query"
+"""
+import sys
+import webbrowser
+from urllib.parse import quote_plus
+
+import requests
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
+
+if __name__ == "__main__":
+    print("Googling.....")
+    # URL-encode the query so spaces/special characters in argv survive.
+    url = "https://www.google.com/search?q=" + quote_plus(" ".join(sys.argv[1:]))
+    # Header key must be "User-Agent" (with a hyphen); the misspelled
+    # "UserAgent" key is ignored and the random UA is never sent.
+    res = requests.get(url, headers={"User-Agent": UserAgent().random})
+    res.raise_for_status()  # fail fast on HTTP errors (e.g. 429 rate limiting)
+    # Save the raw response for debugging / inspecting result CSS classes.
+    with open("project1a.html", "wb") as out_file:  # only for knowing the class
+        for data in res.iter_content(10000):
+            out_file.write(data)
+    soup = BeautifulSoup(res.text, "html.parser")
+    # ".eZt8xd" is the CSS class Google currently uses for result links
+    # (fragile: may break when Google changes its markup).
+    links = list(soup.select(".eZt8xd"))[:5]
+
+    print(len(links))
+    for link in links:
+        # "Maps" links carry an absolute URL; other results are site-relative.
+        if link.text == "Maps":
+            webbrowser.open(link.get("href"))
+        else:
+            webbrowser.open(f"http://google.com{link.get('href')}")