From a9428eacd6753f124e948ea44ec34b173aef52f7 Mon Sep 17 00:00:00 2001 From: Sayan Roy Date: Sun, 9 Oct 2022 21:59:11 +0530 Subject: [PATCH 1/2] Email Extractor Email Extractor #ISSUE 253 --- scripts/Email Extractor/README.md | 16 +++ scripts/Email Extractor/email_extractor.py | 110 +++++++++++++++++++++ scripts/Email Extractor/requirements.txt | 6 ++ 3 files changed, 132 insertions(+) create mode 100644 scripts/Email Extractor/README.md create mode 100644 scripts/Email Extractor/email_extractor.py create mode 100644 scripts/Email Extractor/requirements.txt diff --git a/scripts/Email Extractor/README.md b/scripts/Email Extractor/README.md new file mode 100644 index 0000000..68230b9 --- /dev/null +++ b/scripts/Email Extractor/README.md @@ -0,0 +1,16 @@ +# Email Extractor with Python + +This script takes a website as input and collects all the email addresses into a CSV file. + + +### Setup + - Install the requirements (refer below) + - Run the script with 'python email_extractor.py' + - Input the website to collect emails + + +### Requirements +```pip install -r requirements.txt``` + +### Usage +```python email_extractor.py``` \ No newline at end of file diff --git a/scripts/Email Extractor/email_extractor.py b/scripts/Email Extractor/email_extractor.py new file mode 100644 index 0000000..0c23cba --- /dev/null +++ b/scripts/Email Extractor/email_extractor.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- +"""Untitled0.ipynb + +Automatically generated by Colaboratory. 
+ +Original file is located at + https://colab.research.google.com/drive/1BuQhjlIL_OYu39gpE2NQNZx9KJ_kPy3o +""" + +import requests +from bs4 import BeautifulSoup +import urllib.request +from email_scraper import scrape_emails +import pandas as pd +from google.colab import files + + +urlid = input("Enter Website url (i.e.: example.com): ") +url = "https://"+urlid+"/" +reqs = requests.get(url) +soup = BeautifulSoup(reqs.text, 'html.parser') + +urls = [] +response = [] +email = [] +for link in soup.find_all('a'): + urls.append(link.get('href')) +for i in range(len(urls)): + if(urls[i].startswith("https://")): + fp = urllib.request.urlopen(url+urls[i]) + mybytes = fp.read() + mystr = mybytes.decode("utf8") + fp.close() + response.append(scrape_emails(mystr)) + else: + fp = urllib.request.urlopen(url+urls[i]) + mybytes = fp.read() + mystr = mybytes.decode("utf8") + fp.close() + response.append(scrape_emails(mystr)) + +for r in range(len(response)): + if not response[r]: + continue + else: + email.append(response[r]) + +df = pd.DataFrame(email, columns=["Email"]) +df.to_csv('email.csv', index=False) + +files.download("email.csv") + +urllib.request.urlopen('https://www.youracclaim.com/badges/42b5d2d4-7c14-4c1a-b78a-adb3ac04105b/public_url').read().decode("utf-8") + + + +import urllib.request + +fp = urllib.request.urlopen("http://royninja.github.io/contact.html") +mybytes = fp.read() + +mystr = mybytes.decode("utf8") +fp.close() + +print(mystr) + + + +import urllib.request + +fp = urllib.request.urlopen("http://royninja.github.io/contact.html") +mybytes = fp.read() + +mystr = mybytes.decode("utf8") +fp.close() + +print(mystr) + +webUrl = urllib.request.urlopen("https://royninja.github.io") + +pip install email-scraper + + + +scrape_emails(mystr) + + + + + +import requests +from bs4 import BeautifulSoup + + +url = 'https://royninja.github.io/' +reqs = requests.get(url) +soup = BeautifulSoup(reqs.text, 'html.parser') + +urls = [] +for link in soup.find_all('a'): + 
urls.append(link.get('href')) + +urls[1] + +url+urls[1] + +BufautifulSoup(requests.get(url+urls[1]).text,'html.parser') + +url2 + diff --git a/scripts/Email Extractor/requirements.txt b/scripts/Email Extractor/requirements.txt new file mode 100644 index 0000000..424180a --- /dev/null +++ b/scripts/Email Extractor/requirements.txt @@ -0,0 +1,6 @@ +pip install requests +pip install bs4 +pip install urllib +pip install email_scraper +pip install pandas +pip install google \ No newline at end of file From 303d1cae1688cb4fa5ce590f3c08a50426034125 Mon Sep 17 00:00:00 2001 From: Sayan Roy Date: Sun, 9 Oct 2022 22:01:42 +0530 Subject: [PATCH 2/2] Email Extractor irrelevant code remove --- scripts/Email Extractor/email_extractor.py | 65 ---------------------- 1 file changed, 65 deletions(-) diff --git a/scripts/Email Extractor/email_extractor.py b/scripts/Email Extractor/email_extractor.py index 0c23cba..550752b 100644 --- a/scripts/Email Extractor/email_extractor.py +++ b/scripts/Email Extractor/email_extractor.py @@ -1,11 +1,4 @@ -# -*- coding: utf-8 -*- -"""Untitled0.ipynb -Automatically generated by Colaboratory. 
- -Original file is located at - https://colab.research.google.com/drive/1BuQhjlIL_OYu39gpE2NQNZx9KJ_kPy3o -""" import requests from bs4 import BeautifulSoup @@ -50,61 +43,3 @@ df.to_csv('email.csv', index=False) files.download("email.csv") -urllib.request.urlopen('https://www.youracclaim.com/badges/42b5d2d4-7c14-4c1a-b78a-adb3ac04105b/public_url').read().decode("utf-8") - - - -import urllib.request - -fp = urllib.request.urlopen("http://royninja.github.io/contact.html") -mybytes = fp.read() - -mystr = mybytes.decode("utf8") -fp.close() - -print(mystr) - - - -import urllib.request - -fp = urllib.request.urlopen("http://royninja.github.io/contact.html") -mybytes = fp.read() - -mystr = mybytes.decode("utf8") -fp.close() - -print(mystr) - -webUrl = urllib.request.urlopen("https://royninja.github.io") - -pip install email-scraper - - - -scrape_emails(mystr) - - - - - -import requests -from bs4 import BeautifulSoup - - -url = 'https://royninja.github.io/' -reqs = requests.get(url) -soup = BeautifulSoup(reqs.text, 'html.parser') - -urls = [] -for link in soup.find_all('a'): - urls.append(link.get('href')) - -urls[1] - -url+urls[1] - -BufautifulSoup(requests.get(url+urls[1]).text,'html.parser') - -url2 -