Mirror of https://github.com/TheAlgorithms/Python.git (synced 2024-11-27 15:01:08 +00:00)
Create emails_from_url.py (#1756)
* Create emails_from_url.py
* Update emails_from_url.py
* Update emails_from_url.py
* 0 emails found:
* Update emails_from_url.py
* Use Python set() to remove duplicates
* Update emails_from_url.py
* Add type hints and doctests

Co-authored-by: vinayak <itssvinayak@gmail.com>
Co-authored-by: Christian Clauss <cclauss@me.com>
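One of the bullets above, "Use Python set() to remove duplicates", refers to the standard dedupe-then-sort pattern the new module ends with. A minimal sketch of that pattern (the sample addresses are made up):

found = ["a@example.com", "b@example.com", "a@example.com"]
print(sorted(set(found)))  # set() drops the duplicate -> ['a@example.com', 'b@example.com']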
This commit is contained in:
parent c1a4cc96c8
commit 2b19e84767
web_programming/emails_from_url.py (new file, 105 lines)
@@ -0,0 +1,105 @@
"""Get the site emails from URL."""
|
||||
__author__ = "Muhammad Umer Farooq"
|
||||
__license__ = "MIT"
|
||||
__version__ = "1.0.0"
|
||||
__maintainer__ = "Muhammad Umer Farooq"
|
||||
__email__ = "contact@muhammadumerfarooq.me"
|
||||
__status__ = "Alpha"
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
from urllib import parse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class Parser(HTMLParser):
|
||||
def __init__(self, domain: str):
|
||||
HTMLParser.__init__(self)
|
||||
self.data = []
|
||||
self.domain = domain
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: str) -> None:
|
||||
"""
|
||||
This function parse html to take takes url from tags
|
||||
"""
|
||||
# Only parse the 'anchor' tag.
|
||||
if tag == "a":
|
||||
# Check the list of defined attributes.
|
||||
for name, value in attrs:
|
||||
# If href is defined, and not empty nor # print it.
|
||||
if name == "href" and value != "#" and value != "":
|
||||
# If not already in data.
|
||||
if value not in self.data:
|
||||
url = parse.urljoin(self.domain, value)
|
||||
self.data.append(url)


# Get main domain name (example.com)
def get_domain_name(url: str) -> str:
    """
    This function extracts the main domain name from a URL.

    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'c.d'
    >>> get_domain_name("Not a URL!")
    ''
    """
    return ".".join(get_sub_domain_name(url).split(".")[-2:])


# Get sub domain name (sub.example.com)
def get_sub_domain_name(url: str) -> str:
    """
    This function extracts the sub domain name from a URL.

    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'a.b.c.d'
    >>> get_sub_domain_name("Not a URL!")
    ''
    """
    return parse.urlparse(url).netloc


def emails_from_url(url: str = "https://github.com") -> list:
    """
    This function takes a URL and returns a sorted list of all the valid
    email addresses found on the pages it links to.
    """
    # Get the base domain from the url
    domain = get_domain_name(url)

    # Initialize the parser
    parser = Parser(domain)

    try:
        # Open URL
        r = requests.get(url)

        # Pass the raw HTML to the parser to get links
        parser.feed(r.text)

        # Get links and loop through
        valid_emails = set()
        for link in parser.data:
            # Open the link's URL; skip links that cannot be fetched.
            try:
                read = requests.get(link)
                # Get the valid emails (user@domain).
                emails = re.findall("[a-zA-Z0-9]+@" + domain, read.text)
                # Add them to the set to avoid duplicates.
                for email in emails:
                    valid_emails.add(email)
            except (ValueError, requests.RequestException):
                pass
    except (ValueError, requests.RequestException):
        exit(-1)

    # Finally return a sorted list of email addresses with no duplicates.
    return sorted(valid_emails)


if __name__ == "__main__":
    emails = emails_from_url("https://github.com")
    print(f"{len(emails)} emails found:")
    print("\n".join(sorted(emails)))
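The script and the doctests mentioned in the commit message can be run from the repository root with standard CPython invocations; note that the first command makes live HTTP requests to github.com:

python web_programming/emails_from_url.py
python -m doctest web_programming/emails_from_url.py -v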