Create emails_from_url.py (#1756)

* Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com>
2025-04-29 17:13:36 +00:00 · 2020-02-26 05:41:56 -05:00 · 2020-02-26 05:41:56 -05:00 · 2b19e84767
commit 2b19e84767
parent c1a4cc96c8
1 changed files with 105 additions and 0 deletions
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@@ -0,0 +1,105 @@
 """Get the site emails from URL."""
 __author__ = "Muhammad Umer Farooq"
 __license__ = "MIT"
 __version__ = "1.0.0"
 __maintainer__ = "Muhammad Umer Farooq"
 __email__ = "contact@muhammadumerfarooq.me"
 __status__ = "Alpha"
 import re
 from html.parser import HTMLParser
 from urllib import parse
 import requests
 class Parser(HTMLParser):
     """
     HTML parser that collects the target of every anchor tag fed to it.

     Each ``href`` is resolved against ``domain`` with ``urllib.parse.urljoin``
     and stored once, in first-seen order, in ``self.data``.
     """

     def __init__(self, domain: str) -> None:
         super().__init__()
         # Resolved link targets, in the order they were first encountered.
         self.data = []
         # Base that relative hrefs are joined onto.
         self.domain = domain

     def handle_starttag(self, tag: str, attrs: list) -> None:
         """
         Record the link target of an anchor tag.

         ``attrs`` is the list of ``(name, value)`` pairs supplied by
         ``HTMLParser``; ``value`` is ``None`` for a bare attribute such as
         ``<a href>``.
         """
         # Only anchor tags are inspected.
         if tag != "a":
             return
         for name, value in attrs:
             # Skip other attributes, missing/empty hrefs and the "#" placeholder.
             if name != "href" or not value or value == "#":
                 continue
             url = parse.urljoin(self.domain, value)
             # Compare the resolved URL: self.data holds resolved URLs, so
             # checking the raw href would miss repeated relative links.
             if url not in self.data:
                 self.data.append(url)
 # Get main domain name (example.com)
 def get_domain_name(url: str) -> str:
     """
     Return the last two dot-separated labels of the host name found in ``url``.

     >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
     'c.d'
     >>> get_domain_name("Not a URL!")
     ''
     """
     host = get_sub_domain_name(url)
     labels = host.split(".")
     # Slicing tolerates hosts with fewer than two labels (including "").
     return ".".join(labels[-2:])
 # Get sub domain name (sub.example.com)
 def get_sub_domain_name(url: str) -> str:
     """
     Return the full host name of ``url`` (for example ``sub.example.com``).

     Credentials and port are not part of the result, and the host is
     lower-cased by ``urllib.parse``. An empty string is returned when ``url``
     has no network location.

     >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
     'a.b.c.d'
     >>> get_sub_domain_name("https://user:secret@a.b.c.d:8080/e")
     'a.b.c.d'
     >>> get_sub_domain_name("Not a URL!")
     ''
     """
     # ``netloc`` would also carry "user:pass@" and ":port"; ``hostname`` strips
     # them and is None when the URL has no network location.
     return parse.urlparse(url).hostname or ""
 def emails_from_url(url: str = "https://github.com") -> list:
     """
     Return the sorted, de-duplicated email addresses found via ``url``.

     The page at ``url`` is downloaded, every link it contains is fetched in
     turn, and each fetched page is searched for addresses of the form
     ``<alphanumerics>@<domain>``, where ``<domain>`` is
     ``get_domain_name(url)``. Links that cannot be fetched are skipped.

     Raises ``SystemExit(-1)`` when fetching or parsing ``url`` itself fails
     with a ``ValueError`` (for example a URL without a scheme).
     """
     # Get the base domain from the url
     domain = get_domain_name(url)
     # Resolve relative hrefs against the full URL: joining them onto the bare
     # domain (no scheme) yields scheme-less links that requests rejects.
     parser = Parser(url)
     # Escape the domain so its dots match literally; compile once for all pages.
     email_pattern = re.compile("[a-zA-Z0-9]+@" + re.escape(domain))
     valid_emails = set()
     try:
         # A timeout keeps an unresponsive server from hanging the crawl.
         response = requests.get(url, timeout=10)
         # Pass the raw HTML to the parser to collect the links.
         parser.feed(response.text)
     except ValueError:
         # Same outcome as the site-provided exit(-1), without relying on it.
         raise SystemExit(-1)
     for link in parser.data:
         try:
             page_text = requests.get(link, timeout=10).text
         except (ValueError, requests.RequestException):
             # Best effort: one unreachable or invalid link must not abort
             # the whole crawl.
             continue
         valid_emails.update(email_pattern.findall(page_text))
     # Finally return a sorted list of email addresses with no duplicates.
     return sorted(valid_emails)
 if __name__ == "__main__":
     # Demo run: crawl the GitHub front page and report what was found.
     emails = emails_from_url("https://github.com")
     print(f"{len(emails)} emails found:")
     # One address per line, in sorted order.
     listing = "\n".join(sorted(emails))
     print(listing)