Create emails_from_url.py (#1756)

* Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com>
2025-05-20 07:54:51 +00:00 · 2020-02-26 05:41:56 -05:00 · 2020-02-26 05:41:56 -05:00 · 2b19e84767
commit 2b19e84767
parent c1a4cc96c8
1 changed files with 105 additions and 0 deletions
--- a/web_programming/emails_from_url.py
+++ b/web_programming/emails_from_url.py
@ -0,0 +1,105 @@
+"""Get the site emails from URL."""
+__author__ = "Muhammad Umer Farooq"
+__license__ = "MIT"
+__version__ = "1.0.0"
+__maintainer__ = "Muhammad Umer Farooq"
+__email__ = "contact@muhammadumerfarooq.me"
+__status__ = "Alpha"
+
+import re
+from html.parser import HTMLParser
+from urllib import parse
+
+import requests
+
+
+class Parser(HTMLParser):
+    def __init__(self, domain: str):
+        HTMLParser.__init__(self)
+        self.data = []
+        self.domain = domain
+
+    def handle_starttag(self, tag: str, attrs: str) -> None:
+        """
+        This function parse html to take takes url from tags
+        """
+        # Only parse the 'anchor' tag.
+        if tag == "a":
+            # Check the list of defined attributes.
+            for name, value in attrs:
+                # If href is defined, and not empty nor # print it.
+                if name == "href" and value != "#" and value != "":
+                    # If not already in data.
+                    if value not in self.data:
+                        url = parse.urljoin(self.domain, value)
+                        self.data.append(url)
+
+
+# Get main domain name (example.com)
+def get_domain_name(url: str) -> str:
+    """
+    This function get the main domain name
+
+    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
+    'c.d'
+    >>> get_domain_name("Not a URL!")
+    ''
+    """
+    return ".".join(get_sub_domain_name(url).split(".")[-2:])
+
+
+# Get sub domain name (sub.example.com)
+def get_sub_domain_name(url: str) -> str:
+    """
+    This function get sub domin name
+
+    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
+    'a.b.c.d'
+    >>> get_sub_domain_name("Not a URL!")
+    ''
+    """
+    return parse.urlparse(url).netloc
+
+
+def emails_from_url(url: str = "https://github.com") -> list:
+    """
+    This function takes url and return all valid urls
+    """
+    # Get the base domain from the url
+    domain = get_domain_name(url)
+
+    # Initialize the parser
+    parser = Parser(domain)
+
+    try:
+        # Open URL
+        r = requests.get(url)
+
+        # pass the raw HTML to the parser to get links
+        parser.feed(r.text)
+
+        # Get links and loop through
+        valid_emails = set()
+        for link in parser.data:
+            # open URL.
+            # read = requests.get(link)
+            try:
+                read = requests.get(link)
+                # Get the valid email.
+                emails = re.findall("[a-zA-Z0-9]+@" + domain, read.text)
+                # If not in list then append it.
+                for email in emails:
+                    valid_emails.add(email)
+            except ValueError:
+                pass
+    except ValueError:
+        exit(-1)
+
+    # Finally return a sorted list of email addresses with no duplicates.
+    return sorted(valid_emails)
+
+
+if __name__ == "__main__":
+    emails = emails_from_url("https://github.com")
+    print(f"{len(emails)} emails found:")
+    print("\n".join(sorted(emails)))