Python/web_programming/emails_from_url.py

"""Get the site emails from URL."""

from __future__ import annotations

__author__ = "Muhammad Umer Farooq"
__license__ = "MIT"
__version__ = "1.0.0"
__maintainer__ = "Muhammad Umer Farooq"
__email__ = "contact@muhammadumerfarooq.me"
__status__ = "Alpha"

import re
from html.parser import HTMLParser
from urllib import parse

import requests


class Parser(HTMLParser):
    """
    HTML parser that collects the targets of all anchor (``<a href=...>``) tags.

    Every usable ``href`` is resolved against ``domain`` with
    ``urllib.parse.urljoin`` and stored once, in document order, in ``urls``.

    >>> parser = Parser("https://example.com")
    >>> parser.feed('<a href="/a">1</a><a href="/a">2</a><a href="#">3</a>')
    >>> parser.urls
    ['https://example.com/a']
    """

    def __init__(self, domain: str) -> None:
        """
        :param domain: base that every ``href`` is joined onto; relative links
            only become absolute URLs when this base carries a scheme.
        """
        super().__init__()
        # Resolved link targets, in order of first appearance, no duplicates.
        self.urls: list[str] = []
        self.domain = domain

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """
        Record the resolved ``href`` of an anchor tag.

        Invoked by ``HTMLParser.feed`` for each start tag; tags other than
        ``<a>`` are ignored.

        :param tag: lower-cased tag name.
        :param attrs: ``(name, value)`` pairs of the tag's attributes.
        """
        # Only parse the 'anchor' tag.
        if tag != "a":
            return

        for name, value in attrs:
            # ``value`` is None for a bare ``href`` attribute and "" for an empty
            # one; "#" only points back at the current page.
            if name != "href" or not value or value == "#":
                continue
            try:
                url = parse.urljoin(self.domain, value)
            except ValueError:
                # A malformed href (e.g. an unbalanced IPv6 bracket) must not
                # abort parsing of the rest of the document.
                continue
            # Compare the *resolved* URL, since that is what the list stores.
            if url not in self.urls:
                self.urls.append(url)


# Reduce a URL to its main domain (example.com)
def get_domain_name(url: str) -> str:
    """
    Return the last two dot-separated labels of what ``get_sub_domain_name``
    yields for ``url``.

    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'c.d'
    >>> get_domain_name("Not a URL!")
    ''
    """
    labels = get_sub_domain_name(url).split(".")
    # Slicing never fails: fewer than two labels are simply joined as they are.
    return ".".join(labels[-2:])


# Get the host name including sub domains (sub.example.com)
def get_sub_domain_name(url: str) -> str:
    """
    Return the host name of ``url``, or an empty string if it has none.

    Credentials and the port are not part of the result.

    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'a.b.c.d'
    >>> get_sub_domain_name("https://user:secret@a.b.c.d:8080/e")
    'a.b.c.d'
    >>> get_sub_domain_name("Not a URL!")
    ''
    """
    # ``netloc`` would keep "user:secret@" and ":8080"; ``hostname`` strips both
    # (and lower-cases the name) and is None when the URL has no host at all.
    return parse.urlparse(url).hostname or ""


def emails_from_url(
    url: str = "https://github.com", timeout: float = 10.0
) -> list[str]:
    """
    Collect the e-mail addresses of the URL's main domain from the pages it links to.

    The page at ``url`` is downloaded, every anchor ``href`` found on it is
    resolved against ``url``, and each linked page is downloaded in turn and
    searched for addresses of the form ``<local-part>@<main domain>``.

    :param url: absolute URL of the page whose links are crawled.
    :param timeout: seconds to wait for each HTTP request before giving up.
    :return: sorted list of the unique e-mail addresses found.
    :raises SystemExit: with status 1 if fetching or parsing ``url`` raises
        ``ValueError`` (for example when the URL has no scheme).
    """
    # Get the base domain from the url
    domain = get_domain_name(url)

    # Escape the domain so that its dots match literally, and accept the usual
    # punctuation in the local part so "john.doe@..." is not cut down to "doe@...".
    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@" + re.escape(domain))

    # Resolve links against the full URL: the bare domain has no scheme, so
    # joining onto it would leave relative links un-fetchable.
    parser = Parser(url)

    try:
        # Open URL
        r = requests.get(url, timeout=timeout)

        # pass the raw HTML to the parser to get links
        parser.feed(r.text)
    except ValueError:
        raise SystemExit(1) from None

    valid_emails: set[str] = set()
    # dict.fromkeys drops repeated links while keeping their order, so no page
    # is downloaded twice.
    for link in dict.fromkeys(parser.urls):
        try:
            read = requests.get(link, timeout=timeout)
        except (ValueError, requests.RequestException):
            # Best-effort crawl: skip links that are malformed, use an
            # unsupported scheme (mailto:, javascript:, ...) or cannot be reached.
            continue
        valid_emails.update(email_pattern.findall(read.text))

    # Finally return a sorted list of email addresses with no duplicates.
    return sorted(valid_emails)


if __name__ == "__main__":
    # Demo run: crawl GitHub's front page and list what was found,
    # count first, then one address per line.
    found = emails_from_url("https://github.com")
    print(f"{len(found)} emails found:")
    print(*sorted(found), sep="\n")
Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00			`"""Get the site emails from URL."""`
[pre-commit.ci] pre-commit autoupdate (#11322) * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.2.2 → v0.3.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.2.2...v0.3.2) - [github.com/pre-commit/mirrors-mypy: v1.8.0 → v1.9.0](https://github.com/pre-commit/mirrors-mypy/compare/v1.8.0...v1.9.0) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2024-03-13 06:52:41 +00:00
Pyupgrade to Python 3.9 (#4718) * Pyupgrade to Python 3.9 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2021-09-07 11:37:03 +00:00			`from __future__ import annotations`

Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00			`__author__ = "Muhammad Umer Farooq"`
			`__license__ = "MIT"`
			`__version__ = "1.0.0"`
			`__maintainer__ = "Muhammad Umer Farooq"`
			`__email__ = "contact@muhammadumerfarooq.me"`
			`__status__ = "Alpha"`

			`import re`
			`from html.parser import HTMLParser`
			`from urllib import parse`

			`import requests`


			`class Parser(HTMLParser):`
[mypy] Fix web_programming directory (#4297) * Update world_covid19_stats.py * Delete monkeytype_config.py * updating DIRECTORY.md * Apply pyannotate suggestions to emails_from_url.py * mypy web_programming/emails_from_url.py * super().__init__() * mypy --ignore-missing-imports web_programming/emails_from_url.py * Update emails_from_url.py * self.urls: list[str] = [] * mypy: Fix web_programming directory Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Co-authored-by: Dhruv Manilawala <dhruvmanila@gmail.com> 2021-03-31 03:18:07 +00:00			`def __init__(self, domain: str) -> None:`
			`super().__init__()`
			`self.urls: list[str] = []`
Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00			`self.domain = domain`

Pyupgrade to Python 3.9 (#4718) * Pyupgrade to Python 3.9 * updating DIRECTORY.md Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> 2021-09-07 11:37:03 +00:00			`def handle_starttag(self, tag: str, attrs: list[tuple[str, str \| None]]) -> None:`
Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00			`"""`
			`This function parse html to take takes url from tags`
			`"""`
			`# Only parse the 'anchor' tag.`
			`if tag == "a":`
			`# Check the list of defined attributes.`
			`for name, value in attrs:`
Enable ruff SIM102 rule (#11341) * Enable ruff SIM102 rule * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2024-04-02 01:27:56 +00:00			`# If href is defined, not empty nor # print it and not already in urls.`
			`if (`
			`name == "href"`
			`and value != "#"`
			`and value != ""`
			`and value not in self.urls`
			`):`
			`url = parse.urljoin(self.domain, value)`
			`self.urls.append(url)`
Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00

			`# Get main domain name (example.com)`
			`def get_domain_name(url: str) -> str:`
			`"""`
			`This function get the main domain name`

			`>>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")`
			`'c.d'`
			`>>> get_domain_name("Not a URL!")`
			`''`
			`"""`
			`return ".".join(get_sub_domain_name(url).split(".")[-2:])`


			`# Get sub domain name (sub.example.com)`
			`def get_sub_domain_name(url: str) -> str:`
			`"""`
			`>>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")`
			`'a.b.c.d'`
			`>>> get_sub_domain_name("Not a URL!")`
			`''`
			`"""`
			`return parse.urlparse(url).netloc`


[mypy] Fix web_programming directory (#4297) * Update world_covid19_stats.py * Delete monkeytype_config.py * updating DIRECTORY.md * Apply pyannotate suggestions to emails_from_url.py * mypy web_programming/emails_from_url.py * super().__init__() * mypy --ignore-missing-imports web_programming/emails_from_url.py * Update emails_from_url.py * self.urls: list[str] = [] * mypy: Fix web_programming directory Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Co-authored-by: Dhruv Manilawala <dhruvmanila@gmail.com> 2021-03-31 03:18:07 +00:00			`def emails_from_url(url: str = "https://github.com") -> list[str]:`
Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00			`"""`
			`This function takes url and return all valid urls`
			`"""`
			`# Get the base domain from the url`
			`domain = get_domain_name(url)`

			`# Initialize the parser`
			`parser = Parser(domain)`

			`try:`
			`# Open URL`
			`r = requests.get(url)`

			`# pass the raw HTML to the parser to get links`
			`parser.feed(r.text)`

			`# Get links and loop through`
			`valid_emails = set()`
[mypy] Fix web_programming directory (#4297) * Update world_covid19_stats.py * Delete monkeytype_config.py * updating DIRECTORY.md * Apply pyannotate suggestions to emails_from_url.py * mypy web_programming/emails_from_url.py * super().__init__() * mypy --ignore-missing-imports web_programming/emails_from_url.py * Update emails_from_url.py * self.urls: list[str] = [] * mypy: Fix web_programming directory Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Co-authored-by: Dhruv Manilawala <dhruvmanila@gmail.com> 2021-03-31 03:18:07 +00:00			`for link in parser.urls:`
Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00			`# open URL.`
			`# read = requests.get(link)`
			`try:`
			`read = requests.get(link)`
			`# Get the valid email.`
			`emails = re.findall("[a-zA-Z0-9]+@" + domain, read.text)`
			`# If not in list then append it.`
			`for email in emails:`
			`valid_emails.add(email)`
			`except ValueError:`
			`pass`
			`except ValueError:`
Misc fixes across multiple algorithms (#6912) Source: Snyk code quality Add scikit-fuzzy to requirements Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Dhruv Manilawala <dhruvmanila@gmail.com> 2022-10-16 05:25:38 +00:00			`raise SystemExit(1)`
Create emails_from_url.py (#1756) * Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <itssvinayak@gmail.com> Co-authored-by: Christian Clauss <cclauss@me.com> 2020-02-26 10:41:56 +00:00
			`# Finally return a sorted list of email addresses with no duplicates.`
			`return sorted(valid_emails)`


			`if __name__ == "__main__":`
			`emails = emails_from_url("https://github.com")`
			`print(f"{len(emails)} emails found:")`
			`print("\n".join(sorted(emails)))`