Python/web_programming/emails_from_url.py
"""Get the site emails from URL."""
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "httpx",
# ]
# ///
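# The block above is PEP 723 inline script metadata; tools that support it,
# such as `uv run`, can resolve the `httpx` dependency automatically before
# executing this script.
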
from __future__ import annotations

__author__ = "Muhammad Umer Farooq"
__license__ = "MIT"
__version__ = "1.0.0"
__maintainer__ = "Muhammad Umer Farooq"
__email__ = "contact@muhammadumerfarooq.me"
__status__ = "Alpha"

import re
from html.parser import HTMLParser
from urllib import parse
import httpx


class Parser(HTMLParser):
    def __init__(self, domain: str) -> None:
        super().__init__()
        self.urls: list[str] = []
        self.domain = domain

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """
        Collect links from anchor tags' ``href`` attributes into ``self.urls``.
        # Only parse the 'anchor' tag.
        if tag == "a":
            # Check the list of defined attributes.
            for name, value in attrs:
                # If href is defined, is not empty or "#", and is not already
                # in self.urls, resolve it against the domain and store it.
                if name == "href" and value not in (*self.urls, "", "#"):
                    url = parse.urljoin(self.domain, value)
                    self.urls.append(url)


# Get the main domain name (example.com).
def get_domain_name(url: str) -> str:
    """
    Return the main domain name of a URL.

    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'c.d'
    >>> get_domain_name("Not a URL!")
    ''
    """
    return ".".join(get_sub_domain_name(url).split(".")[-2:])


# Get the subdomain name (sub.example.com).
def get_sub_domain_name(url: str) -> str:
    """
    Return the network location (subdomain) part of a URL.

    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'a.b.c.d'
    >>> get_sub_domain_name("Not a URL!")
    ''
    """
    return parse.urlparse(url).netloc
def emails_from_url(url: str = "https://github.com") -> list[str]:
"""
This function takes url and return all valid urls
"""
# Get the base domain from the url
domain = get_domain_name(url)
# Initialize the parser
parser = Parser(domain)
try:
# Open URL
r = httpx.get(url, timeout=10, follow_redirects=True)
# pass the raw HTML to the parser to get links
parser.feed(r.text)
# Get links and loop through
valid_emails = set()
for link in parser.urls:
# open URL.
# Check if the link is already absolute
if not link.startswith("http://") and not link.startswith("https://"):
# Prepend protocol only if link starts with domain, normalize otherwise
if link.startswith(domain):
link = f"https://{link}"
else:
link = parse.urljoin(f"https://{domain}", link)
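            # e.g. with domain "github.com" (illustrative paths): the link
            # "github.com/about" becomes "https://github.com/about", and
            # "/about" also resolves to "https://github.com/about".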
            try:
                read = httpx.get(link, timeout=10, follow_redirects=True)
                # Get the valid emails. The pattern matches simple
                # alphanumeric local parts at the (escaped) base domain only.
                emails = re.findall("[a-zA-Z0-9]+@" + re.escape(domain), read.text)
                # Add each match to the set (duplicates are ignored).
                for email in emails:
                    valid_emails.add(email)
            except (ValueError, httpx.RequestError):
                # Skip links that cannot be fetched.
                pass
    except (ValueError, httpx.RequestError):
        raise SystemExit(1)

    # Finally return a sorted list of email addresses with no duplicates.
    return sorted(valid_emails)
if __name__ == "__main__":
emails = emails_from_url("https://github.com")
print(f"{len(emails)} emails found:")
print("\n".join(sorted(emails)))