#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "bs4",
#     "httpx",
#     "pydantic",
#     "python-dateutil",
#     "python-frontmatter",
#     "python-slugify",
#     "pytz",
#     "rich",
#     "typer",
#     "markdown-it-py",
# ]
# ///
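# The block above is PEP 723 inline script metadata: running the file with
# `uv run <script>` resolves the listed dependencies on the fly, so no
# separate virtualenv setup is needed.
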
import os
import re
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import frontmatter
import httpx
import typer
from bs4 import BeautifulSoup
from bs4 import Tag
from markdown_it import MarkdownIt
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from rich import print
from rich.progress import track
from slugify import slugify


app = typer.Typer(
    add_help_option=False,
    no_args_is_help=True,
    rich_markup_mode="rich",
)


class Project(BaseModel):
    """Model representing a Django project from the awesome list."""

    model_config = ConfigDict(extra="allow")

    name: str
    description: str
    url: str
    category: str
    slug: str = Field(default="")
    tags: list[str] = Field(default_factory=list)
    github_stars: int | None = None
    github_forks: int | None = None
    github_last_update: str | None = None
    previous_urls: list[str] = Field(default_factory=list)

    def __init__(self, **data):
        super().__init__(**data)
        if not self.slug:
            self.slug = slugify(self.name)


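# Illustrative example (not executed): the custom __init__ back-fills the
# slug from the name, so a Project created as
#
#     Project(name="Django REST framework", description="Web APIs for Django.",
#             url="https://github.com/encode/django-rest-framework", category="APIs")
#
# gets slug "django-rest-framework", which later becomes its output filename.
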
def parse_project_line(line: Tag, category: str) -> Project | None:
    """Parse a project line from the markdown and return a Project object."""
    try:
        # Find the project link
        link = line.find("a")
        if not link:
            return None

        name = link.text.strip()
        url = link.get("href", "").strip()

        # Get description (text after the link)
        description = line.text.replace(name, "").strip()
        description = re.sub(r"^\s*-\s*", "", description)  # Remove leading dash
        description = re.sub(r"^\s*", "", description)  # Remove leading whitespace

        if not all([name, url, description]):
            return None

        return Project(name=name, description=description, url=url, category=category)
    except Exception as e:
        print(f"[red]Error parsing project line: {e}[/red]")
        return None


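# For reference: a typical awesome-list entry renders to HTML as
#
#     <li><a href="https://github.com/owner/repo">project-name</a> - Short description.</li>
#
# so the parser above recovers the description by stripping the link text
# and the leading " - " separator.
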
def read_readme(file_path: Path) -> str:
    """Read README content from a local file and convert it to HTML."""
    markdown_content = file_path.read_text()
    md = MarkdownIt()
    html_content = md.render(markdown_content)
    return html_content


def parse_readme(content: str) -> list[Project]:
    """Parse README content and extract projects."""
    soup = BeautifulSoup(content, "html.parser")
    projects = []
    current_category = ""

    for element in soup.find_all(["h2", "h3", "li"]):
        if element.name in ["h2", "h3"]:
            current_category = element.text.strip()
        elif element.name == "li" and current_category:
            # Skip the table-of-contents list at the top of the README
            if current_category == "Contents":
                continue

            project = parse_project_line(element, current_category)
            if project:
                projects.append(project)

    return projects


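# Minimal sketch of the pipeline these two functions form (input invented,
# but the project is a real entry in the list):
#
#     html = MarkdownIt().render(
#         "## Admin\n- [django-grappelli](https://github.com/sehmaschine/django-grappelli) - A jazzy skin for the admin."
#     )
#     parse_readme(html)
#     # -> [Project(name="django-grappelli", category="Admin", ...)]
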
def merge_project_data(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
    """
    Merge existing project data with new data, preserving existing values
    while updating with new information where appropriate.
    """
    # Start with the existing data
    merged = existing.copy()

    # Always update core fields from the README
    core_fields = {"name", "url", "category"}
    for field in core_fields:
        if field in new:
            # If the URL is changing, store the old URL in previous_urls
            if field == "url" and new["url"] != existing.get("url"):
                previous_urls = merged.get("previous_urls", [])
                old_url = existing.get("url")
                if old_url and old_url not in previous_urls:
                    previous_urls.append(old_url)
                merged["previous_urls"] = previous_urls
            merged[field] = new[field]

    # Update the description only when it differs from the stored one
    if "description" in new and new["description"] != existing.get("description", ""):
        merged["description"] = new["description"]

    # Update GitHub metrics if they exist in the new data
    github_fields = {"github_stars", "github_forks", "github_last_update"}
    for field in github_fields:
        if field in new and new[field] is not None:
            merged[field] = new[field]

    return merged


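# Illustrative merge (values invented): a URL change is recorded in
# previous_urls, while keys that only exist in the stored frontmatter
# (e.g. hand-edited ones) survive untouched:
#
#     merge_project_data(
#         {"url": "https://github.com/old/repo", "featured": True},
#         {"url": "https://github.com/new/repo"},
#     )
#     # -> {"url": "https://github.com/new/repo", "featured": True,
#     #     "previous_urls": ["https://github.com/old/repo"]}
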
def save_project(project: Project, output_dir: Path):
    """Save project as a markdown file with frontmatter, preserving and merging existing content."""
    output_file = output_dir / f"{project.slug}.md"
    project_data = project.model_dump(exclude_none=True)

    if output_file.exists():
        try:
            # Load existing file
            existing_post = frontmatter.load(output_file)
            existing_data = dict(existing_post.metadata)

            # Merge data, favoring preservation of existing content
            merged_data = merge_project_data(existing_data, project_data)

            # Create new post with merged data but keep existing content
            post = frontmatter.Post(existing_post.content, **merged_data)
        except Exception as e:
            print(
                f"[yellow]Warning: Could not load existing file {output_file}, creating new: {e}[/yellow]"
            )
            post = frontmatter.Post(project.description, **project_data)
    else:
        # Create new file
        post = frontmatter.Post(project.description, **project_data)

    output_file.write_text(frontmatter.dumps(post))


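# The files written here are Jekyll-style markdown with YAML frontmatter,
# roughly:
#
#     ---
#     category: Admin
#     name: django-grappelli
#     url: https://github.com/sehmaschine/django-grappelli
#     ---
#     A jazzy skin for the admin.
#
# (field order and extra fields depend on frontmatter.dumps and the merged
# metadata)
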
def extract_github_info(url: str) -> dict[str, str] | None:
    """Extract owner and repo from a GitHub URL."""
    parsed = urlparse(url)
    if parsed.netloc != "github.com":
        return None

    parts = parsed.path.strip("/").split("/")
    if len(parts) >= 2:
        return {"owner": parts[0], "repo": parts[1]}
    return None


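# e.g. extract_github_info("https://github.com/wsvincent/awesome-django")
#      -> {"owner": "wsvincent", "repo": "awesome-django"}
# Non-GitHub URLs return None, so only GitHub-hosted projects get metrics.
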
def get_github_metrics(
    owner: str, repo: str, client: httpx.Client
) -> tuple[dict, str | None]:
    """
    Fetch GitHub metrics for a repository.

    Returns a tuple of (metrics_dict, new_url) where new_url is set if the repo has moved.
    """
    headers = {}
    if github_token := os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"token {github_token}"

    api_url = f"https://api.github.com/repos/{owner}/{repo}"
    try:
        response = client.get(
            api_url,
            headers=headers,
            timeout=10.0,
            follow_redirects=True,  # Enable following redirects
        )

        # Check whether we followed a permanent redirect, which means the
        # repository was renamed or transferred to a new owner
        new_url = None
        if len(response.history) > 0:
            for r in response.history:
                if r.status_code == 301:
                    # Get the new location from the API response
                    data = response.json()
                    new_url = data.get("html_url")
                    if new_url:
                        print(
                            f"[yellow]Repository moved: {owner}/{repo} -> {new_url}[/yellow]"
                        )
                    break

        response.raise_for_status()
        data = response.json()

        return {
            "github_stars": data["stargazers_count"],
            "github_forks": data["forks_count"],
            "github_last_update": data["updated_at"],
        }, new_url

    except httpx.HTTPError as e:
        print(f"[red]Error fetching GitHub metrics for {owner}/{repo}: {str(e)}[/red]")
        return {}, None


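# Note: unauthenticated requests to api.github.com are limited to 60 per hour,
# so set GITHUB_TOKEN in the environment for a full run. stargazers_count,
# forks_count, updated_at, and html_url are all standard fields of the
# "get a repository" API response.
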
def load_project(file_path: Path) -> Project | None:
    """Load a project from a markdown file."""
    try:
        post = frontmatter.load(file_path)
        return Project(**post.metadata)
    except Exception as e:
        print(f"[red]Error loading project from {file_path}: {str(e)}[/red]")
        return None


@app.command()
def parse(readme_path: Path = Path("README.md"), output_dir: str = "_projects"):
    """
    Parse the local Awesome Django README and create individual project files with frontmatter.

    Preserves existing file content and metadata while updating with new information from the README.
    """
    if not readme_path.exists():
        print(f"[red]Error: README file not found at {readme_path}[/red]")
        raise typer.Exit(1)

    print(f"[bold blue]Reading README from {readme_path}...[/bold blue]")

    # Create the output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    # Read and parse the README
    content = read_readme(readme_path)
    projects = parse_readme(content)

    print(f"[green]Found {len(projects)} projects[/green]")

    # Save individual project files
    for project in projects:
        save_project(project, output_path)
        print(f"[green]Updated {project.name} in {project.slug}.md[/green]")


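# Typical invocation of the command above (script filename illustrative):
#
#     uv run scripts/projects.py parse --readme-path README.md --output-dir _projects
#
# typer derives the --readme-path/--output-dir options from the function
# parameters, since both have default values.
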
@app.command()
def update_metrics(projects_dir: Path = Path("_projects"), batch_size: int = 50):
    """
    Update GitHub metrics (stars, forks, last update) for all projects.
    """
    if not projects_dir.exists():
        print(f"[red]Error: Projects directory not found at {projects_dir}[/red]")
        raise typer.Exit(1)

    print(
        f"[bold blue]Updating GitHub metrics for projects in {projects_dir}...[/bold blue]"
    )

    # Load all projects
    project_files = list(projects_dir.glob("*.md"))
    projects = []
    for file in project_files:
        if project := load_project(file):
            projects.append((file, project))

    print(f"[green]Found {len(projects)} projects to update[/green]")

    # Update metrics in batches to avoid rate limiting
    with httpx.Client() as client:
        for i in track(
            range(0, len(projects), batch_size), description="Updating projects"
        ):
            batch = projects[i : i + batch_size]
            for file_path, project in batch:
                if github_info := extract_github_info(project.url):
                    metrics, new_url = get_github_metrics(
                        github_info["owner"], github_info["repo"], client
                    )

                    if metrics:
                        # Update project with new metrics
                        for key, value in metrics.items():
                            setattr(project, key, value)

                    # Update URL if the repository has moved
                    if new_url and new_url != project.url:
                        # Store the old URL in previous_urls (the model gives the
                        # field a default, so the hasattr guard is just a safety net)
                        if not hasattr(project, "previous_urls"):
                            project.previous_urls = []
                        project.previous_urls.append(project.url)
                        # Update to the new URL
                        project.url = new_url
                        print(
                            f"[yellow]Updated URL for {project.name}: {project.url}[/yellow]"
                        )

                    save_project(project, projects_dir)
                    print(f"[green]Updated metrics for {project.name}[/green]")

    print("[bold blue]Finished updating GitHub metrics![/bold blue]")


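# And for the metrics pass (same illustrative filename; typer exposes the
# command as "update-metrics", converting the underscore to a hyphen):
#
#     GITHUB_TOKEN=ghp_... uv run scripts/projects.py update-metrics --batch-size 50
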
if __name__ == "__main__":
    app()