🤖 Quick attempt at a file to help us slice the project up

This commit is contained in:
Jeff Triplett 2025-01-04 12:32:31 -06:00
parent 087823c88e
commit 5a897c9095
No known key found for this signature in database

337
scripts/main.py Normal file
View File

@ -0,0 +1,337 @@
#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "bs4",
# "httpx",
# "pydantic",
# "python-dateutil",
# "python-frontmatter",
# "python-slugify",
# "pytz",
# "rich",
# "typer",
# "markdown-it-py",
# ]
# ///
import os
import re
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
import frontmatter
import httpx
import typer
from bs4 import BeautifulSoup
from bs4 import Tag
from markdown_it import MarkdownIt
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from rich import print
from rich.progress import track
from slugify import slugify
# Typer application object. The functions decorated with ``@app.command()``
# further down in this file are registered on it as CLI subcommands.
# NOTE(review): ``add_help_option=False`` presumably removes the app-level
# ``--help`` flag while ``no_args_is_help=True`` prints help when no
# arguments are given -- confirm this combination is intended.
app = typer.Typer(
add_help_option=False,
no_args_is_help=True,
rich_markup_mode="rich",
)
class Project(BaseModel):
    """Model representing a Django project from the awesome list.

    Keys that are not declared below are accepted and kept on the instance
    (``extra="allow"``). When ``slug`` is empty after validation it is
    derived from ``name`` with ``slugify``.
    """

    model_config = ConfigDict(extra="allow")

    name: str
    description: str
    url: str
    category: str
    # Left empty by callers that want it derived from ``name``.
    slug: str = Field(default="")
    tags: list[str] = Field(default_factory=list)
    # GitHub metrics stay ``None`` until they are filled in.
    github_stars: int | None = None
    github_forks: int | None = None
    github_last_update: str | None = None
    # URLs the project was reachable under before ``url`` changed.
    previous_urls: list[str] = Field(default_factory=list)

    def model_post_init(self, context: Any, /) -> None:
        """Fill in ``slug`` from ``name`` when no slug was supplied.

        Implemented as pydantic's post-init hook rather than an ``__init__``
        override so the generated, typed constructor signature is kept.
        """
        if not self.slug:
            self.slug = slugify(self.name)
def parse_project_line(line: Tag, category: str) -> Project | None:
    """Parse one list item and return the ``Project`` it describes.

    The first ``<a>`` inside ``line`` supplies the project name (link text)
    and URL (``href``); the rest of the item's text, minus a leading dash,
    becomes the description.

    Returns ``None`` when the item has no link, when the name, URL or
    description is empty, or when anything raises while parsing (the error
    is printed rather than propagated so one bad item does not abort the
    whole run).
    """
    try:
        # Find the project link
        link = line.find("a")
        if not link:
            return None

        name = link.text.strip()
        url = link.get("href", "").strip()

        # The description is the item's text without the link text. Remove
        # only the FIRST occurrence of the name: descriptions frequently
        # mention the project name again and must keep those mentions.
        description = line.text.replace(name, "", 1).strip()
        # Drop the "- " separator between the link and the description.
        description = re.sub(r"^\s*-\s*", "", description)

        if not (name and url and description):
            return None

        return Project(name=name, description=description, url=url, category=category)
    except Exception as e:
        print(f"[red]Error parsing project line: {e}[/red]")
        return None
def read_readme(file_path: Path) -> str:
    """Read the README at ``file_path`` and return it rendered as HTML.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    markdown_content = file_path.read_text(encoding="utf-8")
    return MarkdownIt().render(markdown_content)
def parse_readme(content: str) -> list[Project]:
    """Extract every project listed in the rendered README HTML.

    Walks the ``h2``/``h3``/``li`` elements in document order. Each heading
    sets the category for the list items that follow it; items seen before
    any heading, and items under the "Contents" heading, are skipped.
    """
    soup = BeautifulSoup(content, "html.parser")
    found: list[Project] = []
    heading = ""

    for node in soup.find_all(["h2", "h3", "li"]):
        if node.name in ("h2", "h3"):
            heading = node.text.strip()
            continue

        # Only list items under a real (non table-of-contents) heading count.
        if node.name != "li" or not heading or heading == "Contents":
            continue

        parsed = parse_project_line(node, heading)
        if parsed:
            found.append(parsed)

    return found
def merge_project_data(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
    """
    Merge existing project data with new data, preserving existing values
    while updating with new information where appropriate.

    Rules:
      * ``name``, ``url`` and ``category`` are always taken from ``new`` when
        present there.
      * When the URL changes, the old URL is appended to ``previous_urls``
        (once).
      * ``description`` is replaced when ``new`` carries a different one.
      * GitHub metric fields are replaced only by non-``None`` values.
      * Every other key of ``existing`` is kept as is.

    Neither ``existing`` nor ``new`` is modified; a new dict is returned.
    """
    # Start with the existing data (shallow copy: values are shared, so any
    # list we want to change below must be copied first).
    merged = existing.copy()

    # If the URL is changing, remember the old one.
    if "url" in new and new["url"] != existing.get("url"):
        old_url = existing.get("url")
        # Copy so the caller's list is never mutated; ``or []`` tolerates a
        # missing key as well as an explicit null in the stored metadata.
        previous_urls = list(existing.get("previous_urls") or [])
        if old_url and old_url not in previous_urls:
            previous_urls.append(old_url)
            merged["previous_urls"] = previous_urls

    # Always update core fields from the README
    for field in ("name", "url", "category"):
        if field in new:
            merged[field] = new[field]

    # Replace the description only when the new one actually differs
    if "description" in new and new["description"] != existing.get("description", ""):
        merged["description"] = new["description"]

    # Update GitHub metrics if they exist in new data
    for field in ("github_stars", "github_forks", "github_last_update"):
        if new.get(field) is not None:
            merged[field] = new[field]

    return merged
def save_project(project: Project, output_dir: Path) -> None:
    """Save project as a markdown file with frontmatter, preserving and merging existing content.

    The target is ``output_dir / f"{project.slug}.md"``. If that file already
    exists, its body is kept and its metadata is merged with the project's
    data via ``merge_project_data``. If it does not exist, or cannot be
    loaded or merged, a fresh post is written whose body is the project
    description.
    """
    output_file = output_dir / f"{project.slug}.md"
    project_data = project.model_dump(exclude_none=True)

    post = None
    if output_file.exists():
        try:
            # Load existing file
            existing_post = frontmatter.load(output_file)
            # Merge data, favoring preservation of existing content
            merged_data = merge_project_data(dict(existing_post.metadata), project_data)
            # Create new post with merged data but keep existing content
            post = frontmatter.Post(existing_post.content, **merged_data)
        except Exception as e:
            print(
                f"[yellow]Warning: Could not load existing file {output_file}, creating new: {e}[/yellow]"
            )

    if post is None:
        # New file, or the existing one was unreadable: start from scratch.
        post = frontmatter.Post(project.description, **project_data)

    # Write UTF-8 explicitly instead of relying on the locale encoding, so
    # non-ASCII descriptions do not depend on the platform.
    output_file.write_text(frontmatter.dumps(post), encoding="utf-8")
def extract_github_info(url: str) -> dict[str, str] | None:
"""Extract owner and repo from a GitHub URL."""
parsed = urlparse(url)
if parsed.netloc != "github.com":
return None
parts = parsed.path.strip("/").split("/")
if len(parts) >= 2:
return {"owner": parts[0], "repo": parts[1]}
return None
def get_github_metrics(
    owner: str, repo: str, client: httpx.Client
) -> tuple[dict, str | None]:
    """
    Fetch GitHub metrics for a repository.
    Returns a tuple of (metrics_dict, new_url) where new_url is set if the repo has moved.

    ``metrics_dict`` holds ``github_stars``, ``github_forks`` and
    ``github_last_update``. If the ``GITHUB_TOKEN`` environment variable is
    set it is sent as the ``Authorization`` header.

    On any failure (transport or HTTP error, a body that is not JSON, or a
    payload missing the expected keys) the error is printed and
    ``({}, None)`` is returned; nothing is raised to the caller.
    """
    headers = {}
    if github_token := os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"token {github_token}"

    api_url = f"https://api.github.com/repos/{owner}/{repo}"

    try:
        response = client.get(
            api_url,
            headers=headers,
            timeout=10.0,
            follow_redirects=True,  # Enable following redirects
        )
        # Check the status before touching the body: error responses are not
        # guaranteed to be JSON.
        response.raise_for_status()
        data = response.json()
        metrics = {
            "github_stars": data["stargazers_count"],
            "github_forks": data["forks_count"],
            "github_last_update": data["updated_at"],
        }
    except (httpx.HTTPError, ValueError, KeyError, TypeError) as e:
        # ValueError: body is not valid JSON. KeyError/TypeError: payload
        # does not have the expected shape.
        print(f"[red]Error fetching GitHub metrics for {owner}/{repo}: {e}[/red]")
        return {}, None

    # A 301 anywhere in the redirect chain is treated as "repository moved";
    # the final payload's html_url is then reported as the new location.
    new_url = None
    if any(hop.status_code == 301 for hop in response.history):
        new_url = data.get("html_url")
        if new_url:
            print(f"[yellow]Repository moved: {owner}/{repo} -> {new_url}[/yellow]")

    return metrics, new_url
def load_project(file_path: Path) -> Project | None:
    """Build a ``Project`` from the front matter of a markdown file.

    Returns ``None`` (after printing the error) if the file cannot be read
    or its metadata does not form a valid ``Project``.
    """
    try:
        metadata = frontmatter.load(file_path).metadata
        loaded = Project(**metadata)
    except Exception as exc:
        print(f"[red]Error loading project from {file_path}: {str(exc)}[/red]")
        return None
    return loaded
@app.command()
def parse(readme_path: Path = Path("README.md"), output_dir: str = "_projects"):
    """
    Parse local Awesome Django README and create individual project files with frontmatter.
    Preserves existing file content and metadata while updating with new information from README.
    """
    # Exits with status 1 when the README does not exist.
    if not readme_path.exists():
        print(f"[red]Error: README file not found at {readme_path}[/red]")
        raise typer.Exit(1)

    print(f"[bold blue]Reading README from {readme_path}...[/bold blue]")

    # Create output directory. ``parents=True`` lets a nested path such as
    # "build/_projects" work even when the intermediate directories are missing.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Read and parse README
    content = read_readme(readme_path)
    projects = parse_readme(content)
    print(f"[green]Found {len(projects)} projects[/green]")

    # Save individual project files (one ``<slug>.md`` per project)
    for project in projects:
        save_project(project, output_path)
        print(f"[green]Updated {project.name} in {project.slug}.md[/green]")
@app.command()
def update_metrics(projects_dir: Path = Path("_projects"), batch_size: int = 50):
    """
    Update GitHub metrics (stars, forks, last update) for all projects.
    """
    if not projects_dir.exists():
        print(f"[red]Error: Projects directory not found at {projects_dir}[/red]")
        raise typer.Exit(1)

    # range() below raises on a step of 0 and silently yields nothing for a
    # negative step, so reject both up front with a clear message.
    if batch_size < 1:
        print("[red]Error: batch_size must be at least 1[/red]")
        raise typer.Exit(1)

    print(
        f"[bold blue]Updating GitHub metrics for projects in {projects_dir}...[/bold blue]"
    )

    # Load all projects (sorted so runs process files in a stable order)
    projects = []
    for file in sorted(projects_dir.glob("*.md")):
        if project := load_project(file):
            projects.append((file, project))

    print(f"[green]Found {len(projects)} projects to update[/green]")

    # Projects are processed in slices of ``batch_size``; the progress bar
    # advances once per slice. NOTE: there is no pause between slices, so
    # batching by itself does not throttle requests to the GitHub API.
    with httpx.Client() as client:
        for start in track(
            range(0, len(projects), batch_size), description="Updating projects"
        ):
            batch = projects[start : start + batch_size]
            # NOTE(review): save_project writes to "<slug>.md" inside
            # projects_dir, which is only the file that was loaded when the
            # filename matches the slug -- confirm that always holds.
            for _file_path, project in batch:
                github_info = extract_github_info(project.url)
                if not github_info:
                    # Not a GitHub-hosted project; nothing to fetch.
                    continue

                metrics, new_url = get_github_metrics(
                    github_info["owner"], github_info["repo"], client
                )
                if not metrics:
                    # Fetch failed (already reported); leave the file untouched.
                    continue

                # Update project with new metrics
                for key, value in metrics.items():
                    setattr(project, key, value)

                # Update URL if repository has moved
                if new_url and new_url != project.url:
                    # Store the old URL in previous_urls, without duplicates
                    if project.url not in project.previous_urls:
                        project.previous_urls.append(project.url)
                    # Update to new URL
                    project.url = new_url
                    print(
                        f"[yellow]Updated URL for {project.name}: {project.url}[/yellow]"
                    )

                save_project(project, projects_dir)
                print(f"[green]Updated metrics for {project.name}[/green]")

    print("[bold blue]Finished updating GitHub metrics![/bold blue]")
# Run the Typer CLI only when this file is executed as a script.
if __name__ == "__main__":
    app()