awesome-fastapi-projects/app/scrape.py

"""The logic for scraping the source graph data processing it."""
import asyncio

import sqlalchemy
import sqlalchemy.dialects.sqlite
import typer
from loguru import logger
from sqlalchemy.ext.asyncio import AsyncSession

from app.database import Dependency, Repo, RepoDependency
from app.dependencies import acquire_dependencies_data_for_repository
from app.source_graph.client import AsyncSourceGraphSSEClient
from app.source_graph.mapper import (
    create_or_update_repos_from_source_graph_repos_data,
)
from app.types import RepoId
from app.uow import async_session_uow


async def _create_dependencies_for_repo(session: AsyncSession, repo: Repo) -> None:
    """
    Create dependencies for a repo.

    For each parsed dependency, creates a new record in the database if such a
    dependency does not exist yet.
    Then, assigns the dependencies to the given repo.

    :param session: An asynchronous session object
    :param repo: A repo for which to create and assign the dependencies
    """
# Acquire the dependencies data for the repo
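    # acquire_dependencies_data_for_repository returns the current revision
    # hash together with the parsed dependency data and raises RuntimeError
    # when the parsing fails.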
logger.info(
"Acquiring the dependencies data for the repo with id {repo_id}.",
repo_id=repo.id,
enqueue=True,
)
try:
(
revision,
dependencies_create_data,
) = await acquire_dependencies_data_for_repository(repo)
except RuntimeError:
# If the parsing fails,
# just skip creating the dependencies
logger.error(
"Failed to acquire the dependencies data for the repo with id {repo_id}.",
repo_id=repo.id,
enqueue=True,
)
return
if repo.last_checked_revision == revision:
        # The revision has not changed since the last check,
        # so skip re-parsing the dependencies
logger.info(
"The repo with id {repo_id} has fresh dependencies.",
repo_id=repo.id,
enqueue=True,
)
return
if not dependencies_create_data:
# If there are no dependencies,
# just skip creating the dependencies
logger.info(
"The repo with id {repo_id} has no dependencies.",
repo_id=repo.id,
enqueue=True,
)
return
# Update the repo with the revision hash
logger.info(
"Updating the repo with id {repo_id} with the revision hash {revision}.",
repo_id=repo.id,
revision=revision,
enqueue=True,
)
update_repo_statement = (
sqlalchemy.update(Repo)
.where(Repo.id == repo.id)
.values(last_checked_revision=revision)
)
await session.execute(update_repo_statement)
# Create dependencies - on conflict do nothing.
# This is to avoid creating duplicate dependencies.
logger.info(
"Creating the dependencies for the repo with id {repo_id}.",
repo_id=repo.id,
enqueue=True,
)
insert_dependencies_statement = sqlalchemy.dialects.sqlite.insert(
Dependency
).on_conflict_do_nothing(index_elements=[Dependency.name])
await session.execute(
insert_dependencies_statement.returning(Dependency),
[
{
"name": dependency_data.name,
}
for dependency_data in dependencies_create_data
],
)
# Re-fetch the dependencies from the database
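    # (ON CONFLICT DO NOTHING means the RETURNING clause only yields rows that
    # were actually inserted, so all required rows are selected explicitly)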
dependencies = (
await session.scalars(
sqlalchemy.select(Dependency).where(
Dependency.name.in_(
[
dependency_data.name
for dependency_data in dependencies_create_data
]
)
)
)
).all()
# Add the dependencies to the repo
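    # Conflicting (repo_id, dependency_id) pairs are ignored, so existing links
    # are left untouched.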
insert_repo_dependencies_statement = sqlalchemy.dialects.sqlite.insert(
RepoDependency
).on_conflict_do_nothing([RepoDependency.repo_id, RepoDependency.dependency_id])
await session.execute(
insert_repo_dependencies_statement,
[
{
"repo_id": repo.id,
"dependency_id": dependency.id,
}
for dependency in dependencies
],
)


async def scrape_source_graph_repos() -> None:
    """
    Iterate over the source graph repos and create or update them in the database.

    :return: None
    """
async with AsyncSourceGraphSSEClient() as sg_client:
async with async_session_uow() as session:
async with asyncio.TaskGroup() as tg:
logger.info(
"Creating or updating repos from source graph repos data.",
enqueue=True,
)
async for sg_repos_data in sg_client.aiter_fastapi_repos():
logger.info(
"Received {count} repos.",
count=len(sg_repos_data),
enqueue=True,
)
tg.create_task(
create_or_update_repos_from_source_graph_repos_data(
session=session,
source_graph_repos_data=sg_repos_data,
)
)
await session.commit()


async def parse_dependencies_for_repo(
    semaphore: asyncio.Semaphore, repo_id: RepoId
) -> None:
    """
    Parse the dependencies for a given repo and create them in the database.

    :param semaphore: A semaphore to limit the number of concurrent requests
    :param repo_id: The id of the repo for which to parse the dependencies
    :return: None
    """
async with async_session_uow() as session, semaphore:
# Fetch the repo from the database
logger.info(
"Fetching the repo with id {repo_id}.", repo_id=repo_id, enqueue=True
)
repo = (
await session.scalars(sqlalchemy.select(Repo).where(Repo.id == repo_id))
).one()
# Create the dependencies for the repo
logger.info(
"Creating the dependencies for the repo with id {repo_id}.",
repo_id=repo_id,
enqueue=True,
)
await _create_dependencies_for_repo(session=session, repo=repo)
await session.commit()


async def parse_dependencies_for_repos() -> None:
    """
    Parse the dependencies for all the repos in the database.

    :return: None.
    """
logger.info("Fetching the repos from the database.", enqueue=True)
async with async_session_uow() as session:
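        # Fetch the repo ids, putting repos that have never been checked
        # (last_checked_revision IS NULL) first.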
repo_ids = (
await session.scalars(
sqlalchemy.select(Repo.id).order_by(
Repo.last_checked_revision.is_(None).desc()
)
)
).all()
logger.info("Fetched {count} repos.", count=len(repo_ids), enqueue=True)
logger.info("Parsing the dependencies for the repos.", enqueue=True)
semaphore = asyncio.Semaphore(10)
async with asyncio.TaskGroup() as tg:
for repo_id in repo_ids:
logger.info(
"Parsing the dependencies for repo {repo_id}.",
repo_id=repo_id,
enqueue=True,
)
tg.create_task(
parse_dependencies_for_repo(
semaphore=semaphore, repo_id=RepoId(repo_id)
)
)


app = typer.Typer()


@app.command()
def scrape_repos() -> None:
    """
    Scrape the FastAPI-related repositories utilizing the source graph API.

    :return: None
    """
logger.info("Scraping the source graph repos.", enqueue=True)
asyncio.run(scrape_source_graph_repos())


@app.command()
def parse_dependencies() -> None:
    """
    Parse the dependencies for all the repos in the database.

    :return: None.
    """
logger.info(
"Parsing the dependencies for all the repos in the database.", enqueue=True
)
asyncio.run(parse_dependencies_for_repos())


if __name__ == "__main__":
app()
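
# Example CLI usage (a sketch, assuming the "app" package is importable, e.g.
# when running from the project root; Typer derives the command names from the
# function names, replacing underscores with dashes):
#
#   python -m app.scrape scrape-repos
#   python -m app.scrape parse-dependencies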