Fix repos scraping and parsing (#28)

This commit is contained in:
Vladyslav Fedoriuk 2023-11-19 23:17:57 +01:00 committed by GitHub
parent 0610553651
commit d4c00793b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 110 additions and 71 deletions

View File

@ -1,6 +1,6 @@
# Awesome FastAPI Projects # Awesome FastAPI Projects
View the website: https://Kludex.github.io/awesome-fastapi-projects/ View the website: https://kludex.github.io/awesome-fastapi-projects/
## Local Development ## Local Development
@ -42,7 +42,7 @@ make
#### Frontend #### Frontend
The frontend is built with [React](https://reactjs.org/) and [Next.js](https://nextjs.org/). The frontend is built with [React](https://reactjs.org/) and [Next.js](https://nextjs.org/).
It is being statically built and served on GitHub Pages: https://Kludex.github.io/awesome-fastapi-projects/ It is being statically built and served on GitHub Pages: https://kludex.github.io/awesome-fastapi-projects/
To run the frontend locally, you need to install [Node.js](https://nodejs.org/en/) and [pnpm](https://pnpm.io/). To run the frontend locally, you need to install [Node.js](https://nodejs.org/en/) and [pnpm](https://pnpm.io/).
The node version is specified in the `.node-version` file. The node version is specified in the `.node-version` file.

View File

@ -132,13 +132,12 @@ async def db_uow(
db_session: AsyncSession, db_session: AsyncSession,
) -> AsyncGenerator[AsyncSession, None]: ) -> AsyncGenerator[AsyncSession, None]:
"""Provide a transactional scope around a series of operations.""" """Provide a transactional scope around a series of operations."""
from app.uow import async_session_uow
# This context manager will start a transaction, and roll it back at the end # This context manager will start a transaction, and roll it back at the end
# https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#sqlalchemy.ext.asyncio.AsyncSessionTransaction # https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#sqlalchemy.ext.asyncio.AsyncSessionTransaction
async with db_session.begin() as transaction: async with async_session_uow(db_session) as session:
try: yield session
yield db_session
finally:
await transaction.rollback()
@pytest.fixture() @pytest.fixture()

View File

@ -18,7 +18,7 @@ import aiofiles
import sqlalchemy.orm import sqlalchemy.orm
import typer import typer
from app.database import Dependency, Repo from app.database import Dependency, Repo, async_session_maker
from app.models import DependencyDetail, RepoDetail from app.models import DependencyDetail, RepoDetail
from app.uow import async_session_uow from app.uow import async_session_uow
@ -38,9 +38,9 @@ async def create_repos_index() -> None:
:return: None :return: None
""" """
async with async_session_uow() as session, aiofiles.open( async with async_session_maker() as session, async_session_uow(
REPOS_INDEX_PATH, "w" session
) as index_file: ), aiofiles.open(REPOS_INDEX_PATH, "w") as index_file:
await index_file.write( await index_file.write(
json.dumps( json.dumps(
{ {
@ -66,9 +66,9 @@ async def create_dependencies_index() -> None:
:return: None :return: None
""" """
async with async_session_uow() as session, aiofiles.open( async with async_session_maker() as session, async_session_uow(
DEPENDENCIES_INDEX_PATH, "w" session
) as index_file: ) as session, aiofiles.open(DEPENDENCIES_INDEX_PATH, "w") as index_file:
dependencies = [ dependencies = [
DependencyDetail.model_validate(dependency).model_dump() DependencyDetail.model_validate(dependency).model_dump()
async for dependency in ( async for dependency in (

View File

@ -6,11 +6,11 @@ import typer
from loguru import logger from loguru import logger
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.database import Dependency, Repo, RepoDependency from app.database import Dependency, Repo, RepoDependency, async_session_maker
from app.dependencies import acquire_dependencies_data_for_repository from app.dependencies import acquire_dependencies_data_for_repository
from app.source_graph.client import AsyncSourceGraphSSEClient from app.source_graph.client import AsyncSourceGraphSSEClient
from app.source_graph.mapper import create_or_update_repos_from_source_graph_repos_data from app.source_graph.mapper import create_or_update_repos_from_source_graph_repos_data
from app.types import RepoId from app.source_graph.models import SourceGraphRepoData
from app.uow import async_session_uow from app.uow import async_session_uow
@ -124,56 +124,93 @@ async def _create_dependencies_for_repo(session: AsyncSession, repo: Repo) -> No
) )
async def _save_scraped_repos_from_source_graph_repos_data(
    source_graph_repos_data: list[SourceGraphRepoData],
) -> None:
    """
    Persist one batch of scraped source graph repos using a dedicated session.

    .. note::
        This coroutine is meant to be scheduled as a task inside a task group,
        which is why it opens its own session instead of receiving one.
        The SQLAlchemy documentation explains the constraint:
        ::
            https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#using-asyncsession-with-concurrent-tasks

            The AsyncSession object is a mutable, stateful object
            which represents a single, stateful database
            transaction in progress. Using concurrent tasks with asyncio,
            with APIs such as asyncio.gather() for example, should use
            a separate AsyncSession per individual task.

    :param source_graph_repos_data: The source graph repos data to create
        or update in the database.
    :return: None
    """  # noqa: E501
    # A fresh session per call keeps concurrently running tasks isolated.
    async with async_session_maker() as session:
        # The unit of work wraps the writes below in a single transaction.
        async with async_session_uow(session):
            persisted = await create_or_update_repos_from_source_graph_repos_data(
                session=session,
                source_graph_repos_data=source_graph_repos_data,
            )
            logger.info("Saving {count} repos.", count=len(persisted), enqueue=True)
            # Commit explicitly before the unit of work exits.
            await session.commit()
async def scrape_source_graph_repos() -> None: async def scrape_source_graph_repos() -> None:
""" """
Iterate over the source graph repos and create or update them in the database. Iterate over the source graph repos and create or update them in the database.
:return: None :return: None
""" """
async with AsyncSourceGraphSSEClient() as sg_client: async with AsyncSourceGraphSSEClient() as sg_client, asyncio.TaskGroup() as tg:
async with async_session_uow() as session: logger.info(
async with asyncio.TaskGroup() as tg: "Creating or updating repos from source graph repos data.",
logger.info( enqueue=True,
"Creating or updating repos from source graph repos data.", )
enqueue=True, async for sg_repos_data in sg_client.aiter_fastapi_repos():
logger.info(
"Received {count} repos.",
count=len(sg_repos_data),
enqueue=True,
)
tg.create_task(
_save_scraped_repos_from_source_graph_repos_data(
source_graph_repos_data=sg_repos_data
) )
async for sg_repos_data in sg_client.aiter_fastapi_repos(): )
logger.info(
"Received {count} repos.",
count=len(sg_repos_data),
enqueue=True,
)
tg.create_task(
create_or_update_repos_from_source_graph_repos_data(
session=session,
source_graph_repos_data=sg_repos_data,
)
)
await session.commit()
async def parse_dependencies_for_repo( async def parse_dependencies_for_repo(semaphore: asyncio.Semaphore, repo: Repo) -> None:
semaphore: asyncio.Semaphore, repo_id: RepoId
) -> None:
""" """
Parse the dependencies for a given repo and create them in the database. Parse the dependencies for a given repo and create them in the database.
.. note::
This function is meant to be used in a task group.
From the SQLAlchemy documentation:
::
https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#using-asyncsession-with-concurrent-tasks
The AsyncSession object is a mutable, stateful object
which represents a single, stateful database
transaction in progress. Using concurrent tasks with asyncio,
with APIs such as asyncio.gather() for example, should use
a separate AsyncSession per individual task.
:param semaphore: A semaphore to limit the number of concurrent requests :param semaphore: A semaphore to limit the number of concurrent requests
:param repo_id: The id of the repo for which to parse the dependencies :param repo: A repo for which to create and assign the dependencies
:return: None :return: None
""" """ # noqa: E501
async with async_session_uow() as session, semaphore: async with semaphore, async_session_maker() as session, async_session_uow(session):
# Fetch the repo from the database # Associate the repo object with a fresh session instance
logger.info( repo = await session.merge(repo)
"Fetching the repo with id {repo_id}.", repo_id=repo_id, enqueue=True
)
repo = (
await session.scalars(sqlalchemy.select(Repo).where(Repo.id == repo_id))
).one()
# Create the dependencies for the repo # Create the dependencies for the repo
logger.info( logger.info(
"Creating the dependencies for the repo with id {repo_id}.", "Creating the dependencies for the repo with id {repo_id}.",
repo_id=repo_id, repo_id=repo.id,
enqueue=True, enqueue=True,
) )
await _create_dependencies_for_repo(session=session, repo=repo) await _create_dependencies_for_repo(session=session, repo=repo)
@ -187,29 +224,25 @@ async def parse_dependencies_for_repos() -> None:
:return: None. :return: None.
""" """
logger.info("Fetching the repos from the database.", enqueue=True) logger.info("Fetching the repos from the database.", enqueue=True)
async with async_session_uow() as session: async with async_session_maker() as session:
repo_ids = ( repos = (
await session.scalars( await session.scalars(
sqlalchemy.select(Repo.id).order_by( sqlalchemy.select(Repo).order_by(
Repo.last_checked_revision.is_(None).desc() Repo.last_checked_revision.is_(None).desc()
) )
) )
).all() ).all()
logger.info("Fetched {count} repos.", count=len(repo_ids), enqueue=True) logger.info("Fetched {count} repos.", count=len(repos), enqueue=True)
logger.info("Parsing the dependencies for the repos.", enqueue=True) logger.info("Parsing the dependencies for the repos.", enqueue=True)
semaphore = asyncio.Semaphore(10) semaphore = asyncio.Semaphore(10)
async with asyncio.TaskGroup() as tg: async with asyncio.TaskGroup() as tg:
for repo_id in repo_ids: for repo in repos:
logger.info( logger.info(
"Parsing the dependencies for repo {repo_id}.", "Parsing the dependencies for repo {repo_id}.",
repo_id=repo_id, repo_id=repo.id,
enqueue=True, enqueue=True,
) )
tg.create_task( tg.create_task(parse_dependencies_for_repo(semaphore=semaphore, repo=repo))
parse_dependencies_for_repo(
semaphore=semaphore, repo_id=RepoId(repo_id)
)
)
app = typer.Typer() app = typer.Typer()

View File

@ -9,19 +9,25 @@ from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.database import async_session_maker
@asynccontextmanager @asynccontextmanager
async def async_session_uow() -> AsyncGenerator[AsyncSession, None]: async def async_session_uow(
session: AsyncSession,
) -> AsyncGenerator[AsyncSession, None]:
""" """
Provide a transactional scope around a series of operations. Provide a transactional scope around a series of operations.
:param session: The database session.
:return: a UoW instance :return: a UoW instance
""" """
async with async_session_maker() as session: async with session.begin():
async with session.begin() as transaction: try:
try: yield session
yield session finally:
finally: if session.in_transaction() and session.is_active:
await transaction.rollback() # session.is_active is True if this Session is not in a “partial rollback”
# state. If this Session is within a transaction, and that transaction
# has not been rolled back internally, Session.is_active will also
# be True.
# https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#sqlalchemy.ext.asyncio.AsyncSession.is_active
await session.rollback()

View File

@ -1 +1 @@
NEXT_PUBLIC_PROJECT_REPO_URL="https://github.com/Kludex/awesome-fastapi-projects" NEXT_PUBLIC_PROJECT_REPO_URL="https://github.com/kludex/awesome-fastapi-projects"

View File

@ -10,6 +10,7 @@ import { useVirtual } from "@tanstack/react-virtual";
import { useDependenciesOrama } from "@/lib/search"; import { useDependenciesOrama } from "@/lib/search";
import { search } from "@orama/orama"; import { search } from "@orama/orama";
import { Dependency } from "@/lib/schemas"; import { Dependency } from "@/lib/schemas";
import { cn } from "@/lib/utils";
export function MultiSelect<DataType extends { id: string; name: string }>({ export function MultiSelect<DataType extends { id: string; name: string }>({
data, data,
@ -147,7 +148,7 @@ export function MultiSelect<DataType extends { id: string; name: string }>({
/> />
</div> </div>
</div> </div>
<div className="relative mt-2"> <div className={cn("relative", open && "mt-2")}>
{open && selectables.length > 0 ? ( {open && selectables.length > 0 ? (
<div className="absolute w-full z-10 top-0 rounded-md border bg-popover text-popover-foreground shadow-md outline-none animate-in"> <div className="absolute w-full z-10 top-0 rounded-md border bg-popover text-popover-foreground shadow-md outline-none animate-in">
<CommandGroup <CommandGroup