Fix repos scraping and parsing (#28)

This commit is contained in:
Vladyslav Fedoriuk 2023-11-19 23:17:57 +01:00 committed by GitHub
parent 0610553651
commit d4c00793b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 110 additions and 71 deletions

View File

@ -1,6 +1,6 @@
# Awesome FastAPI Projects
View the website: https://Kludex.github.io/awesome-fastapi-projects/
View the website: https://kludex.github.io/awesome-fastapi-projects/
## Local Development
@ -42,7 +42,7 @@ make
#### Frontend
The frontend is built with [React](https://reactjs.org/) and [Next.js](https://nextjs.org/).
It is being statically built and served on GitHub Pages: https://Kludex.github.io/awesome-fastapi-projects/
It is being statically built and served on GitHub Pages: https://kludex.github.io/awesome-fastapi-projects/
To run the frontend locally, you need to install [Node.js](https://nodejs.org/en/) and [pnpm](https://pnpm.io/).
The node version is specified in the `.node-version` file.

View File

@ -132,13 +132,12 @@ async def db_uow(
db_session: AsyncSession,
) -> AsyncGenerator[AsyncSession, None]:
"""Provide a transactional scope around a series of operations."""
from app.uow import async_session_uow
# This context manager will start a transaction, and roll it back at the end
# https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#sqlalchemy.ext.asyncio.AsyncSessionTransaction
async with db_session.begin() as transaction:
try:
yield db_session
finally:
await transaction.rollback()
async with async_session_uow(db_session) as session:
yield session
@pytest.fixture()

View File

@ -18,7 +18,7 @@ import aiofiles
import sqlalchemy.orm
import typer
from app.database import Dependency, Repo
from app.database import Dependency, Repo, async_session_maker
from app.models import DependencyDetail, RepoDetail
from app.uow import async_session_uow
@ -38,9 +38,9 @@ async def create_repos_index() -> None:
:return: None
"""
async with async_session_uow() as session, aiofiles.open(
REPOS_INDEX_PATH, "w"
) as index_file:
async with async_session_maker() as session, async_session_uow(
session
), aiofiles.open(REPOS_INDEX_PATH, "w") as index_file:
await index_file.write(
json.dumps(
{
@ -66,9 +66,9 @@ async def create_dependencies_index() -> None:
:return: None
"""
async with async_session_uow() as session, aiofiles.open(
DEPENDENCIES_INDEX_PATH, "w"
) as index_file:
async with async_session_maker() as session, async_session_uow(
session
) as session, aiofiles.open(DEPENDENCIES_INDEX_PATH, "w") as index_file:
dependencies = [
DependencyDetail.model_validate(dependency).model_dump()
async for dependency in (

View File

@ -6,11 +6,11 @@ import typer
from loguru import logger
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import Dependency, Repo, RepoDependency
from app.database import Dependency, Repo, RepoDependency, async_session_maker
from app.dependencies import acquire_dependencies_data_for_repository
from app.source_graph.client import AsyncSourceGraphSSEClient
from app.source_graph.mapper import create_or_update_repos_from_source_graph_repos_data
from app.types import RepoId
from app.source_graph.models import SourceGraphRepoData
from app.uow import async_session_uow
@ -124,15 +124,48 @@ async def _create_dependencies_for_repo(session: AsyncSession, repo: Repo) -> No
)
async def _save_scraped_repos_from_source_graph_repos_data(
    source_graph_repos_data: list[SourceGraphRepoData],
) -> None:
    """
    Persist one batch of scraped Source Graph repos using a dedicated session.

    .. note::
        This coroutine is meant to be scheduled as a task in a task group,
        so it opens its own session instead of receiving one from the caller.
        The SQLAlchemy documentation recommends a separate ``AsyncSession``
        per concurrent task, because the session is a mutable, stateful
        object representing a single database transaction in progress:

        https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#using-asyncsession-with-concurrent-tasks

    :param source_graph_repos_data: The source graph repos data.
    :return: None
    """  # noqa: E501
    async with async_session_maker() as session:
        async with async_session_uow(session):
            persisted_repos = await create_or_update_repos_from_source_graph_repos_data(
                source_graph_repos_data=source_graph_repos_data,
                session=session,
            )
            repo_count = len(persisted_repos)
            logger.info("Saving {count} repos.", count=repo_count, enqueue=True)
            # Commit the work done through this task's own session.
            await session.commit()
async def scrape_source_graph_repos() -> None:
"""
Iterate over the source graph repos and create or update them in the database.
:return: None
"""
async with AsyncSourceGraphSSEClient() as sg_client:
async with async_session_uow() as session:
async with asyncio.TaskGroup() as tg:
async with AsyncSourceGraphSSEClient() as sg_client, asyncio.TaskGroup() as tg:
logger.info(
"Creating or updating repos from source graph repos data.",
enqueue=True,
@ -144,36 +177,40 @@ async def scrape_source_graph_repos() -> None:
enqueue=True,
)
tg.create_task(
create_or_update_repos_from_source_graph_repos_data(
session=session,
source_graph_repos_data=sg_repos_data,
_save_scraped_repos_from_source_graph_repos_data(
source_graph_repos_data=sg_repos_data
)
)
await session.commit()
async def parse_dependencies_for_repo(
semaphore: asyncio.Semaphore, repo_id: RepoId
) -> None:
async def parse_dependencies_for_repo(semaphore: asyncio.Semaphore, repo: Repo) -> None:
"""
Parse the dependencies for a given repo and create them in the database.
.. note::
This function is meant to be used in a task group.
From the SQLAlchemy documentation:
::
https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#using-asyncsession-with-concurrent-tasks
The AsyncSession object is a mutable, stateful object
which represents a single, stateful database
transaction in progress. Using concurrent tasks with asyncio,
with APIs such as asyncio.gather() for example, should use
a separate AsyncSession per individual task.
:param semaphore: A semaphore to limit the number of concurrent requests
:param repo_id: The id of the repo for which to parse the dependencies
:param repo: A repo for which to create and assign the dependencies
:return: None
"""
async with async_session_uow() as session, semaphore:
# Fetch the repo from the database
logger.info(
"Fetching the repo with id {repo_id}.", repo_id=repo_id, enqueue=True
)
repo = (
await session.scalars(sqlalchemy.select(Repo).where(Repo.id == repo_id))
).one()
""" # noqa: E501
async with semaphore, async_session_maker() as session, async_session_uow(session):
# Associate the repo object with a fresh session instance
repo = await session.merge(repo)
# Create the dependencies for the repo
logger.info(
"Creating the dependencies for the repo with id {repo_id}.",
repo_id=repo_id,
repo_id=repo.id,
enqueue=True,
)
await _create_dependencies_for_repo(session=session, repo=repo)
@ -187,29 +224,25 @@ async def parse_dependencies_for_repos() -> None:
:return: None.
"""
logger.info("Fetching the repos from the database.", enqueue=True)
async with async_session_uow() as session:
repo_ids = (
async with async_session_maker() as session:
repos = (
await session.scalars(
sqlalchemy.select(Repo.id).order_by(
sqlalchemy.select(Repo).order_by(
Repo.last_checked_revision.is_(None).desc()
)
)
).all()
logger.info("Fetched {count} repos.", count=len(repo_ids), enqueue=True)
logger.info("Fetched {count} repos.", count=len(repos), enqueue=True)
logger.info("Parsing the dependencies for the repos.", enqueue=True)
semaphore = asyncio.Semaphore(10)
async with asyncio.TaskGroup() as tg:
for repo_id in repo_ids:
for repo in repos:
logger.info(
"Parsing the dependencies for repo {repo_id}.",
repo_id=repo_id,
repo_id=repo.id,
enqueue=True,
)
tg.create_task(
parse_dependencies_for_repo(
semaphore=semaphore, repo_id=RepoId(repo_id)
)
)
tg.create_task(parse_dependencies_for_repo(semaphore=semaphore, repo=repo))
app = typer.Typer()

View File

@ -9,19 +9,25 @@ from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import async_session_maker
@asynccontextmanager
async def async_session_uow() -> AsyncGenerator[AsyncSession, None]:
async def async_session_uow(
session: AsyncSession,
) -> AsyncGenerator[AsyncSession, None]:
"""
Provide a transactional scope around a series of operations.
:param session: The database session.
:return: a UoW instance
"""
async with async_session_maker() as session:
async with session.begin() as transaction:
async with session.begin():
try:
yield session
finally:
await transaction.rollback()
if session.in_transaction() and session.is_active:
# session.is_active is True if this Session is not in a “partial rollback”
# state. If this Session is within a transaction, and that transaction
# has not been rolled back internally, the Session.is_active will also
# be True.
# https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html#sqlalchemy.ext.asyncio.AsyncSession.is_active
await session.rollback()

View File

@ -1 +1 @@
NEXT_PUBLIC_PROJECT_REPO_URL="https://github.com/Kludex/awesome-fastapi-projects"
NEXT_PUBLIC_PROJECT_REPO_URL="https://github.com/kludex/awesome-fastapi-projects"

View File

@ -10,6 +10,7 @@ import { useVirtual } from "@tanstack/react-virtual";
import { useDependenciesOrama } from "@/lib/search";
import { search } from "@orama/orama";
import { Dependency } from "@/lib/schemas";
import { cn } from "@/lib/utils";
export function MultiSelect<DataType extends { id: string; name: string }>({
data,
@ -147,7 +148,7 @@ export function MultiSelect<DataType extends { id: string; name: string }>({
/>
</div>
</div>
<div className="relative mt-2">
<div className={cn("relative", open && "mt-2")}>
{open && selectables.length > 0 ? (
<div className="absolute w-full z-10 top-0 rounded-md border bg-popover text-popover-foreground shadow-md outline-none animate-in">
<CommandGroup