diff --git a/.github/workflows/app.yaml b/.github/workflows/app.yaml
index 3c45d38..f39397b 100644
--- a/.github/workflows/app.yaml
+++ b/.github/workflows/app.yaml
@@ -52,4 +52,4 @@ jobs:
           python -m pytest -v -s --failed-first --cov=app --cov-report=xml
       - name: Generate Coverage Report
         run: |
-          coverage report -m
+          python -m coverage report -m
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7590150..3e0c803 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,6 +20,7 @@ repos:
     rev: v0.0.280
     hooks:
       - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
diff --git a/app/scraper/client.py b/app/scraper/client.py
index 2446ac1..ea186b2 100644
--- a/app/scraper/client.py
+++ b/app/scraper/client.py
@@ -1,23 +1,77 @@
 """The client for the SourceGraph API."""
+import datetime
 from collections.abc import AsyncGenerator, Mapping
 from contextlib import asynccontextmanager
 from datetime import timedelta
-from typing import Any, AnyStr, Final, Self
+from typing import Any, AnyStr, Final, Literal, NewType, Self
 from urllib.parse import quote
 
 import httpx
 import stamina
 from httpx_sse import EventSource, ServerSentEvent, aconnect_sse
-from pydantic import HttpUrl
+from pydantic import (
+    BaseModel,
+    Field,
+    HttpUrl,
+    NonNegativeInt,
+    TypeAdapter,
+    computed_field,
+)
 
 #: The URL of the SourceGraph SSE API.
 SOURCE_GRAPH_STREAM_API_URL: Final[
     HttpUrl
 ] = "https://sourcegraph.com/.api/search/stream"
 
+#: The ID of a repository from the SourceGraph API.
+SourceGraphRepoId = NewType("SourceGraphRepoId", int)
+
+
+class SourceGraphRepoData(BaseModel):
+    """The data of a repository."""
+
+    type: Literal["repo"]
+    repo_id: SourceGraphRepoId = Field(..., alias="repositoryID")
+    repo_handle: str = Field(..., alias="repository")
+    stars: NonNegativeInt = Field(..., alias="repoStars")
+    last_fetched_at: datetime.datetime = Field(..., alias="repoLastFetched")
+    description: str = Field(default="")
+
+    @computed_field
+    @property
+    def repo_url(self: Self) -> HttpUrl:
+        """The URL of the repository."""
+        return TypeAdapter(HttpUrl).validate_python(f"https://{self.repo_handle}")
+
+
+#: The type adapter for the SourceGraphRepoData.
+SourceGraphRepoDataAdapter = TypeAdapter(SourceGraphRepoData)
+#: The type adapter for the SourceGraphRepoData list.
+SourceGraphRepoDataListAdapter = TypeAdapter(list[SourceGraphRepoData])
+
+#: The query parameters for the SourceGraph SSE API.
+FASTAPI_REPOS_QUERY_PARAMS: Final[Mapping[str, str]] = {
+    "q": quote(
+        " ".join(
+            [
+                "repo:has.content(from fastapi import FastApi)",
+                "type:repo",
+                "visibility:public",
+                "archived:no",
+                "fork:no",
+            ]
+        )
+    ),
+}
+
 
 class AsyncSourceGraphSSEClient:
-    """A client for the SourceGraph SSE API."""
+    """
+    A client for the SourceGraph SSE API.
+
+    To learn more about the underlying API, see the ``SourceGraph Stream API``
+    documentation: https://docs.sourcegraph.com/api/stream_api#sourcegraph-stream-api
+    """
 
     def __init__(self: Self) -> None:
         """Initialize the client."""
@@ -68,18 +122,11 @@ class AsyncSourceGraphSSEClient:
 
-    async def aiter_fastapi_repos(self: Self) -> AsyncGenerator[ServerSentEvent, None]:
+    async def aiter_fastapi_repos(
+        self: Self,
+    ) -> AsyncGenerator[list[SourceGraphRepoData], None]:
         """Iterate over the SourceGraph SSE API with retries."""
-        params = {
-            "q": quote(
-                " ".join(
-                    [
-                        "repo:has.content(from fastapi import FastApi)",
-                        "type:repo",
-                        "visibility:public",
-                        "archived:no",
-                        "fork:no",
-                    ]
-                )
-            ),
-        }
-        async for event in self._aiter_sse_with_retries(params=params):
-            yield event
+        async for event in self._aiter_sse_with_retries(
+            params=FASTAPI_REPOS_QUERY_PARAMS
+        ):
+            # Only ``matches`` events carry repository data payloads.
+            if event.event == "matches":
+                yield SourceGraphRepoDataListAdapter.validate_python(event.json())
diff --git a/app/scraper/tests/__init__.py b/app/scraper/tests/__init__.py
new file mode 100644
index 0000000..2ca8733
--- /dev/null
+++ b/app/scraper/tests/__init__.py
@@ -0,0 +1 @@
+"""Test the scraping of the SourceGraph API."""
diff --git a/app/scraper/tests/test_client.py b/app/scraper/tests/test_client.py
new file mode 100644
index 0000000..1da10cb
--- /dev/null
+++ b/app/scraper/tests/test_client.py
@@ -0,0 +1,118 @@
+"""Test the client module of the scraper app."""
+import pytest
+from dirty_equals import HasLen, IsDatetime, IsInstance, IsPositiveInt
+from pydantic import Json, TypeAdapter
+
+from app.scraper.client import SourceGraphRepoData
+
+
+@pytest.fixture
+def source_graph_matched_repos_data() -> Json:
+    """Return the sample data of the matched repositories."""
+    return [
+        {
+            "type": "repo",
+            "repositoryID": 55636527,
+            "repository": "github.com/tiangolo/sqlmodel",
+            "repoStars": 10277,
+            "repoLastFetched": "2023-07-31T18:47:22.875731Z",
+            "description": (
+                "SQL databases in Python, designed "
+                "for simplicity, compatibility, "
+                "and robustness."
+            ),
+            "metadata": {
+                "fastapi": "null",
+                "json": "null",
+                "json-schema": "null",
+                "pydantic": "null",
+                "python": "null",
+                "sql": "null",
+                "sqlalchemy": "null",
+            },
+        },
+        {
+            "type": "repo",
+            "repositoryID": 59434622,
+            "repository": "github.com/reflex-dev/reflex",
+            "repoStars": 10061,
+            "repoLastFetched": "2023-07-31T08:58:42.692906Z",
+            "description": "(Previously Pynecone) πŸ•Έ Web apps in pure Python 🐍",
+        },
+        {
+            "type": "repo",
+            "repositoryID": 42982149,
+            "repository": "github.com/PaddlePaddle/PaddleNLP",
+            "repoStars": 9804,
+            "repoLastFetched": "2023-07-31T16:48:08.839209Z",
+            "description": (
+                "πŸ‘‘ Easy-to-use and powerful NLP library with πŸ€— "
+                "Awesome model zoo, supporting wide-range of NLP tasks "
+                "from research to industrial applications, including"
+                " πŸ—‚Text Classification, πŸ” Neural Search, ❓ Question "
+                "Answering, ℹ️ Information Extraction, "
+                "πŸ“„ Document Intelligence, πŸ’Œ Sentiment Analysis etc."
+ ), + "metadata": { + "bert": "null", + "embedding": "null", + "ernie": "null", + "information-extraction": "null", + "neural-search": "null", + "nlp": "null", + "paddlenlp": "null", + "pretrained-models": "null", + "question-answering": "null", + "search-engine": "null", + "semantic-analysis": "null", + "sentiment-analysis": "null", + "seq2seq": "null", + "transformer": "null", + "transformers": "null", + "uie": "null", + }, + }, + { + "type": "repo", + "repositoryID": 36246068, + "repository": "github.com/realpython/materials", + "repoStars": 4359, + "repoLastFetched": "2023-07-31T05:15:16.993896Z", + }, + ] + + +def test_source_graph_repo_data(source_graph_matched_repos_data: Json) -> None: + """Test the SourceGraphRepoData deserialization.""" + assert source_graph_matched_repos_data == HasLen(3) + _SourceGraphRepoDataListValidator = TypeAdapter(list[SourceGraphRepoData]) + repos_parsed = _SourceGraphRepoDataListValidator.validate_python( + source_graph_matched_repos_data + ) + assert repos_parsed == HasLen(3) + assert all(repo == IsInstance[SourceGraphRepoData] for repo in repos_parsed) + assert all( + repo.repo_id == repo_data["repositoryID"] + for repo, repo_data in zip( + repos_parsed, source_graph_matched_repos_data, strict=True + ) + ) + assert all( + repo.repo_handle == repo_data["repository"] + for repo, repo_data in zip( + repos_parsed, source_graph_matched_repos_data, strict=True + ) + ) + assert all( + repo.stars == IsPositiveInt and repo.stars == repo_data["repoStars"] + for repo, repo_data in zip( + repos_parsed, source_graph_matched_repos_data, strict=True + ) + ) + assert all( + str(repo.repo_url) == f"https://{repo_data['repository']}" + for repo, repo_data in zip( + repos_parsed, source_graph_matched_repos_data, strict=True + ) + ) + assert all(repo.last_fetched_at == IsDatetime for repo in repos_parsed)