mirror of
https://github.com/Kludex/awesome-fastapi-projects.git
synced 2025-05-14 05:07:04 +00:00
Add parsing of the SourceGraph SSE data
This commit is contained in:
parent
dae524ab0c
commit
9fa3b8e43b
2
.github/workflows/app.yaml
vendored
2
.github/workflows/app.yaml
vendored
@ -52,4 +52,4 @@ jobs:
|
||||
python -m pytest -v -s --failed-first --cov=app --cov-report=xml
|
||||
- name: Generate Coverage Report
|
||||
run: |
|
||||
coverage report -m
|
||||
python -m coverage report -m
|
||||
|
@ -20,6 +20,7 @@ repos:
|
||||
rev: v0.0.280
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix, --exit-non-zero-on-fix]
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 5.12.0
|
||||
hooks:
|
||||
|
@ -1,23 +1,77 @@
|
||||
"""The client for the SourceGraph API."""
|
||||
import datetime
|
||||
from collections.abc import AsyncGenerator, Mapping
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import timedelta
|
||||
from typing import Any, AnyStr, Final, Self
|
||||
from typing import Any, AnyStr, Final, Literal, NewType, Self
|
||||
from urllib.parse import quote
|
||||
|
||||
import httpx
|
||||
import stamina
|
||||
from httpx_sse import EventSource, ServerSentEvent, aconnect_sse
|
||||
from pydantic import HttpUrl
|
||||
from pydantic import (
|
||||
BaseModel,
|
||||
Field,
|
||||
HttpUrl,
|
||||
NonNegativeInt,
|
||||
TypeAdapter,
|
||||
computed_field,
|
||||
)
|
||||
|
||||
#: Endpoint of the SourceGraph streaming-search (SSE) API.
#: NOTE(review): annotated ``Final[HttpUrl]`` but assigned a plain ``str`` —
#: a bare annotation does not run pydantic validation; confirm this is intended.
SOURCE_GRAPH_STREAM_API_URL: Final[
    HttpUrl
] = "https://sourcegraph.com/.api/search/stream"

#: Distinct integer type for repository IDs returned by the SourceGraph API,
#: so they cannot be confused with other ints at type-check time.
SourceGraphRepoId = NewType("SourceGraphRepoId", int)
|
||||
class SourceGraphRepoData(BaseModel):
    """One repository record emitted by the SourceGraph stream API.

    Field aliases match the JSON keys of the API payload; attribute names
    follow this project's snake_case convention.
    """

    # Discriminator: only "repo" events deserialize into this model.
    type: Literal["repo"]
    repo_id: SourceGraphRepoId = Field(..., alias="repositoryID")
    repo_handle: str = Field(..., alias="repository")
    stars: NonNegativeInt = Field(..., alias="repoStars")
    last_fetched_at: datetime.datetime = Field(..., alias="repoLastFetched")
    # The API may omit the description; default to an empty string.
    description: str = Field(default="")

    @computed_field
    @property
    def repo_url(self: Self) -> HttpUrl:
        """Return the canonical HTTPS URL built from the repository handle."""
        raw_url = f"https://{self.repo_handle}"
        return TypeAdapter(HttpUrl).validate_python(raw_url)
|
||||
|
||||
|
||||
#: Reusable validator for a single ``SourceGraphRepoData`` payload.
SourceGraphRepoDataAdapter = TypeAdapter(SourceGraphRepoData)
#: Reusable validator for a list of ``SourceGraphRepoData`` payloads.
SourceGraphRepoDataListAdapter = TypeAdapter(list[SourceGraphRepoData])

#: Query parameters selecting public, non-archived, non-fork repositories
#: whose content imports FastAPI.
#: NOTE(review): the query is percent-encoded here via ``quote`` — verify the
#: HTTP client does not encode it a second time when sending the request.
FASTAPI_REPOS_QUERY_PARAMS: Final[Mapping[str, str]] = {
    "q": quote(
        " ".join(
            (
                "repo:has.content(from fastapi import FastApi)",
                "type:repo",
                "visibility:public",
                "archived:no",
                "fork:no",
            )
        )
    ),
}
|
||||
|
||||
|
||||
class AsyncSourceGraphSSEClient:
|
||||
"""A client for the SourceGraph SSE API."""
|
||||
"""
|
||||
A client for the SourceGraph SSE API.
|
||||
|
||||
To learn more about the underlying API, see the ``SourceGraph SSE API``
|
||||
https://docs.sourcegraph.com/api/stream_api#sourcegraph-stream-api
|
||||
"""
|
||||
|
||||
def __init__(self: Self) -> None:
|
||||
"""Initialize the client."""
|
||||
@ -68,18 +122,8 @@ class AsyncSourceGraphSSEClient:
|
||||
|
||||
async def aiter_fastapi_repos(self: Self) -> AsyncGenerator[ServerSentEvent, None]:
|
||||
"""Iterate over the SourceGraph SSE API with retries."""
|
||||
params = {
|
||||
"q": quote(
|
||||
" ".join(
|
||||
[
|
||||
"repo:has.content(from fastapi import FastApi)",
|
||||
"type:repo",
|
||||
"visibility:public",
|
||||
"archived:no",
|
||||
"fork:no",
|
||||
]
|
||||
)
|
||||
),
|
||||
}
|
||||
async for event in self._aiter_sse_with_retries(params=params):
|
||||
yield event
|
||||
async for event in self._aiter_sse_with_retries(
|
||||
params=FASTAPI_REPOS_QUERY_PARAMS
|
||||
):
|
||||
if event.event == "matches":
|
||||
yield SourceGraphRepoDataListAdapter.validate_python(event.json())
|
||||
|
1
app/scraper/tests/__init__.py
Normal file
1
app/scraper/tests/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Test the scraping of the SourceGraph API."""
|
118
app/scraper/tests/test_client.py
Normal file
118
app/scraper/tests/test_client.py
Normal file
@ -0,0 +1,118 @@
|
||||
"""Test the client module of the scraper app."""
|
||||
import pytest
|
||||
from dirty_equals import HasLen, IsDatetime, IsInstance, IsPositiveInt
|
||||
from pydantic import Json, TypeAdapter
|
||||
|
||||
from app.scraper.client import SourceGraphRepoData
|
||||
|
||||
|
||||
@pytest.fixture
def source_graph_matched_repos_data() -> Json:
    """Return a sample payload of four repositories matched by SourceGraph.

    Keys mirror the raw API response (``repositoryID``, ``repoStars``, ...);
    the last entry deliberately omits ``description``.
    """
    repos = [
        # Entry with both a description and a metadata mapping.
        {
            "type": "repo",
            "repositoryID": 55636527,
            "repository": "github.com/tiangolo/sqlmodel",
            "repoStars": 10277,
            "repoLastFetched": "2023-07-31T18:47:22.875731Z",
            "description": (
                "SQL databases in Python, designed "
                "for simplicity, compatibility, "
                "and robustness."
            ),
            "metadata": {
                "fastapi": "null",
                "json": "null",
                "json-schema": "null",
                "pydantic": "null",
                "python": "null",
                "sql": "null",
                "sqlalchemy": "null",
            },
        },
        # Entry with a description but no metadata.
        {
            "type": "repo",
            "repositoryID": 59434622,
            "repository": "github.com/reflex-dev/reflex",
            "repoStars": 10061,
            "repoLastFetched": "2023-07-31T08:58:42.692906Z",
            "description": "(Previously Pynecone) 🕸 Web apps in pure Python 🐍",
        },
        # Entry with a long description and a large metadata mapping.
        {
            "type": "repo",
            "repositoryID": 42982149,
            "repository": "github.com/PaddlePaddle/PaddleNLP",
            "repoStars": 9804,
            "repoLastFetched": "2023-07-31T16:48:08.839209Z",
            "description": (
                "👑 Easy-to-use and powerful NLP library with 🤗 "
                "Awesome model zoo, supporting wide-range of NLP tasks "
                "from research to industrial applications, including"
                " 🗂Text Classification, 🔍 Neural Search, ❓ Question "
                "Answering, ℹ️ Information Extraction, "
                "📄 Document Intelligence, 💌 Sentiment Analysis etc."
            ),
            "metadata": {
                "bert": "null",
                "embedding": "null",
                "ernie": "null",
                "information-extraction": "null",
                "neural-search": "null",
                "nlp": "null",
                "paddlenlp": "null",
                "pretrained-models": "null",
                "question-answering": "null",
                "search-engine": "null",
                "semantic-analysis": "null",
                "sentiment-analysis": "null",
                "seq2seq": "null",
                "transformer": "null",
                "transformers": "null",
                "uie": "null",
            },
        },
        # Entry without "description": exercises the model's default of "".
        {
            "type": "repo",
            "repositoryID": 36246068,
            "repository": "github.com/realpython/materials",
            "repoStars": 4359,
            "repoLastFetched": "2023-07-31T05:15:16.993896Z",
        },
    ]
    return repos
|
||||
|
||||
|
||||
def test_source_graph_repo_data(source_graph_matched_repos_data: Json) -> None:
    """Test the SourceGraphRepoData deserialization.

    Fix: the fixture supplies FOUR repository payloads, but the original
    assertions expected ``HasLen(3)``, so the length checks (and the
    ``zip(..., strict=True)`` pairings) could never pass.
    """
    expected_count = len(source_graph_matched_repos_data)
    assert expected_count == 4
    _SourceGraphRepoDataListValidator = TypeAdapter(list[SourceGraphRepoData])
    repos_parsed = _SourceGraphRepoDataListValidator.validate_python(
        source_graph_matched_repos_data
    )
    assert repos_parsed == HasLen(expected_count)
    # Compare each parsed model against its raw payload field by field.
    for repo, repo_data in zip(
        repos_parsed, source_graph_matched_repos_data, strict=True
    ):
        assert repo == IsInstance[SourceGraphRepoData]
        assert repo.repo_id == repo_data["repositoryID"]
        assert repo.repo_handle == repo_data["repository"]
        assert repo.stars == IsPositiveInt
        assert repo.stars == repo_data["repoStars"]
        assert str(repo.repo_url) == f"https://{repo_data['repository']}"
        assert repo.last_fetched_at == IsDatetime
|
Loading…
x
Reference in New Issue
Block a user