Add parsing of the SourceGraph SSE data

This commit is contained in:
Vladyslav Fedoriuk 2023-07-31 23:16:37 +02:00
parent dae524ab0c
commit 9fa3b8e43b
5 changed files with 183 additions and 19 deletions

View File

@ -52,4 +52,4 @@ jobs:
python -m pytest -v -s --failed-first --cov=app --cov-report=xml
- name: Generate Coverage Report
run: |
coverage report -m
python -m coverage report -m

View File

@ -20,6 +20,7 @@ repos:
rev: v0.0.280
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:

View File

@ -1,23 +1,77 @@
"""The client for the SourceGraph API."""
import datetime
from collections.abc import AsyncGenerator, Mapping
from contextlib import asynccontextmanager
from datetime import timedelta
from typing import Any, AnyStr, Final, Self
from typing import Any, AnyStr, Final, Literal, NewType, Self
from urllib.parse import quote
import httpx
import stamina
from httpx_sse import EventSource, ServerSentEvent, aconnect_sse
from pydantic import HttpUrl
from pydantic import (
BaseModel,
Field,
HttpUrl,
NonNegativeInt,
TypeAdapter,
computed_field,
)
#: The URL of the SourceGraph SSE API.
#: NOTE(review): annotated ``Final[HttpUrl]`` but assigned a plain ``str`` —
#: module-level annotations are not validated by pydantic, so this constant is
#: a str at runtime; wrap with ``TypeAdapter(HttpUrl).validate_python(...)``
#: if a real ``HttpUrl`` instance is wanted. Confirm downstream usage.
SOURCE_GRAPH_STREAM_API_URL: Final[
    HttpUrl
] = "https://sourcegraph.com/.api/search/stream"
#: The ID of a repository from the SourceGraph API (a distinct int type so
#: plain ints cannot be passed where a repo id is expected).
SourceGraphRepoId = NewType("SourceGraphRepoId", int)
class SourceGraphRepoData(BaseModel):
    """The data of a repository from a SourceGraph stream ``repo`` match.

    Field aliases mirror the JSON keys emitted by the SourceGraph Stream API
    (``repositoryID``, ``repository``, ``repoStars``, ``repoLastFetched``),
    so instances are built by validating the raw event payload.
    """

    # Discriminator: only payloads with "type": "repo" match this model.
    type: Literal["repo"]
    # SourceGraph-internal numeric repository id.
    repo_id: SourceGraphRepoId = Field(..., alias="repositoryID")
    # Repository handle, e.g. "github.com/tiangolo/sqlmodel".
    repo_handle: str = Field(..., alias="repository")
    # Star count; NonNegativeInt rejects negative values at validation time.
    stars: NonNegativeInt = Field(..., alias="repoStars")
    # When SourceGraph last fetched the repository (ISO timestamp in payload).
    last_fetched_at: datetime.datetime = Field(..., alias="repoLastFetched")
    # Not every match carries a description, so default to the empty string.
    description: str = Field(default="")

    @computed_field
    @property
    def repo_url(self: Self) -> HttpUrl:
        """The URL of the repository, derived from its handle."""
        # Validate through TypeAdapter so the computed field is a real HttpUrl.
        return TypeAdapter(HttpUrl).validate_python(f"https://{self.repo_handle}")
#: The type adapter for a single SourceGraphRepoData payload.
SourceGraphRepoDataAdapter = TypeAdapter(SourceGraphRepoData)
#: The type adapter for a list of SourceGraphRepoData payloads (used to
#: parse the "matches" SSE event, which carries a JSON array).
SourceGraphRepoDataListAdapter = TypeAdapter(list[SourceGraphRepoData])
#: The query parameters for the SourceGraph SSE API: public, non-archived,
#: non-fork repositories whose content imports FastAPI.
#: NOTE(review): the content filter spells ``FastApi`` while the library
#: exports ``FastAPI`` — presumably this relies on SourceGraph search being
#: case-insensitive by default; confirm the intended casing.
FASTAPI_REPOS_QUERY_PARAMS: Final[Mapping[str, str]] = {
    "q": quote(
        " ".join(
            [
                "repo:has.content(from fastapi import FastApi)",
                "type:repo",
                "visibility:public",
                "archived:no",
                "fork:no",
            ]
        )
    ),
}
class AsyncSourceGraphSSEClient:
"""A client for the SourceGraph SSE API."""
"""
A client for the SourceGraph SSE API.
To learn more about the underlying API, see the ``SourceGraph SSE API``
https://docs.sourcegraph.com/api/stream_api#sourcegraph-stream-api
"""
def __init__(self: Self) -> None:
"""Initialize the client."""
@ -68,18 +122,8 @@ class AsyncSourceGraphSSEClient:
    async def aiter_fastapi_repos(self: Self) -> AsyncGenerator[ServerSentEvent, None]:
        """Iterate over the SourceGraph SSE API with retries."""
        # NOTE(review): this span is a rendered diff hunk with +/- markers
        # stripped — the inline ``params`` dict and the first ``async for``
        # loop below are the *removed* code; the loop over
        # FASTAPI_REPOS_QUERY_PARAMS that filters for the "matches" event is
        # the replacement. Confirm against the applied file before relying on
        # this text as runnable code.
        params = {
            "q": quote(
                " ".join(
                    [
                        "repo:has.content(from fastapi import FastApi)",
                        "type:repo",
                        "visibility:public",
                        "archived:no",
                        "fork:no",
                    ]
                )
            ),
        }
        async for event in self._aiter_sse_with_retries(params=params):
            yield event
        # Added version: only the "matches" events carry repo payloads; parse
        # each JSON array into a list of SourceGraphRepoData before yielding.
        async for event in self._aiter_sse_with_retries(
            params=FASTAPI_REPOS_QUERY_PARAMS
        ):
            if event.event == "matches":
                yield SourceGraphRepoDataListAdapter.validate_python(event.json())

View File

@ -0,0 +1 @@
"""Test the scraping of the SourceGraph API."""

View File

@ -0,0 +1,118 @@
"""Test the client module of the scraper app."""
import pytest
from dirty_equals import HasLen, IsDatetime, IsInstance, IsPositiveInt
from pydantic import Json, TypeAdapter
from app.scraper.client import SourceGraphRepoData
@pytest.fixture
def source_graph_matched_repos_data() -> Json:
    """Return sample data of matched repositories.

    Four ``"type": "repo"`` entries shaped like the SourceGraph Stream API
    "matches" payload. Two entries carry an optional ``metadata`` mapping,
    and the last (realpython/materials) omits ``description`` — useful for
    exercising the model's defaulting behavior.
    """
    return [
        {
            "type": "repo",
            "repositoryID": 55636527,
            "repository": "github.com/tiangolo/sqlmodel",
            "repoStars": 10277,
            "repoLastFetched": "2023-07-31T18:47:22.875731Z",
            "description": (
                "SQL databases in Python, designed "
                "for simplicity, compatibility, "
                "and robustness."
            ),
            "metadata": {
                "fastapi": "null",
                "json": "null",
                "json-schema": "null",
                "pydantic": "null",
                "python": "null",
                "sql": "null",
                "sqlalchemy": "null",
            },
        },
        {
            "type": "repo",
            "repositoryID": 59434622,
            "repository": "github.com/reflex-dev/reflex",
            "repoStars": 10061,
            "repoLastFetched": "2023-07-31T08:58:42.692906Z",
            "description": "(Previously Pynecone) 🕸 Web apps in pure Python 🐍",
        },
        {
            "type": "repo",
            "repositoryID": 42982149,
            "repository": "github.com/PaddlePaddle/PaddleNLP",
            "repoStars": 9804,
            "repoLastFetched": "2023-07-31T16:48:08.839209Z",
            "description": (
                "👑 Easy-to-use and powerful NLP library with 🤗 "
                "Awesome model zoo, supporting wide-range of NLP tasks "
                "from research to industrial applications, including"
                " 🗂Text Classification, 🔍 Neural Search, ❓ Question "
                "Answering, Information Extraction, "
                "📄 Document Intelligence, 💌 Sentiment Analysis etc."
            ),
            "metadata": {
                "bert": "null",
                "embedding": "null",
                "ernie": "null",
                "information-extraction": "null",
                "neural-search": "null",
                "nlp": "null",
                "paddlenlp": "null",
                "pretrained-models": "null",
                "question-answering": "null",
                "search-engine": "null",
                "semantic-analysis": "null",
                "sentiment-analysis": "null",
                "seq2seq": "null",
                "transformer": "null",
                "transformers": "null",
                "uie": "null",
            },
        },
        {
            "type": "repo",
            "repositoryID": 36246068,
            "repository": "github.com/realpython/materials",
            "repoStars": 4359,
            "repoLastFetched": "2023-07-31T05:15:16.993896Z",
        },
    ]
def test_source_graph_repo_data(source_graph_matched_repos_data: Json) -> None:
    """Test the SourceGraphRepoData deserialization.

    Validates the raw fixture payload into a list of models and checks each
    parsed field round-trips against the corresponding raw entry.
    """
    # BUG FIX: the fixture contains FOUR repo entries, not three — the
    # previous ``HasLen(3)`` assertions failed against the sample data.
    assert source_graph_matched_repos_data == HasLen(4)
    _SourceGraphRepoDataListValidator = TypeAdapter(list[SourceGraphRepoData])
    repos_parsed = _SourceGraphRepoDataListValidator.validate_python(
        source_graph_matched_repos_data
    )
    assert repos_parsed == HasLen(4)
    assert all(repo == IsInstance[SourceGraphRepoData] for repo in repos_parsed)
    # Pair each parsed model with its raw payload once instead of rebuilding
    # the zip in every assertion; strict=True guards against length drift.
    pairs = list(
        zip(repos_parsed, source_graph_matched_repos_data, strict=True)
    )
    assert all(repo.repo_id == repo_data["repositoryID"] for repo, repo_data in pairs)
    assert all(
        repo.repo_handle == repo_data["repository"] for repo, repo_data in pairs
    )
    assert all(
        repo.stars == IsPositiveInt and repo.stars == repo_data["repoStars"]
        for repo, repo_data in pairs
    )
    assert all(
        str(repo.repo_url) == f"https://{repo_data['repository']}"
        for repo, repo_data in pairs
    )
    assert all(repo.last_fetched_at == IsDatetime for repo in repos_parsed)