Add parsing of the SourceGraph SSE data

Vladyslav Fedoriuk 2023-07-31 23:16:37 +02:00
parent dae524ab0c
commit 9fa3b8e43b
5 changed files with 183 additions and 19 deletions

View File

@@ -52,4 +52,4 @@ jobs:
           python -m pytest -v -s --failed-first --cov=app --cov-report=xml
       - name: Generate Coverage Report
         run: |
-          coverage report -m
+          python -m coverage report -m

View File

@@ -20,6 +20,7 @@ repos:
     rev: v0.0.280
     hooks:
       - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:

View File

@@ -1,23 +1,77 @@
 """The client for the SourceGraph API."""
+import datetime
 from collections.abc import AsyncGenerator, Mapping
 from contextlib import asynccontextmanager
 from datetime import timedelta
-from typing import Any, AnyStr, Final, Self
+from typing import Any, AnyStr, Final, Literal, NewType, Self
 from urllib.parse import quote

 import httpx
 import stamina
 from httpx_sse import EventSource, ServerSentEvent, aconnect_sse
-from pydantic import HttpUrl
+from pydantic import (
+    BaseModel,
+    Field,
+    HttpUrl,
+    NonNegativeInt,
+    TypeAdapter,
+    computed_field,
+)

 #: The URL of the SourceGraph SSE API.
 SOURCE_GRAPH_STREAM_API_URL: Final[
     HttpUrl
 ] = "https://sourcegraph.com/.api/search/stream"

+#: The ID of a repository from the SourceGraph API.
+SourceGraphRepoId = NewType("SourceGraphRepoId", int)
+
+
+class SourceGraphRepoData(BaseModel):
+    """The data of a repository."""
+
+    type: Literal["repo"]
+    repo_id: SourceGraphRepoId = Field(..., alias="repositoryID")
+    repo_handle: str = Field(..., alias="repository")
+    stars: NonNegativeInt = Field(..., alias="repoStars")
+    last_fetched_at: datetime.datetime = Field(..., alias="repoLastFetched")
+    description: str = Field(default="")
+
+    @computed_field
+    @property
+    def repo_url(self: Self) -> HttpUrl:
+        """The URL of the repository."""
+        return TypeAdapter(HttpUrl).validate_python(f"https://{self.repo_handle}")
+
+
+#: The type adapter for the SourceGraphRepoData.
+SourceGraphRepoDataAdapter = TypeAdapter(SourceGraphRepoData)
+
+#: The type adapter for the SourceGraphRepoData list.
+SourceGraphRepoDataListAdapter = TypeAdapter(list[SourceGraphRepoData])
+
+#: The query parameters for the SourceGraph SSE API.
+FASTAPI_REPOS_QUERY_PARAMS: Final[Mapping[str, str]] = {
+    "q": quote(
+        " ".join(
+            [
+                "repo:has.content(from fastapi import FastApi)",
+                "type:repo",
+                "visibility:public",
+                "archived:no",
+                "fork:no",
+            ]
+        )
+    ),
+}


 class AsyncSourceGraphSSEClient:
-    """A client for the SourceGraph SSE API."""
+    """
+    A client for the SourceGraph SSE API.
+
+    To learn more about the underlying API, see the ``SourceGraph SSE API``
+    https://docs.sourcegraph.com/api/stream_api#sourcegraph-stream-api
+    """

     def __init__(self: Self) -> None:
         """Initialize the client."""
@@ -68,18 +122,8 @@ class AsyncSourceGraphSSEClient:
     async def aiter_fastapi_repos(self: Self) -> AsyncGenerator[ServerSentEvent, None]:
         """Iterate over the SourceGraph SSE API with retries."""
-        params = {
-            "q": quote(
-                " ".join(
-                    [
-                        "repo:has.content(from fastapi import FastApi)",
-                        "type:repo",
-                        "visibility:public",
-                        "archived:no",
-                        "fork:no",
-                    ]
-                )
-            ),
-        }
-        async for event in self._aiter_sse_with_retries(params=params):
-            yield event
+        async for event in self._aiter_sse_with_retries(
+            params=FASTAPI_REPOS_QUERY_PARAMS
+        ):
+            if event.event == "matches":
+                yield SourceGraphRepoDataListAdapter.validate_python(event.json())
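A rough consumption sketch (again, not part of the commit), assuming AsyncSourceGraphSSEClient needs no arguments or setup beyond what its __init__ shows, and that each yielded item is the validated list of SourceGraphRepoData parsed from a "matches" event:

import asyncio

from app.scraper.client import AsyncSourceGraphSSEClient


async def main() -> None:
    client = AsyncSourceGraphSSEClient()
    # Each iteration yields the repositories parsed from one "matches" event.
    async for repos in client.aiter_fastapi_repos():
        for repo in repos:
            print(repo.repo_handle, repo.stars, repo.repo_url)


asyncio.run(main())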

View File

@@ -0,0 +1 @@
"""Test the scraping of the SourceGraph API."""

View File

@@ -0,0 +1,118 @@
"""Test the client module of the scraper app."""
import pytest
from dirty_equals import HasLen, IsDatetime, IsInstance, IsPositiveInt
from pydantic import Json, TypeAdapter

from app.scraper.client import SourceGraphRepoData


@pytest.fixture
def source_graph_matched_repos_data() -> Json:
    """Return the sample data of the matched repositories."""
    return [
        {
            "type": "repo",
            "repositoryID": 55636527,
            "repository": "github.com/tiangolo/sqlmodel",
            "repoStars": 10277,
            "repoLastFetched": "2023-07-31T18:47:22.875731Z",
            "description": (
                "SQL databases in Python, designed "
                "for simplicity, compatibility, "
                "and robustness."
            ),
            "metadata": {
                "fastapi": "null",
                "json": "null",
                "json-schema": "null",
                "pydantic": "null",
                "python": "null",
                "sql": "null",
                "sqlalchemy": "null",
            },
        },
        {
            "type": "repo",
            "repositoryID": 59434622,
            "repository": "github.com/reflex-dev/reflex",
            "repoStars": 10061,
            "repoLastFetched": "2023-07-31T08:58:42.692906Z",
            "description": "(Previously Pynecone) 🕸 Web apps in pure Python 🐍",
        },
        {
            "type": "repo",
            "repositoryID": 42982149,
            "repository": "github.com/PaddlePaddle/PaddleNLP",
            "repoStars": 9804,
            "repoLastFetched": "2023-07-31T16:48:08.839209Z",
            "description": (
                "👑 Easy-to-use and powerful NLP library with 🤗 "
                "Awesome model zoo, supporting wide-range of NLP tasks "
                "from research to industrial applications, including"
                " 🗂Text Classification, 🔍 Neural Search, ❓ Question "
                "Answering, Information Extraction, "
                "📄 Document Intelligence, 💌 Sentiment Analysis etc."
            ),
            "metadata": {
                "bert": "null",
                "embedding": "null",
                "ernie": "null",
                "information-extraction": "null",
                "neural-search": "null",
                "nlp": "null",
                "paddlenlp": "null",
                "pretrained-models": "null",
                "question-answering": "null",
                "search-engine": "null",
                "semantic-analysis": "null",
                "sentiment-analysis": "null",
                "seq2seq": "null",
                "transformer": "null",
                "transformers": "null",
                "uie": "null",
            },
        },
        {
            "type": "repo",
            "repositoryID": 36246068,
            "repository": "github.com/realpython/materials",
            "repoStars": 4359,
            "repoLastFetched": "2023-07-31T05:15:16.993896Z",
        },
    ]


def test_source_graph_repo_data(source_graph_matched_repos_data: Json) -> None:
    """Test the SourceGraphRepoData deserialization."""
    assert source_graph_matched_repos_data == HasLen(4)
    _SourceGraphRepoDataListValidator = TypeAdapter(list[SourceGraphRepoData])
    repos_parsed = _SourceGraphRepoDataListValidator.validate_python(
        source_graph_matched_repos_data
    )
    assert repos_parsed == HasLen(4)
    assert all(repo == IsInstance[SourceGraphRepoData] for repo in repos_parsed)
    assert all(
        repo.repo_id == repo_data["repositoryID"]
        for repo, repo_data in zip(
            repos_parsed, source_graph_matched_repos_data, strict=True
        )
    )
    assert all(
        repo.repo_handle == repo_data["repository"]
        for repo, repo_data in zip(
            repos_parsed, source_graph_matched_repos_data, strict=True
        )
    )
    assert all(
        repo.stars == IsPositiveInt and repo.stars == repo_data["repoStars"]
        for repo, repo_data in zip(
            repos_parsed, source_graph_matched_repos_data, strict=True
        )
    )
    assert all(
        str(repo.repo_url) == f"https://{repo_data['repository']}"
        for repo, repo_data in zip(
            repos_parsed, source_graph_matched_repos_data, strict=True
        )
    )
    assert all(repo.last_fetched_at == IsDatetime for repo in repos_parsed)
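The bare == comparisons above rely on dirty_equals check objects, which compare equal to plain values; a tiny self-contained illustration of the idiom (not from this commit):

import datetime

from dirty_equals import HasLen, IsDatetime, IsInstance, IsPositiveInt

assert [1, 2, 3, 4] == HasLen(4)              # length check via equality
assert 10277 == IsPositiveInt                 # the bare class also works
assert datetime.datetime.now() == IsDatetime  # same for datetime checks
assert 3.14 == IsInstance[float]              # parametrized instance check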