feat: add GitHub dependency crawler and Dash dashboard, drop old scraping scripts

Marcelo Trylesinski 2020-11-20 04:39:14 +01:00
parent 233b94976a
commit cc9dbf28c4
14 changed files with 237 additions and 15533 deletions

10
.gitignore vendored

@@ -3,3 +3,13 @@ links.txt
unique_links.txt
imports.txt
reps/
# Text editor
.vscode
# Temporary files
tmp/*
!tmp/.gitkeep
# Python
**/__pycache__/

57
app.py Normal file

@@ -0,0 +1,57 @@
import json

import dash
import dash_html_components as html
import dash_table
from dash.dependencies import Input, Output

HEADERS = ("name", "age", "dependencies")

app = dash.Dash()

with open("results.json") as json_file:
    data = json.load(json_file)
    print(data)

app.layout = html.Div(
    [
        dash_table.DataTable(
            id="datatable-interactivity",
            columns=[
                {"name": i.capitalize(), "id": i, "deletable": True, "selectable": True}
                for i in HEADERS
            ],
            data=data,
            editable=True,
            filter_action="native",
            sort_action="native",
            sort_mode="multi",
            column_selectable="single",
            row_selectable="multi",
            row_deletable=True,
            selected_columns=[],
            selected_rows=[],
            page_action="native",
            page_current=0,
            page_size=10,
        ),
        html.Div(id="datatable-interactivity-container"),
    ]
)


@app.callback(
    Output("datatable-interactivity", "style_data_conditional"),
    [Input("datatable-interactivity", "selected_columns")],
)
def update_styles(selected_columns):
    return [
        {"if": {"column_id": i}, "background_color": "#D2F3FF"}
        for i in selected_columns
    ]


if __name__ == "__main__":
    app.run_server(debug=True)
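
A note on the data contract: the DataTable's data property expects a list of row dicts keyed by the column ids, and app.py feeds it whatever json.load returns from results.json (whose diff is suppressed further down because of its size). The file presumably now holds something shaped like the sketch below; the values are hypothetical and the real file may differ:

data = [
    {"name": "some-project", "age": 3, "dependencies": "fastapi, sqlalchemy"},
    {"name": "another-project", "age": 1, "dependencies": "fastapi, requests"},
]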

2
env

@@ -1,2 +0,0 @@
USERNAME=YOUR_USERNAME
PASSWORD=YOUR_PASSWORD


@@ -0,0 +1,166 @@
import logging
import os
import re
import shutil
from contextlib import contextmanager
from typing import Set

from git import Git
from git.exc import GitCommandError
from github import Github
from github.Repository import Repository
from sqlalchemy import ForeignKey, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.sql.schema import Column, UniqueConstraint
from sqlalchemy.sql.sqltypes import Integer, String

logging.basicConfig(level=logging.INFO)

# Github
github_access_token = os.getenv("ACCESS_TOKEN_GITHUB")
g = Github(github_access_token)
MAX_SIZE = 100 * 1000  # 100 MB (the GitHub API reports repository sizes in KB)

# Directory
base_dir = os.getcwd()
clone_dir = os.path.join(base_dir, "tmp")
INVALID_FOLDERS = ("site-packages", "venv")

# Database
engine = create_engine("sqlite:///packages.db")
SessionLocal = sessionmaker(bind=engine)
Base = declarative_base()


class Association(Base):
    __tablename__ = "association"

    package_id = Column(Integer, ForeignKey("package.id"), primary_key=True)
    project_id = Column(Integer, ForeignKey("project.id"), primary_key=True)
    package = relationship("Package", backref="package_associations")
    project = relationship("Project", backref="project_associations")


class Package(Base):
    __tablename__ = "package"
    __table_args__ = (UniqueConstraint("name"),)

    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)


class Project(Base):
    __tablename__ = "project"
    __table_args__ = (UniqueConstraint("name", "owner"),)

    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)
    owner = Column(String)
    packages = relationship("Package", secondary="association")


Base.metadata.create_all(engine, checkfirst=True)

@contextmanager
def get_session():
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()


# Functions
def clone(repository: Repository):
    try:
        clone_url = repository.clone_url
        Git(clone_dir).clone(clone_url)
    except GitCommandError:
        # The repository is already cloned (or the clone failed); skip it.
        pass

def get_packages_from_file(path: str) -> Set[str]:
    packages = set()
    logging.info("Reading file '%s'.", path)
    try:
        with open(path, "r") as file:
            for line in file.readlines():
                # Capture the top-level package of `from x ...` and `import x` statements.
                result = re.search(r"^\s*from (\w+)|^\s*import (\w+)", line)
                if result:
                    if result.group(1):
                        packages.add(result.group(1))
                    if result.group(2):
                        packages.add(result.group(2))
    except FileNotFoundError:
        logging.info("File not found '%s'.", path)
    except UnicodeDecodeError:
        logging.info("Invalid character on file '%s'.", path)
    return packages

def extract_data(repository: Repository) -> dict:
    data = {"packages": set()}
    # Collect imports from every Python file in the repository, skipping
    # vendored directories such as site-packages and venv.
    for (root, _, files) in os.walk(os.path.join(clone_dir, repository.name)):
        for file in files:
            path = os.path.join(root, file)
            if file.endswith(".py") and all(
                folder not in path for folder in INVALID_FOLDERS
            ):
                data["packages"] |= get_packages_from_file(path)
    return data

def run():
    # Search GitHub for code that imports FastAPI.
    snippets = g.search_code('"from fastapi import FastAPI" language:python')
    for snippet in snippets:
        repository = snippet.repository
        name = repository.name
        owner = repository.owner.login  # the login is always set, unlike the display name
        logging.info("Got repository '%s'.", name)
        with get_session() as session:
            if (
                session.query(Project)
                .filter(Project.name == name, Project.owner == owner)
                .first()
            ):
                continue
        # NOTE: When deployed, ignore repositories that haven't changed recently.
        # from datetime import datetime
        # commits = repository.get_commits()
        # last_commit_date = [commit.commit.author.date for commit in commits][0]
        # if (datetime.today() - last_commit_date).days > 7:
        #     continue
        if repository.size > MAX_SIZE:
            continue
        logging.info("Cloning repository '%s'.", name)
        clone(repository)
        logging.info("Extracting data from '%s'.", name)
        data = extract_data(repository)
        with get_session() as session:
            project = Project(name=name, owner=owner)
            for package_name in data.get("packages", set()):
                package = (
                    session.query(Package).filter(Package.name == package_name).first()
                )
                if package is None:
                    package = Package(name=package_name)
                project.packages.append(package)
            session.add(project)
            session.commit()
        logging.info("Removing repository '%s'.", name)
        shutil.rmtree(os.path.join(clone_dir, name))


if __name__ == "__main__":
    run()
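
The crawler persists what it finds in packages.db rather than results.json, so some export step is still needed before the Dash table can show it. As a rough illustration only (this helper is not part of the commit), the association table defined above can be aggregated with plain SQLAlchemy to see how many stored projects pull in each package:

from sqlalchemy import create_engine

# Reuse the SQLite database the crawler writes to.
engine = create_engine("sqlite:///packages.db")

# Count how many stored projects depend on each package.
query = (
    "SELECT package.name, COUNT(*) AS projects "
    "FROM association "
    "JOIN package ON package.id = association.package_id "
    "GROUP BY package.name "
    "ORDER BY projects DESC"
)
with engine.connect() as connection:
    for name, projects in connection.execute(query):
        print(f"{name}: {projects}")

Running the crawler itself only requires ACCESS_TOKEN_GITHUB to be set in the environment; that is the only variable it reads.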

BIN
packages.db Normal file

Binary file not shown.

4
requirements.txt Normal file

@@ -0,0 +1,4 @@
dash==1.17.0
PyGithub==1.53
gitpython==3.1.11
sqlalchemy==1.3.20
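
These pins cover both halves of the commit: PyGithub, gitpython and sqlalchemy for the crawler, and dash for the dashboard (dash 1.17 declares dash-table and dash-html-components as its own dependencies, so the imports in app.py resolve without extra entries). A plain pip install -r requirements.txt should be enough; requests, python-dotenv and giturlparse are no longer needed now that the old scripts are gone.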

15385
results.json

File diff suppressed because it is too large


@@ -1,21 +0,0 @@
import shutil

import git
from git.repo.base import Repo
from giturlparse import parse

# class Progress(git.remote.RemoteProgress):
#     def update(self, op_code, cur_count, max_count=None, message=''):
#         print(self._cur_line)

with open("unique_links.txt") as fp:
    links = fp.readlines()

for i, link in enumerate(links, start=1):
    link = link.rstrip()
    name = parse(link).name
    print(f"File num: {i}")
    Repo.clone_from(link, name)
    try:
        shutil.move(name, "reps")
    except:
        shutil.rmtree(name)


@@ -1,13 +0,0 @@
f_in = open("links.txt", "r")
f_out = open("unique_links.txt", "w")

links = set()
for line in f_in.readlines():
    links.add(line)

for link in links:
    f_out.write(link)

f_in.close()
f_out.close()


@@ -1,29 +0,0 @@
import json
import re
from typing import Dict, Union

f_in = open("imports.txt", "r")

mp: Dict[str, Union[set, list]] = {}

for line in f_in.readlines():
    try:
        rep_name = line.split("/")[1]
    except IndexError:
        rep_name = ""
    mp[rep_name] = mp.get(rep_name, set())
    result = re.search(r"from (\w+)[\.\w+]*|:[ ]*import (\w*)\n", line)
    if result:
        if result.group(1):
            mp[rep_name].add(result.group(1))
        if result.group(2):
            mp[rep_name].add(result.group(2))

for key in mp:
    mp[key] = list(mp[key])

with open("results.json", "w") as f:
    json.dump(mp, f, sort_keys=True, indent=2)

print(len(mp))
f_in.close()


@@ -1,4 +0,0 @@
for file in $(find reps -maxdepth 1 -type d); do
    grep -r "import" --include \*.py $file > imports.txt
done


@@ -1,57 +0,0 @@
import json
import os
import re
import sys
from time import sleep

import requests
from dotenv import load_dotenv

load_dotenv()
username = os.getenv("GITHUB_USERNAME")
password = os.getenv("GITHUB_PASSWORD")

API_URL = "https://api.github.com"


def get_response(page: int) -> dict:
    res = requests.get(
        f"{API_URL}/search/code",
        auth=(username, password),
        params={"q": "fastapi language:Python", "per_page": 100, "page": page},
    )
    return res


def get_next_link(link_header: str) -> str:
    return getattr(
        {
            rel: link
            for (link, rel) in re.findall(r'<(http.*?)>; rel="(.*?)"', link_header)
        },
        "next",
        None,
    )


filename = "links.txt"
file1 = open(filename, "a")  # append mode

has_next = True
page = 1
while has_next:
    sleep(1)
    res = get_response(page)
    res_json = res.json()
    if "items" in res_json:
        for item in res_json["items"]:
            file1.write(f"{item['repository']['html_url']}\n")
    print(f"Page: {page}")
    print(res.headers)
    # print(json.dumps(res_json, indent=4, sort_keys=True))
    # print(res.headers.get('X-RateLimit-Reset', 0))
    if int(
        res.headers.get("X-RateLimit-Remaining", 0)
    ) == 0 or "422" in res.headers.get("Status", "422"):
        has_next = False
    page += 1

file1.close()


@@ -1,22 +0,0 @@
import re
import sys

filename_in = sys.argv[1]
filename_out = sys.argv[2]

file_in = open(filename_in, "r")
lines = file_in.readlines()

file_out = open(filename_out, "w")

imports = set()
for line in lines:
    match1 = re.search(r"(from *(?!\.)(.+?)(?= |\.))", line)
    match2 = re.search(r"(: *(import) (.+))", line)
    if match1 is not None:
        imports.add(match1.group(2))
    if match2 is not None:
        imports.add(match2.group(3))

for imp in sorted(list(imports)):
    file_out.write(f"{imp}\n")

0
tmp/.gitkeep Normal file