feat: add full scripts setup

This commit is contained in:
Marcelo Trylesinski 2020-07-10 00:20:38 +02:00
parent 2a70435cc9
commit 795e9fe782
9 changed files with 15535 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
.env
links.txt
unique_links.txt
imports.txt
reps/

2
env Normal file
View File

@ -0,0 +1,2 @@
USERNAME=YOUR_USERNAME
PASSWORD=YOUR_PASSWORD

29
extract.py Normal file
View File

@ -0,0 +1,29 @@
import json
import re
from typing import Dict, List, Set, Union

# Map each repository name to the set of top-level modules it imports.
# Input lines come from scripts/list_imports.sh grep output, shaped like
# "reps/<repo>/<path>.py:<source line>".
mp: Dict[str, Union[Set[str], List[str]]] = {}

# Use a context manager so the file is closed even if parsing raises.
with open("imports.txt", "r") as f_in:
    for line in f_in:
        try:
            # Field 1 of the "/"-split path is the repository directory name.
            rep_name = line.split('/')[1]
        except IndexError:
            # Line did not contain a path; bucket it under the empty key.
            rep_name = ""
        mp.setdefault(rep_name, set())
        # Capture either "from <module>..." or ": import <module>".
        result = re.search(r'from (\w+)[\.\w+]*|:[ ]*import (\w*)\n', line)
        if result:
            if result.group(1):
                mp[rep_name].add(result.group(1))
            if result.group(2):
                mp[rep_name].add(result.group(2))

# JSON cannot serialize sets, so convert each value to a list first.
for key in mp:
    mp[key] = list(mp[key])

with open('results.json', 'w') as f:
    json.dump(mp, f, sort_keys=True, indent=2)

# Number of repositories seen.
print(len(mp))

15385
results.json Normal file

File diff suppressed because it is too large Load Diff

21
scripts/clone_all.py Normal file
View File

@ -0,0 +1,21 @@
import shutil

import git
from git.repo.base import Repo
from giturlparse import parse

# Clone every repository listed in unique_links.txt, then move the fresh
# clone into the reps/ directory.
with open('unique_links.txt') as fp:
    links = fp.readlines()

for i, link in enumerate(links, start=1):
    link = link.rstrip()
    # Repository name derived from the git URL, used as the clone directory.
    name = parse(link).name
    print(f'File num: {i}')
    Repo.clone_from(link, name)
    try:
        shutil.move(name, 'reps')
    except shutil.Error:
        # BUG FIX: was a bare `except:` that swallowed everything (including
        # KeyboardInterrupt). shutil.move raises shutil.Error when the
        # destination already contains a directory with this name; in that
        # case discard the duplicate clone instead of aborting the run.
        shutil.rmtree(name)

View File

@ -0,0 +1,13 @@
# Deduplicate the scraped links.txt into unique_links.txt.
# NOTE: lines are written back in set-iteration order, which is
# unspecified — same as the original behavior.
with open("links.txt", "r") as f_in:
    # A set over the file's lines drops exact duplicates in one pass.
    links = set(f_in)

with open("unique_links.txt", "w") as f_out:
    f_out.writelines(links)

4
scripts/list_imports.sh Normal file
View File

@ -0,0 +1,4 @@
# Collect every "import" line from the cloned repositories into imports.txt.
# BUG FIX: the original used "> imports.txt" INSIDE the loop, truncating the
# file on every iteration so only the last directory's matches survived.
# Redirect the whole loop instead, and use -mindepth 1 so "reps" itself is
# not matched (grep -r on it would duplicate every subdirectory's output).
for file in $(find reps -mindepth 1 -maxdepth 1 -type d); do
    grep -r "import" --include \*.py "$file"
done > imports.txt

54
scripts/query.py Normal file
View File

@ -0,0 +1,54 @@
import json
import os
import re
import sys
from time import sleep
from typing import Optional

import requests
from dotenv import load_dotenv

load_dotenv()
# NOTE(review): the committed "env" file defines USERNAME/PASSWORD, but the
# code reads GITHUB_USERNAME/GITHUB_PASSWORD — confirm which names are
# intended; with the committed file these resolve to None.
username = os.getenv('GITHUB_USERNAME')
password = os.getenv('GITHUB_PASSWORD')

API_URL = 'https://api.github.com'


def get_response(page: int) -> requests.Response:
    """Fetch one page (100 results) of GitHub code-search hits for FastAPI."""
    res = requests.get(
        f'{API_URL}/search/code',
        auth=(username, password),
        params={
            'q': 'fastapi language:Python',
            'per_page': 100,
            'page': page
        }
    )
    return res


def get_next_link(link_header: str) -> Optional[str]:
    """Return the URL tagged rel="next" in a Link response header, or None.

    BUG FIX: the original called getattr() on a dict — which always returned
    the default, since dicts expose items via [] / .get, not attributes — and
    used the `re` module without importing it (NameError on first call).
    """
    links = {
        rel: link
        for (link, rel) in re.findall(r'<(http.*?)>; rel="(.*?)"', link_header)
    }
    return links.get('next')


has_next = True
page = 1
# Append mode so repeated runs accumulate into the same links file; the
# context manager guarantees the file is flushed and closed on any exit.
with open("links.txt", "a") as file1:
    while has_next:
        sleep(1)  # stay under the search-API rate limit
        res = get_response(page)
        res_json = res.json()
        if 'items' in res_json:
            for item in res_json['items']:
                file1.write(f"{item['repository']['html_url']}\n")
        print(f"Page: {page}")
        print(res.headers)
        # print(json.dumps(res_json, indent=4, sort_keys=True))
        # print(res.headers.get('X-RateLimit-Reset', 0))
        # Stop when the rate limit is exhausted or the API rejects the page
        # (search results are capped; deep pages come back as 422).
        if int(res.headers.get('X-RateLimit-Remaining', 0)) == 0 or '422' in res.headers.get('Status', '422'):
            has_next = False
        page += 1

22
scripts/unique_imports.py Normal file
View File

@ -0,0 +1,22 @@
import re
import sys

# Usage: unique_imports.py <input-file> <output-file>
# Extract the unique top-level module names from grep output lines shaped
# like "path/file.py: from x.y import z" or "path/file.py: import x".

filename_in = sys.argv[1]
filename_out = sys.argv[2]

# Compile the patterns once instead of recompiling on every line.
# (?!\.) skips relative imports such as "from . import x".
FROM_RE = re.compile(r'(from *(?!\.)(.+?)(?= |\.))')
IMPORT_RE = re.compile(r'(: *(import) (.+))')

imports = set()
# BUG FIX: the original never closed either file handle; use context
# managers so the output is always flushed and both files are closed.
with open(filename_in, "r") as file_in:
    for line in file_in:
        match1 = FROM_RE.search(line)
        match2 = IMPORT_RE.search(line)
        if match1 is not None:
            imports.add(match1.group(2))
        if match2 is not None:
            imports.add(match2.group(3))

with open(filename_out, "w") as file_out:
    for imp in sorted(imports):
        file_out.write(f"{imp}\n")