-
Notifications
You must be signed in to change notification settings - Fork 247
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Squashed 21 commits
- Loading branch information
Showing
3 changed files
with
86 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Repo Info Helper | ||
|
||
This script scans a given .csv file (works on both `pr-data.csv` and `py-data.csv`), and outputs another .csv file, with 3 columns: | ||
|
||
* Repo URL | ||
* Months since latest commit to master/main | ||
* Number of stars | ||
|
||
... which are sorted first in ascending order of months since the latest commit to master/main and then, among repositories with the same value, in descending order of number of stars. | ||
|
||
The latter 2 values will help us in shortlisting a project to fix flaky tests in. The chances of your PR getting accepted are higher for a repository that is actively maintained and has a high number of stars. This script will only scan URLs that have an empty `Status` column. | ||
|
||
## To run: | ||
|
||
* Requires a GitHub access token if more than 60 requests are made (i.e. the file contains more than 60 unique repositories), which is highly likely, since `pr-data.csv` and `py-data.csv` each contain 300+ unique repositories at the time of writing (Nov 2022). | ||
|
||
* The following commands run the script from the root directory. Remember to pass a GitHub access token to overcome the rate limit: | ||
* For `pr-data.csv`: `python repo-info/get_repo_info.py -f pr-data.csv -c 'Project URL' -t <github-access-token>` | ||
* For `py-data.csv`: `python repo-info/get_repo_info.py -f py-data.csv -c 'Project URL' -t <github-access-token>` | ||
|
||
The new file will be saved with the name `repo-info.csv` inside the `repo-info` directory. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import os | ||
import argparse | ||
import datetime | ||
import pandas as pd | ||
from tqdm import tqdm | ||
from github import Github | ||
|
||
# Enable tqdm's pandas integration; `progress_apply` is used further down in
# this script to show a progress bar while repositories are queried.
tqdm.pandas()

# Command-line interface. The CSV path and the URL column name are mandatory:
# without them no data can be loaded, so argparse should reject the call with
# a usage message instead of letting pandas fail later with an obscure error.
# The access token stays optional (anonymous access works for small files).
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--github_access_token', help='GitHub access token to overcome API rate limitations')
parser.add_argument('-f', '--filepath', required=True, help='Filepath of .csv file containing repo data')
parser.add_argument('-c', '--colname', required=True, help='Column name in CSV file pertaining to repo URL')
args = parser.parse_args()
|
||
FILEPATH, COLNAME, GITHUB_ACCESS_TOKEN = args.filepath, args.colname, args.github_access_token

# Hourly request budget used by the pre-flight check. GitHub grants 5000
# requests per hour to authenticated clients but only 60 to anonymous ones,
# so the limit depends on whether a token was supplied on the command line.
GITHUB_API_RATE_LIMIT = 5000 if GITHUB_ACCESS_TOKEN else 60
|
||
# Load the input CSV and keep only the rows whose `Status` cell is empty;
# rows that already carry a status are not scanned.
data = pd.read_csv(FILEPATH)
data = data.loc[data['Status'].isna()]

# Every repository is queried once, however many rows mention it.
REPO_URLS = data[COLNAME].unique()
NUM_REPOS = len(REPO_URLS)
|
||
def check_number_repos(num_repos=None, rate_limit=None):
    """Abort the script when there are more repositories than allowed requests.

    Args:
        num_repos: Number of repositories about to be queried. Defaults to
            the module-level ``NUM_REPOS``.
        rate_limit: Maximum number of requests allowed per hour. Defaults to
            the module-level ``GITHUB_API_RATE_LIMIT``.

    Raises:
        SystemExit: With a non-zero status and an explanatory message when
            ``num_repos`` exceeds ``rate_limit``.
    """
    import sys  # local import keeps this block independent of the file header

    if num_repos is None:
        num_repos = NUM_REPOS
    if rate_limit is None:
        rate_limit = GITHUB_API_RATE_LIMIT

    if num_repos > rate_limit:
        # Passing a string to sys.exit prints it to stderr and exits with
        # status 1, so shells and CI see the run as failed. The previous
        # exit(0) reported success even though nothing was analysed.
        sys.exit(
            f'You can only make {rate_limit} requests per hour. '
            f'Your file has {num_repos} unique repositories. Exiting.'
        )
|
||
def get_diff_month(d1, d2):
    """Return the number of calendar months from ``d2`` to ``d1``.

    Only the ``year`` and ``month`` attributes are read; days and times are
    ignored. The result is negative when ``d1`` lies before ``d2``.
    """
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return years_apart * 12 + months_apart
|
||
def get_repo_object(repo_url):
    """Fetch the PyGithub repository object for a GitHub project URL.

    Args:
        repo_url: URL containing ``github.com/<owner>/<repo>``. A trailing
            slash, a ``.git`` suffix, a query string/fragment, or deeper path
            segments such as ``/tree/main`` are tolerated.

    Returns:
        Whatever ``Github.get_repo`` returns for ``<owner>/<repo>``, or
        ``None`` when the URL cannot be parsed or the lookup fails. Failures
        are printed rather than raised so one bad row does not abort the
        whole scan.
    """
    # Non-string values (e.g. NaN from an empty CSV cell) and non-GitHub URLs
    # cannot be looked up; report them clearly instead of via a stray
    # AttributeError/IndexError message.
    if not isinstance(repo_url, str) or 'github.com/' not in repo_url:
        print(f'Skipping {repo_url!r}: not a GitHub repository URL')
        return None

    # Reduce the path to exactly "<owner>/<repo>", the form get_repo expects.
    path = repo_url.split('github.com/', 1)[1]
    path = path.split('?', 1)[0].split('#', 1)[0]
    segments = [segment for segment in path.split('/') if segment]
    if len(segments) < 2:
        print(f'Skipping {repo_url!r}: expected github.com/<owner>/<repo>')
        return None
    owner, name = segments[0], segments[1]
    if name.endswith('.git'):
        name = name[:-len('.git')]

    try:
        return Github(GITHUB_ACCESS_TOKEN).get_repo(f'{owner}/{name}')
    except Exception as e:
        # Deliberately best-effort: a missing/renamed repository or an API
        # error should only skip this row. Include the URL for diagnosis.
        print(f'{repo_url}: {e}')
        return None
|
||
def get_months_since_last_commit(repo):
    """Return how many calendar months ago the default branch last changed.

    Args:
        repo: Repository object as returned by ``get_repo_object``, or
            ``None`` when that lookup failed.

    Returns:
        The calendar-month difference between the current UTC time and the
        author date of the default branch's head commit, or ``None`` when
        ``repo`` is ``None`` or the lookup fails (the error is printed).
    """
    if repo is None:
        # The failed lookup was already reported when the repo was fetched;
        # skip quietly, matching the None handling of the STARS column.
        return None
    try:
        default_branch = repo.get_branch(repo.default_branch)
        latest_commit_date = default_branch.commit.commit.author.date
        # Use the current UTC time rather than local time, since GitHub
        # reports commit timestamps in UTC. get_diff_month reads only
        # .year/.month, so naive and aware datetimes can be mixed safely.
        # NOTE(review): this is the author date; the committer date may
        # reflect better when the commit actually landed - confirm intent.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        return get_diff_month(now_utc, latest_commit_date)
    except Exception as e:
        # Best-effort: report the failure and leave this row empty.
        print(e)
        return None
|
||
def get_maintained_repos():
    """Query every repository and write the activity/popularity report.

    Runs the rate-limit pre-flight check, looks up each URL in ``REPO_URLS``,
    records the months since the latest default-branch commit and the star
    count, sorts by months (ascending) and then stars (descending), and
    writes the result to ``repo-info/repo-info.csv`` under the current
    working directory.
    """
    check_number_repos()
    print(f'Analyzing {NUM_REPOS} repositories...')

    # Make sure the destination exists before spending minutes on API calls;
    # otherwise a missing folder would only surface at the final write and
    # discard the whole scan.
    output_dir = os.path.join(os.getcwd(), 'repo-info')
    os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame()
    df['REPO_URL'] = REPO_URLS
    df['REPO_OBJECT'] = df['REPO_URL'].progress_apply(get_repo_object)
    df['MONTHS_SINCE_LAST_COMMIT'] = df['REPO_OBJECT'].progress_apply(get_months_since_last_commit)
    # Repositories that could not be fetched are None and get no star count.
    df['STARS'] = df['REPO_OBJECT'].progress_apply(
        lambda repo_object: repo_object.stargazers_count if repo_object is not None else None
    )

    # The API objects are only intermediate values and are dropped before
    # the report is written.
    df = df.sort_values(
        by=['MONTHS_SINCE_LAST_COMMIT', 'STARS'],
        ascending=[True, False],
    ).drop(columns=['REPO_OBJECT', 'Unnamed: 0'], errors='ignore')
    df.to_csv(os.path.join(output_dir, 'repo-info.csv'), index=False)


if __name__ == '__main__':
    get_maintained_repos()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pandas==1.5.2 | ||
PyGithub==1.57 | ||
tqdm==4.64.1 |