-
Notifications
You must be signed in to change notification settings - Fork 310
/
get_top_pypi.py
executable file
·110 lines (93 loc) · 3.58 KB
/
get_top_pypi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
import os, json
import argparse
from bs4 import BeautifulSoup
from ghapi.core import GhApi
from selenium import webdriver
from selenium.webdriver.common.by import By
# A GitHub token is required for the stars/issue lookups performed below.
gh_token = os.environ.get("GITHUB_TOKEN")
if not gh_token:
    msg = "Please set the GITHUB_TOKEN environment variable."
    raise ValueError(msg)
# BUG FIX: pass the token *variable*, not the literal string "gh_token" —
# the original created an unauthenticated/invalid client despite the check.
api = GhApi(token=gh_token)
def get_package_stats(data_tasks, f):
    """
    Scrape per-package stats from each PyPI project page and write them,
    one JSON object per line, to the output file.

    For every package anchor in ``data_tasks`` this visits the PyPI page
    with the module-level selenium ``driver``, searches the sidebar tabs
    for a GitHub link, and (when one is found) queries the GitHub API via
    the module-level ``api`` client for the star count and the most recent
    issue/PR number.

    Args:
        data_tasks (list): Package anchor elements (BeautifulSoup tags)
            carrying ``title`` (package name) and ``href`` (PyPI URL).
        f (str): Path of the JSONL output file. If it already exists,
            results are appended and packages whose URL is already present
            are skipped, giving simple resume-after-interrupt support.
    """
    # Adjust access type if file already exists so reruns resume instead
    # of clobbering previous results. (Redundant fp_.close() removed —
    # the `with` block already closes the file.)
    content = None
    access_type = "w"
    if os.path.exists(f):
        with open(f) as fp_:
            content = fp_.read()
        access_type = "a"
    # Extract package title, pypi URL, stars, pulls, and github URL
    with open(f, access_type) as fp_:
        for idx, chunk in enumerate(data_tasks):
            # Get package name and pypi URL straight off the anchor tag
            package_name = chunk["title"]
            package_url = chunk["href"]
            if content is not None and package_url in content:
                continue  # already processed on a previous run
            # Get github URL from the PyPI page's sidebar tabs
            package_github = None
            driver.get(package_url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            for link in soup.find_all("a", class_="vertical-tabs__tab--with-icon"):
                found = False
                for x in ["Source", "Code", "Homepage"]:
                    if (
                        x.lower() in link.get_text().lower()
                        and "github" in link["href"].lower()
                    ):
                        package_github = link["href"]
                        found = True
                        break
                if found:
                    break
            # Get stars and pulls from github API
            stars_count, pulls_count = None, None
            if package_github is not None:
                # rstrip guards against URLs with a trailing slash, which
                # would otherwise make the last path component empty.
                repo_parts = package_github.rstrip("/").split("/")[-2:]
                owner, name = repo_parts[0], repo_parts[1]
                try:
                    repo = api.repos.get(owner, name)
                    stars_count = int(repo["stargazers_count"])
                    issues = api.issues.list_for_repo(owner, name)
                    # Most recent issue/PR *number*, used as a proxy for
                    # the total issue+PR count.
                    pulls_count = int(issues[0]["number"])
                except Exception:
                    # Best effort: repo may be private, renamed, or rate
                    # limited; leave stats as None rather than aborting.
                    # (Narrowed from a bare `except:` so KeyboardInterrupt
                    # and SystemExit still propagate.)
                    pass
            # Write one JSON record per package
            print(
                json.dumps(
                    {
                        "rank": idx,
                        "name": package_name,
                        "url": package_url,
                        "github": package_github,
                        "stars": stars_count,
                        "pulls": pulls_count,
                    }
                ),
                file=fp_,
                flush=True,
            )
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max-repos", help="Maximum number of repos to get", type=int, default=5000)
    args = parser.parse_args()
    # Start selenium driver to get top pypi packages page
    url_top_pypi = "https://hugovk.github.io/top-pypi-packages/"
    driver = webdriver.Chrome()
    try:
        driver.get(url_top_pypi)
        # Click the "show 8000" button so all packages are present in the DOM
        button = driver.find_element(By.CSS_SELECTOR, 'button[ng-click="show(8000)"]')
        button.click()
        # Retrieve HTML for packages from the expanded page
        soup = BeautifulSoup(driver.page_source, "html.parser")
        package_list = soup.find("div", {"class": "list"})
        packages = package_list.find_all("a", class_="ng-scope")
        get_package_stats(packages[:args.max_repos], "pypi_rankings.jsonl")
    finally:
        # Resource-leak fix: always shut the browser down, even when
        # scraping raises, so Chrome processes are not left behind.
        driver.quit()