This repository has been archived by the owner on Jul 27, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 218
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
fc3bb76
commit de523c6
Showing
7 changed files
with
134 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -135,3 +135,4 @@ ENV/ | |
# Private files | ||
private_* | ||
Downloads/ | ||
scripts/*.txt |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import os | ||
|
||
from playstore.playstore import Playstore | ||
|
||
|
||
def main():
    """Print one ``package_name|developer`` line per app of each listed developer.

    The Play Store client is built from ``private_credentials.json``, which is
    looked up in the parent directory of the directory containing this script.
    """
    # Resolve the credentials file relative to this script, not to the
    # current working directory.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    credentials_path = os.path.join(
        script_dir, os.path.pardir, "private_credentials.json"
    )
    api = Playstore(credentials_path)

    # The names must match the developer names exactly.
    developer_list = ("Spotify Ltd.", "WhatsApp Inc.", "Mozilla")

    for developer in developer_list:
        packages = api.list_app_by_developer(developer)
        for package_name in packages:
            # Output format: <package name>|<developer name>
            print(f"{package_name}|{developer}")


if __name__ == "__main__":
    # Run the script from the main directory of the project by using this command:
    # python3 -m scripts.crawl_apps_by_developers
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import os | ||
from urllib.parse import urlparse, parse_qs | ||
|
||
from playstore.playstore import Playstore | ||
|
||
|
||
def main():
    """Print the top free apps of every Google Play Store category.

    The Play Store client is built from ``private_credentials.json``, which is
    looked up in the parent directory of the directory containing this script.
    For each store category, up to ``top_num`` entries of the
    ``apps_topselling_free`` listing are requested and one line per app is
    printed in the form ``package|category|downloads|rating``.

    Categories whose listing comes back without any document are skipped
    instead of aborting the whole crawl with an ``IndexError``.
    """
    # Use the private credentials for this script.
    api = Playstore(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            os.path.pardir,
            "private_credentials.json",
        )
    )

    # Get the categories in the Google Play Store. The category identifier is
    # the value of the "cat" query parameter of each category's "dataUrl".
    res = api.protobuf_to_dict(api.get_store_categories())["category"]
    store_categories = {
        parse_qs(urlparse(x["dataUrl"]).query)["cat"][0] for x in res
    }

    # Get the top top_num free apps in each category.
    top_num = 100
    # Iterate in sorted order so that the output is reproducible between runs
    # (the iteration order of a set of strings is not stable).
    for cat in sorted(store_categories):
        doc = api.list_app_by_category(cat, "apps_topselling_free", top_num).doc
        if not doc:
            # Nothing was returned for this category: there is no doc[0] to read.
            continue

        for app in doc[0].child:
            downloads = app.details.appDetails.numDownloads
            rating = app.aggregateRating.starRating

            # Print package name, category, number of downloads and rating.
            print(f"{app.docid}|{cat}|{downloads}|{rating}")


if __name__ == "__main__":
    # Run the script from the main directory of the project by using this command:
    # python3 -m scripts.crawl_top_apps_by_category
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Absolute path (symlinks resolved) of the directory that holds this script.
DIR="$(dirname "$(readlink -f "${0}")")"

# Work from the script's own directory, regardless of where it was invoked
# from. Stop immediately if the directory cannot be entered.
cd -P -- "${DIR}" || exit

# Package names already reported.
declare -A found_packages

# Package names whose Google Play Store page still has to be visited,
# pre-populated with the seed packages the crawl starts from.
declare -A not_explored_packages=(
    ["com.whatsapp"]=1
    ["com.facebook.katana"]=1
    ["com.spotify.music"]=1
)
|
||
# Visit the Google Play Store details page of a package and print every package
# name linked from that page.
#
# Arguments:
#   $1 - package name whose details page is visited.
# Output:
#   The package names found, sorted and de-duplicated, one per line (a single
#   empty line when nothing is found).
# Requires:
#   lynx (to fetch and dump the page) and a grep that supports -P.
get_more_packages() {
    # Keep the temporary result out of the global scope of the script.
    local output

    # Dump the page as text and keep only what follows a details link, i.e. a
    # dotted package name made of at least two segments. The dots of the host
    # name are escaped so that they match literal dots only.
    output="$(lynx --dump -nonumbers \
        "https://play.google.com/store/apps/details?id=${1}" \
        | grep -oP "(?<=://play\.google\.com/store/apps/details\?id=)[a-zA-Z][a-zA-Z0-9_]*(\.[a-zA-Z][a-zA-Z0-9_]*)+" \
        | sort -u)"

    # Return all the package names found when visiting the Google Play Store page
    # of the input package.
    printf '%s\n' "${output}"
}
|
||
# Crawl until no unexplored package name is left. Every newly discovered
# package name is printed, appended to packages.txt next to this script and
# queued so that its own page is visited in a later pass.
while [[ "${#not_explored_packages[@]}" -ne 0 ]]; do
    # For each package name p1 not yet explored (the key list is expanded once,
    # before the loop body runs, so adding keys below is safe)...
    for p1 in "${!not_explored_packages[@]}"; do
        # For each package name p2 found in p1's details page in the Google Play Store...
        while read -r p2; do
            # Skip blank lines and package names that were already reported.
            if [[ -n "${p2// }" ]] && ! [[ ${found_packages["${p2}"]+_} ]]; then
                # Package name p2 was not explored yet.
                not_explored_packages["${p2}"]=1
                found_packages["${p2}"]=1
                echo "${p2}"
                echo "${p2}" >>"${DIR}/packages.txt"
            fi
        done <<< "$(get_more_packages "${p1}")"
        # This package name is now explored, now it can be removed from the list.
        # The argument is quoted so that the shell cannot treat the brackets as
        # a glob pattern and expand it against file names.
        unset "not_explored_packages[${p1}]"
    done
done

echo "+++ Crawling completed! +++"