Skip to content
This repository has been archived by the owner on Jul 27, 2024. It is now read-only.

Commit

Permalink
Add auxiliary scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
ClaudiuGeorgiu committed Apr 17, 2020
1 parent fc3bb76 commit de523c6
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ branch = True
omit =
# Omit virtualenv directory.
venv/*
# Omit scripts directory.
scripts/*
# Omit test directory.
test/*
# Omit protobuf generated files.
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,4 @@ ENV/
# Private files
private_*
Downloads/
scripts/*.txt
1 change: 1 addition & 0 deletions .idea/webResources.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions scripts/crawl_apps_by_developers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3

import os

from playstore.playstore import Playstore


def main():
    """Print the applications returned by the Play Store for a list of developers.

    For every developer name in a fixed list, each package name yielded by
    ``api.list_app_by_developer`` is written to standard output on its own
    line, formatted as ``package_name|developer``.
    """
    # Use the private credentials for this script: the file is looked up in
    # the parent directory of the directory containing this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    credentials_path = os.path.join(
        script_dir, os.path.pardir, "private_credentials.json"
    )
    api = Playstore(credentials_path)

    # This list has to contain the exact developer(s) name(s).
    developer_list = ["Spotify Ltd.", "WhatsApp Inc.", "Mozilla"]

    for developer in developer_list:
        package_names = api.list_app_by_developer(developer)
        for package_name in package_names:
            # Print package name and developer name.
            print(f"{package_name}|{developer}")


if __name__ == "__main__":
    # Run the script from the main directory of the project by using this command:
    # python3 -m scripts.crawl_apps_by_developers
    main()
40 changes: 40 additions & 0 deletions scripts/crawl_top_apps_by_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

import os
from urllib.parse import urlparse, parse_qs

from playstore.playstore import Playstore


def main():
    """Print the top free applications of every Google Play Store category.

    The store categories are fetched first; then, for each category, the
    "apps_topselling_free" list is requested and one line per application is
    written to standard output, formatted as
    ``package_name|category|downloads|rating``.
    """
    # Use the private credentials for this script: the file is looked up in
    # the parent directory of the directory containing this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    credentials_path = os.path.join(
        script_dir, os.path.pardir, "private_credentials.json"
    )
    api = Playstore(credentials_path)

    # Get the categories in the Google Play Store. Each entry carries a
    # "dataUrl" whose "cat" query parameter holds the category identifier.
    category_entries = api.protobuf_to_dict(api.get_store_categories())["category"]
    store_categories = {
        parse_qs(urlparse(entry["dataUrl"]).query)["cat"][0]
        for entry in category_entries
    }

    # Get the top top_num free apps in each category.
    top_num = 100
    for cat in store_categories:
        apps = api.list_app_by_category(cat, "apps_topselling_free", top_num)
        for app in apps.doc[0].child:
            downloads = app.details.appDetails.numDownloads
            rating = app.aggregateRating.starRating

            # Print package name, category, number of downloads and rating.
            print(f"{app.docid}|{cat}|{downloads}|{rating}")


if __name__ == "__main__":
    # Run the script from the main directory of the project by using this command:
    # python3 -m scripts.crawl_top_apps_by_category
    main()
14 changes: 10 additions & 4 deletions download_from_file.sh → scripts/download_from_file.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/bin/bash
#!/usr/bin/env bash

# The directory containing this script.
DIR=$(dirname "$(readlink -f "${0}")")

# Set the working directory to the directory containing this script.
cd -P -- "${DIR}" || exit

USAGE="$(basename "$0") [-h] FILE
Download all the Android applications whose package names are
Expand Down Expand Up @@ -41,7 +44,7 @@ while [[ $# -gt 0 ]]; do
done

# Read the file line by line and remove from the file the
# packages names after they are processed.
# package names after they are processed.
while read -r line || [[ -n "${line}" ]]; do

# Skip empty lines.
Expand All @@ -52,7 +55,10 @@ while read -r line || [[ -n "${line}" ]]; do

# Try to download the Android application and keep track
# of the package names for which the download failed.
if python3 download.py "${line}"; then

# TODO: adapt the command depending on your file paths.
# if ../venv/bin/python ../download.py -c ../private_credentials.json "${line}"; then
if ../venv/Scripts/python.exe ../download.py -c ../private_credentials.json "${line}"; then
:
else
echo "${line}" >>"${DIR}/errors.txt"
Expand All @@ -64,6 +70,6 @@ while read -r line || [[ -n "${line}" ]]; do
sed -i '1,1 d' "${filename}"

# Don't stress the server too much.
sleep $((($RANDOM % 10) + 1))s
sleep $(((RANDOM % 10) + 1))s

done <"${filename}"
50 changes: 50 additions & 0 deletions scripts/package_crawler.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash

# Absolute path (symbolic links resolved) of the directory containing this script.
DIR="$(dirname "$(readlink -f "$0")")"

# Work from the directory containing this script; give up if it is not reachable.
cd -P -- "${DIR}" || exit

# Package names already reported (the keys are the package names).
declare -A found_packages=()

# Package names whose Google Play Store page still has to be visited,
# initialized with the package names the exploration starts from.
declare -A not_explored_packages=(
    ["com.whatsapp"]=1
    ["com.facebook.katana"]=1
    ["com.spotify.music"]=1
)

#######################################
# Visit the Google Play Store details page of a package and collect every
# package name referenced by a details link in that page.
# Arguments: $1 - package name whose details page is visited
# Outputs:   the package names found, sorted and without duplicates, one per
#            line on stdout (a single empty line when nothing is found)
# Requires:  lynx, and a grep supporting Perl-compatible expressions (-P)
#######################################
get_more_packages() {
    # Keep the intermediate result out of the global scope.
    local output

    # Make sure to have lynx installed.
    # The lookbehind keeps only the value of the "id" parameter of details
    # links; a package name needs at least two dot-separated segments.
    output="$(lynx --dump -nonumbers \
        "https://play.google.com/store/apps/details?id=$1" \
        | grep -oP '(?<=://play\.google\.com/store/apps/details\?id=)[a-zA-Z][a-zA-Z0-9_]*(\.[a-zA-Z][a-zA-Z0-9_]*)+' \
        | sort -u)"

    # Return all the package names found when visiting the Google Play Store page
    # of the input package.
    printf '%s\n' "${output}"
}

#######################################
# Explore package names until none is left to visit: every package name found
# for the first time is printed, appended to packages.txt and queued for
# exploration.
# Globals:   not_explored_packages (read and written), found_packages (read
#            and written), DIR (read)
# Outputs:   each newly found package name on stdout and in ${DIR}/packages.txt
#######################################
crawl_packages() {
    local p1 p2

    # While there are package names to explore...
    while [[ "${#not_explored_packages[@]}" -ne 0 ]]; do
        # For each package name p1 not yet explored (the list of keys is
        # expanded once, so names queued below are handled in the next pass)...
        for p1 in "${!not_explored_packages[@]}"; do
            # For each package name p2 found in p1's details page in the Google Play Store...
            while read -r p2; do
                if [[ -n "${p2// }" ]] && ! [[ ${found_packages["${p2}"]+_} ]]; then
                    # Package name p2 was not explored yet.
                    not_explored_packages["${p2}"]=1
                    found_packages["${p2}"]=1
                    printf '%s\n' "${p2}"
                    printf '%s\n' "${p2}" >>"${DIR}/packages.txt"
                fi
            done <<<"$(get_more_packages "${p1}")"
            # This package name is now explored, now it can be removed from the list.
            # The argument is quoted so that the subscript is never treated as
            # a glob pattern matching a file in the working directory.
            unset "not_explored_packages[${p1}]"
        done
    done
}

crawl_packages

echo "+++ Crawling completed! +++"

0 comments on commit de523c6

Please sign in to comment.