Add auxiliary scripts

ClaudiuGeorgiu · Apr 17, 2020 · de523c6 · de523c6
1 parent fc3bb76
commit de523c6
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 4 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -3,6 +3,8 @@ branch = True
 omit =
     # Omit virtualenv directory.
     venv/*
+    # Omit scripts directory.
+    scripts/*
     # Omit test directory.
     test/*
     # Omit protobuf generated files.

diff --git a/.gitignore b/.gitignore
@@ -135,3 +135,4 @@ ENV/
 # Private files
 private_*
 Downloads/
+scripts/*.txt
diff --git a/.idea/webResources.xml b/.idea/webResources.xml
diff --git a/scripts/crawl_apps_by_developers.py b/scripts/crawl_apps_by_developers.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+import os
+
+from playstore.playstore import Playstore
+
+
+def main():
+    # Use the private credentials for this script.
+    api = Playstore(
+        os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            os.path.pardir,
+            "private_credentials.json",
+        )
+    )
+
+    # This list has to contain the exact developer(s) name(s).
+    developer_list = ["Spotify Ltd.", "WhatsApp Inc.", "Mozilla"]
+
+    for developer in developer_list:
+        for package_name in api.list_app_by_developer(developer):
+            # Print package name and developer name.
+            print(f"{package_name}|{developer}")
+
+
+if __name__ == "__main__":
+    # Run the script from the main directory of the project by using this command:
+    # python3 -m scripts.crawl_apps_by_developers
+    main()
diff --git a/scripts/crawl_top_apps_by_category.py b/scripts/crawl_top_apps_by_category.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import os
+from urllib.parse import urlparse, parse_qs
+
+from playstore.playstore import Playstore
+
+
+def main():
+    # Use the private credentials for this script.
+    api = Playstore(
+        os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            os.path.pardir,
+            "private_credentials.json",
+        )
+    )
+
+    # Get the categories in the Google Play Store.
+    res = api.protobuf_to_dict(api.get_store_categories())["category"]
+    store_categories = set(
+        map(lambda x: parse_qs(urlparse(x["dataUrl"]).query)["cat"][0], res)
+    )
+
+    # Get the top top_num free apps in each category.
+    top_num = 100
+    for cat in store_categories:
+        apps = api.list_app_by_category(cat, "apps_topselling_free", top_num)
+        for app in apps.doc[0].child:
+            downloads = app.details.appDetails.numDownloads
+            rating = app.aggregateRating.starRating
+
+            # Print package name, category, number of downloads and rating.
+            print(f"{app.docid}|{cat}|{downloads}|{rating}")
+
+
+if __name__ == "__main__":
+    # Run the script from the main directory of the project by using this command:
+    # python3 -m scripts.crawl_top_apps_by_category
+    main()
diff --git a/download_from_file.sh → scripts/download_from_file.sh b/download_from_file.sh → scripts/download_from_file.sh
@@ -1,8 +1,11 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # The directory containing this script.
 DIR=$(dirname "$(readlink -f "${0}")")
 
+# Set the working directory to the directory containing this script.
+cd -P -- "${DIR}" || exit
+
 USAGE="$(basename "$0") [-h] FILE
 
 Download all the Android applications whose package names are
@@ -41,7 +44,7 @@ while [[ $# -gt 0 ]]; do
 done
 
 # Read the file line by line and remove from the file the
-# packages names after they are processed.
+# package names after they are processed.
 while read -r line || [[ -n "${line}" ]]; do
 
   # Skip empty lines.
@@ -52,7 +55,10 @@ while read -r line || [[ -n "${line}" ]]; do
 
   # Try to download the Android application and keep track
   # of the package names for which the download failed.
-  if python3 download.py "${line}"; then
+
+  # TODO: adapt the command depending on your file paths.
+  # if ../venv/bin/python ../download.py -c ../private_credentials.json "${line}"; then
+  if ../venv/Scripts/python.exe ../download.py -c ../private_credentials.json "${line}"; then
     :
   else
     echo "${line}" >>"${DIR}/errors.txt"
@@ -64,6 +70,6 @@ while read -r line || [[ -n "${line}" ]]; do
   sed -i '1,1 d' "${filename}"
 
   # Don't stress the server too much.
-  sleep $((($RANDOM % 10) + 1))s
+  sleep $(((RANDOM % 10) + 1))s
 
 done <"${filename}"
diff --git a/scripts/package_crawler.sh b/scripts/package_crawler.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# The directory containing this script.
+DIR=$(dirname "$(readlink -f "${0}")")
+
+# Set the working directory to the directory containing this script.
+cd -P -- "${DIR}" || exit
+
+declare -A found_packages
+declare -A not_explored_packages
+
+# Initial list with package names to explore.
+not_explored_packages["com.whatsapp"]=1
+not_explored_packages["com.facebook.katana"]=1
+not_explored_packages["com.spotify.music"]=1
+
+# This function takes a package name string as input, visits the corresponding Google
+# Play Store page and crawls all the other package names in that page.
+get_more_packages() {
+  # Make sure to have lynx installed.
+  output="$(lynx --dump -nonumbers \
+  "https://play.google.com/store/apps/details?id=$1" \
+  | grep -oP "(?<=://play.google.com/store/apps/details\?id=)[a-zA-Z][a-zA-Z0-9_]*(\.[a-zA-Z][a-zA-Z0-9_]*)+" \
+  | sort | uniq)"
+
+  # Return all the package names found when visiting the Google Play Store page
+  # of the input package.
+  echo "${output}"
+}
+
+# Keep crawling until there are no package names left to explore.
+while [[ "${#not_explored_packages[@]}" -ne 0 ]]; do
+  # The key list is expanded once when the for loop starts, so package names
+  # queued during this pass are handled by a later pass of the outer loop.
+  for p1 in "${!not_explored_packages[@]}"; do
+    # For each package name p2 found in p1's details page in the Google Play Store...
+    while read -r p2; do
+      # Skip blank lines and package names that were already recorded.
+      if [[ -n "${p2// }" ]] && ! [[ ${found_packages["${p2}"]+_} ]]; then
+        # Package name p2 is new: queue it for exploration and record it.
+        not_explored_packages["${p2}"]=1
+        found_packages["${p2}"]=1
+        printf '%s\n' "${p2}"
+        printf '%s\n' "${p2}" >>"${DIR}/packages.txt"
+      fi
+    done <<<"$(get_more_packages "${p1}")"
+    # p1 is explored now, so drop it from the queue. The whole word is quoted
+    # so that the brackets are not treated as a glob pattern and matched
+    # against file names in the working directory.
+    unset "not_explored_packages[${p1}]"
+  done
+done
+
+echo "+++ Crawling completed! +++"