Add an optional Tranco top-sites sample to the demo script.

Summary of this patch:
  * .gitignore: ignore the .tranco/ directory used as the list cache.
  * README.md: document the new "demo.py --tranco" flag.
  * demo.py: parse a --tranco flag; when set, crawl the top 10 sites of the
    latest Tranco list instead of the three hard-coded URLs. NUM_BROWSERS is
    now assigned next to the comment that describes it and is raised to 2.
  * environment.yaml: bump pinned versions and pin tranco==0.6.
  * scripts/environment-unpinned.yaml: add tranco to the pip dependencies.

Review note: the add_argument(...) call in demo.py no longer ends with a
stray trailing comma (it turned the statement into a discarded tuple).

diff --git a/.gitignore b/.gitignore
index 9e7df469c..b9bf1801b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Tranco list cache directory
+.tranco/
+
 # Docker volume
 docker-volume/
 
diff --git a/README.md b/README.md
index c6f020c06..dc3c4e0d8 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,11 @@ Once installed, it is very easy to run a quick test of OpenWPM. Check out
 `openwpm/config.py::BrowserParams`, with the exception of the changes
 specified in `demo.py`.
 
+The demo script also includes a sample of how to use the
+[Tranco](https://tranco-list.eu/) top sites list via the optional command line
+flag `demo.py --tranco`. Note that since this is a real top sites list it will
+include NSFW websites, some of which will be highly ranked.
+
 More information on the instrumentation and configuration parameters is given
 below.
 
diff --git a/demo.py b/demo.py
index a1517ea46..e284d46e6 100644
--- a/demo.py
+++ b/demo.py
@@ -1,5 +1,8 @@
+import argparse
 from pathlib import Path
 
+import tranco
+
 from custom_command import LinkCountingCommand
 from openwpm.command_sequence import CommandSequence
 from openwpm.commands.browser_commands import GetCommand
@@ -7,17 +10,26 @@
 from openwpm.storage.sql_provider import SQLiteStorageProvider
 from openwpm.task_manager import TaskManager
 
-# The list of sites that we wish to crawl
-NUM_BROWSERS = 1
-sites = [
-    "http://www.example.com",
-    "http://www.princeton.edu",
-    "http://citp.princeton.edu/",
-]
+parser = argparse.ArgumentParser()
+parser.add_argument("--tranco", action="store_true", default=False)
+args = parser.parse_args()
+
+if args.tranco:
+    # Load the latest tranco list. See https://tranco-list.eu/
+    print("Loading tranco top sites list...")
+    t = tranco.Tranco(cache=True, cache_dir=".tranco")
+    latest_list = t.list()
+    sites = ["http://" + x for x in latest_list.top(10)]
+else:
+    sites = [
+        "http://www.example.com",
+        "http://www.princeton.edu",
+        "http://citp.princeton.edu/",
+    ]
 
 # Loads the default ManagerParams
 # and NUM_BROWSERS copies of the default BrowserParams
-
+NUM_BROWSERS = 2
 manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
 browser_params = [BrowserParams(display_mode="native") for _ in range(NUM_BROWSERS)]
 
diff --git a/environment.yaml b/environment.yaml
index f4c4538fe..4ede500f3 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -3,46 +3,47 @@ channels:
 - main
 dependencies:
 - beautifulsoup4=4.11.1
-- black=22.8.0
+- black=22.10.0
 - click=8.1.3
-- codecov=2.1.11
-- dill=0.3.5.1
+- codecov=2.1.12
+- dill=0.3.6
 - easyprocess=1.1
-- gcsfs=2022.8.2
-- geckodriver=0.30.0
-- ipython=8.5.0
+- gcsfs=2022.11.0
+- geckodriver=0.32.0
+- ipython=8.6.0
 - isort=5.10.1
 - leveldb=1.23
-- multiprocess=0.70.13
-- mypy=0.982
-- nodejs=18.10.0
-- pandas=1.5.0
+- multiprocess=0.70.14
+- mypy=0.991
+- nodejs=18.12.1
+- pandas=1.5.1
 - pillow=9.2.0
-- pip=22.2.2
+- pip=22.3.1
 - pre-commit=2.20.0
-- psutil=5.9.2
+- psutil=5.9.4
 - pyarrow=9.0.0
-- pytest-asyncio=0.19.0
+- pytest-asyncio=0.20.2
 - pytest-cov=4.0.0
-- pytest=7.1.3
-- python=3.10.6
+- pytest=7.2.0
+- python=3.11.0
 - pyvirtualdisplay=3.0
 - recommonmark=0.7.1
 - redis-py=4.3.4
-- s3fs=2022.8.2
-- selenium=4.5.0
-- sentry-sdk=1.9.10
+- s3fs=2022.11.0
+- selenium=4.6.0
+- sentry-sdk=1.11.0
 - sphinx-markdown-tables=0.0.17
-- sphinx=5.2.3
+- sphinx=5.3.0
 - tabulate=0.9.0
 - tblib=1.7.0
 - wget=1.20.3
 - pip:
   - dataclasses-json==0.5.7
   - domain-utils==0.7.1
-  - jsonschema==4.16.0
-  - plyvel==1.4.0
-  - types-pyyaml==6.0.12
-  - types-redis==4.3.21.1
+  - jsonschema==4.17.0
+  - plyvel==1.5.0
+  - tranco==0.6
+  - types-pyyaml==6.0.12.2
+  - types-redis==4.3.21.4
   - types-tabulate==0.9.0.0
 name: openwpm
diff --git a/scripts/environment-unpinned.yaml b/scripts/environment-unpinned.yaml
index 303f91189..25934b66a 100644
--- a/scripts/environment-unpinned.yaml
+++ b/scripts/environment-unpinned.yaml
@@ -32,3 +32,4 @@ dependencies:
       - plyvel
       - domain-utils
       - dataclasses-json
+      - tranco