modin-project · gshimansky · Dec 3, 2020 · Nov 28, 2020 · Dec 1, 2020 · Dec 1, 2020
@@ -297,6 +297,62 @@ jobs:
       - shell: bash -l {0}
         run: bash <(curl -s https://codecov.io/bash)
 
+  # Validates the asv benchmark suite and, when the change touches asv_bench/,
+  # runs it in quick mode; the benchmark log is uploaded if the job fails.
+  test-asv-benchmarks:
+    needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
+    runs-on: ubuntu-latest
+    env:
+      MODIN_ENGINE: ray
+      MODIN_MEMORY: 1000000000
+      # NOTE(review): the benchmarks read this through modin.config.TestDatasetSize
+      # and compare the result to "Big"; confirm that this is the variable name
+      # that config actually reads (the other settings here use a MODIN_ prefix).
+      TestDatasetSize: small
+    name: test-asv-benchmarks
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+      - name: Cache pip
+        uses: actions/cache@v1
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          activate-environment: modin
+          environment-file: environment.yml
+          # Quoted so the version stays a string (an unquoted 3.10 would load as 3.1).
+          python-version: "3.7"
+          channel-priority: strict
+          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+      - name: Conda environment
+        shell: bash -l {0}
+        run: |
+          conda info
+          conda list
+
+      - name: Running benchmarks
+        shell: bash -l {0}
+        run: |
+          # A custom shell template runs without GitHub's default errexit, so
+          # enable it here; otherwise a failing install or `asv check` is ignored
+          # and only the last command decides the step result.
+          # pipefail is intentionally left off: `grep -q` below may close the
+          # pipe early, which would make the `git diff` pipeline look failed.
+          set -e
+          pip install -e .
+          cd asv_bench
+          asv check -E existing
+          git remote add upstream https://github.com/modin-project/modin.git
+          git fetch upstream
+          # Run the benchmarks only when files under asv_bench/ differ from upstream/master.
+          if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
+              asv machine --yes
+              # Prefix lines ending in "failed" with ##[error] and keep a copy of the output.
+              asv run --quick --show-stderr --python=same --launch-method=spawn | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log
+              # The pipeline status is tee's, so detect failures from the log itself.
+              if grep -q "failed" benchmarks.log; then
+                  exit 1
+              fi
+          else
+              echo "Benchmarks did not run, no changes detected"
+          fi
+        if: always()
+
+      - name: Publish benchmarks artifact
+        # Pinned to a release tag instead of the moving master branch.
+        uses: actions/upload-artifact@v2
+        with:
+          name: Benchmarks log
+          path: asv_bench/benchmarks.log
+        if: failure()
+
   test-all:
     needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
     runs-on: ubuntu-latest

@@ -172,3 +172,7 @@ cscope.out
 # Dask workspace
 dask-worker-space/
 node_modules
+
+# asv (airspeed velocity) benchmark working directories
+asv_bench/.asv/
+asv_bench/modin/
@@ -0,0 +1,159 @@
+{
+    // The version of the config file format.  Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "modin",
+
+    // The project's homepage
+    "project_url": "https://modin.readthedocs.io/",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": "..",
+
+    // The Python project's subdirectory in your repo.  If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+
+    // Customizable commands for building, installing, and
+    // uninstalling the project. See asv.conf.json documentation.
+    //
+    // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
+    // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
+    // "build_command": [
+    //     "python setup.py build",
+    //     "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
+    // ],
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    // "branches": ["master"], // for git
+    // "branches": ["default"],    // for mercurial
+
+    // The DVCS being used.  If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    // "dvcs": "git",
+
+    // The tool to use to create environments.  May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "conda",
+
+    // Timeout in seconds for installing any dependencies in the environment.
+    // Defaults to 10 minutes.
+    //"install_timeout": 600,
+
+    // The base URL used to link to a commit of the project.
+    "show_commit_url": "https://github.com/modin-project/modin/commit/",
+
+    // The Pythons you'd like to test against.  If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    // "pythons": ["3.7"],
+
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    "conda_channels": ["conda-forge", "defaults"],
+
+    // The matrix of dependencies to test.  Each key is the name of a
+    // package (in PyPI) and the values are version numbers.  An empty
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is not to be
+    // installed. If the package to be tested is only available from
+    // PyPI, and the 'environment_type' is conda, then you can prefix
+    // the package name with 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    "matrix": {
+        "pandas": ["1.1.4"],
+        "packaging": [""],
+        "pip+ray": ["1.0.1"],
+        "pyarrow": ["1.0"]
+    },
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux' ('linux2' on Python 2), 'win32', 'cygwin', 'darwin'.
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "numpy": "1.8"},
+    //     // additional env if run on windows+conda
+    //     {"sys_platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
+    // ],
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in.  If not provided, defaults to "benchmarks"
+    // "benchmark_dir": "benchmarks",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in.  If not provided, defaults to "env"
+    "env_dir": ".asv/env",
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in.  If not provided, defaults to "results".
+    "results_dir": ".asv/results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to.  If not provided, defaults to "html".
+    "html_dir": ".asv/html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache results of the recent builds in each
+    // environment, making them faster to install next time.  This is
+    // the number of builds to keep, per environment.
+    // "build_cache_size": 2,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // },
+
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds.  If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //    "some_benchmark": 0.01,     // Threshold of 1%
+    //    "another_benchmark": 0.5,   // Threshold of 50%
+    // },
+}
@@ -0,0 +1,14 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.  The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+"""Modin benchmarks"""
@@ -0,0 +1,136 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.  The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import modin.pandas as pd
+from modin.config import TestDatasetSize
+from .utils import generate_dataframe, RAND_LOW, RAND_HIGH
+
+pd.DEFAULT_NPARTITIONS = 4
+
+if TestDatasetSize.get() == "Big":
+    MERGE_DATA_SIZE = [
+        (5000, 5000, 5000, 5000),
+        (10, 1_000_000, 10, 1_000_000),
+        (1_000_000, 10, 1_000_000, 10),
+    ]
+    GROUPBY_DATA_SIZE = [
+        (5000, 5000),
+        (10, 1_000_000),
+        (1_000_000, 10),
+    ]
+else:
+    MERGE_DATA_SIZE = [
+        (2000, 100, 2000, 100),
+    ]
+    GROUPBY_DATA_SIZE = [
+        (2000, 100),
+    ]
+
+JOIN_DATA_SIZE = MERGE_DATA_SIZE
+ARITHMETIC_DATA_SIZE = GROUPBY_DATA_SIZE
+
+
+class TimeGroupBy:
+    param_names = ["impl", "data_type", "data_size"]
+    params = [
+        ["modin", "pandas"],
+        ["int"],
+        GROUPBY_DATA_SIZE,
+    ]
+
+    def setup(self, impl, data_type, data_size):
+        self.df = generate_dataframe(
+            impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
+        )
+
+    def time_groupby_sum(self, impl, data_type, data_size):
+        self.df.groupby(by=self.df.columns[0]).sum()
+
+    def time_groupby_mean(self, impl, data_type, data_size):
+        self.df.groupby(by=self.df.columns[0]).mean()
+
+    def time_groupby_count(self, impl, data_type, data_size):
+        self.df.groupby(by=self.df.columns[0]).count()
+
+
+class TimeJoin:
+    param_names = ["impl", "data_type", "data_size", "how", "sort"]
+    params = [
+        ["modin", "pandas"],
+        ["int"],
+        JOIN_DATA_SIZE,
+        ["left", "right", "outer", "inner"],
+        [False, True],
+    ]
+
+    def setup(self, impl, data_type, data_size, how, sort):
+        self.df1 = generate_dataframe(
+            impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
+        )
+        self.df2 = generate_dataframe(
+            impl, data_type, data_size[2], data_size[3], RAND_LOW, RAND_HIGH
+        )
+
+    def time_join(self, impl, data_type, data_size, how, sort):
+        self.df1.join(
+            self.df2, on=self.df1.columns[0], how=how, lsuffix="left_", sort=sort
+        )
+
+
+class TimeMerge:
+    param_names = ["impl", "data_type", "data_size", "how", "sort"]
+    params = [
+        ["modin", "pandas"],
+        ["int"],
+        MERGE_DATA_SIZE,
+        ["left", "right", "outer", "inner"],
+        [False, True],
+    ]
+
+    def setup(self, impl, data_type, data_size, how, sort):
+        self.df1 = generate_dataframe(
+            impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
+        )
+        self.df2 = generate_dataframe(
+            impl, data_type, data_size[2], data_size[3], RAND_LOW, RAND_HIGH
+        )
+
+    def time_merge(self, impl, data_type, data_size, how, sort):
+        self.df1.merge(self.df2, on=self.df1.columns[0], how=how, sort=sort)
+
+
+class TimeArithmetic:
+    param_names = ["impl", "data_type", "data_size", "axis"]
+    params = [
+        ["modin", "pandas"],
+        ["int"],
+        ARITHMETIC_DATA_SIZE,
+        [0, 1],
+    ]
+
+    def setup(self, impl, data_type, data_size, axis):
+        self.df = generate_dataframe(
+            impl, data_type, data_size[0], data_size[1], RAND_LOW, RAND_HIGH
+        )
+
+    def time_sum(self, impl, data_type, data_size, axis):
+        self.df.sum(axis=axis)
+
+    def time_median(self, impl, data_type, data_size, axis):
+        self.df.median(axis=axis)
+
+    def time_nunique(self, impl, data_type, data_size, axis):
+        self.df.nunique(axis=axis)
+
+    def time_apply(self, impl, data_type, data_size, axis):
+        self.df.apply(lambda df: df.sum(), axis=axis)