Skip to content

Commit

Permalink
FEAT-#2479: integrate asv (#2484)
Browse files Browse the repository at this point in the history
* FEAT-#2479: integrate asv

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FEAT-#2479: add merge pytest-benchmark in asv style

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FEAT-#2479: add CI job for check asv benchmarks

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FEAT-#2479: increase verbosity

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FEAT-#2479: use launch-method=spawn

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FEAT-#2479: add CpuCount usage to control number of partitions

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FEAT-#2479: change: TestDatasetSize -> MODIN_TEST_DATASET_SIZE

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Dec 3, 2020
1 parent db794e0 commit 7458746
Show file tree
Hide file tree
Showing 8 changed files with 468 additions and 0 deletions.
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,62 @@ jobs:
- shell: bash -l {0}
run: bash <(curl -s https://codecov.io/bash)

# Smoke-run the asv benchmark suite when a PR touches asv_bench/,
# failing the job if any benchmark errors out.
test-asv-benchmarks:
  needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
  runs-on: ubuntu-latest
  env:
    MODIN_ENGINE: ray
    MODIN_MEMORY: 1000000000
    # Keep dataset shapes small so the quick run finishes in CI time.
    MODIN_TEST_DATASET_SIZE: small
  name: test-asv-benchmarks
  steps:
    - uses: actions/checkout@v2
      with:
        fetch-depth: 1
    - name: Cache pip
      uses: actions/cache@v1
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
    - uses: conda-incubator/setup-miniconda@v2
      with:
        activate-environment: modin
        environment-file: environment.yml
        python-version: 3.7
        channel-priority: strict
        use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
    - name: Conda environment
      shell: bash -l {0}
      run: |
        conda info
        conda list
    - name: Running benchmarks
      shell: bash -l {0}
      # NOTE(review): `if: always()` runs this step even when an earlier step
      # failed — confirm that is intended rather than the default `success()`.
      if: always()
      run: |
        pip install -e .
        cd asv_bench
        asv check -E existing
        git remote add upstream https://github.com/modin-project/modin.git
        git fetch upstream
        if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
          asv machine --yes
          # Prefix failed benchmark lines so GitHub renders them as errors,
          # and keep the full log for the artifact step below.
          asv run --quick --show-stderr --python=same --launch-method=spawn | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log
          if grep "failed" benchmarks.log > /dev/null ; then
            exit 1
          fi
        else
          echo "Benchmarks did not run, no changes detected"
        fi
    - name: Publish benchmarks artifact
      uses: actions/upload-artifact@master
      # Upload the log only when the benchmark step failed.
      if: failure()
      with:
        name: Benchmarks log
        path: asv_bench/benchmarks.log

test-all:
needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
runs-on: ubuntu-latest
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,7 @@ cscope.out
# Dask workspace
dask-worker-space/
node_modules

# asv benchmark artifacts (environments/results and the synced repo copy)
asv_bench/.asv/
asv_bench/modin/
159 changes: 159 additions & 0 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,

// The name of the project being benchmarked
"project": "modin",

// The project's homepage
"project_url": "https://modin.readthedocs.io/",

// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",

// The Python project's subdirectory in your repo. If missing or
// the empty string, the project is assumed to be located at the root
// of the repository.
// "repo_subdir": "",

// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
//
// "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
// "build_command": [
// "python setup.py build",
// "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
// ],

// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
// "branches": ["master"], // for git
// "branches": ["default"], // for mercurial

// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
// "dvcs": "git",

// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",

// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
//"install_timeout": 600,

// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",

// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["3.7"],

// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge", "defaults"],

// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
// list or empty string indicates to just test against the default
// (latest) version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPI, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed via
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
"matrix": {
"pandas": ["1.1.4"],
"packaging": [""],
"pip+ray": ["1.0.1"],
"pyarrow": ["1.0"]
},
// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "six": null}, // don't run without six on conda
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "numpy": "1.8"},
// // additional env if run on windows+conda
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
// ],

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
// "benchmark_dir": "benchmarks",

// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",

// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",

// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html",

// The number of characters to retain in the commit hashes.
// "hash_length": 8,

// `asv` will cache results of the recent builds in each
// environment, making them faster to install next time. This is
// the number of builds to keep, per environment.
// "build_cache_size": 2,

// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// },

// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// },
}
14 changes: 14 additions & 0 deletions asv_bench/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Modin benchmarks"""
138 changes: 138 additions & 0 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import modin.pandas as pd
from modin.config import CpuCount, TestDatasetSize
from .utils import generate_dataframe, RAND_LOW, RAND_HIGH

# Mirror the engine's parallelism in the partition count: CpuCount is driven
# by the `MODIN_CPUS` env var.  NOTE(review): the env var must be set in the
# environment *before* `modin.pandas` is imported above to affect the engine
# itself — setting it after this module starts importing is too late; confirm.
pd.DEFAULT_NPARTITIONS = CpuCount.get()

# Benchmark dataset shapes, selected through the MODIN_TEST_DATASET_SIZE env
# var (read via TestDatasetSize).  Merge/join tuples are
# (rows1, cols1, rows2, cols2); groupby/arithmetic tuples are (rows, cols).
# Any value other than "Big" falls back to the small CI-friendly shapes.
if TestDatasetSize.get() == "Big":
    MERGE_DATA_SIZE = [
        (5000, 5000, 5000, 5000),
        (10, 1_000_000, 10, 1_000_000),
        (1_000_000, 10, 1_000_000, 10),
    ]
    GROUPBY_DATA_SIZE = [
        (5000, 5000),
        (10, 1_000_000),
        (1_000_000, 10),
    ]
else:
    MERGE_DATA_SIZE = [
        (2000, 100, 2000, 100),
    ]
    GROUPBY_DATA_SIZE = [
        (2000, 100),
    ]

# Join reuses the merge shapes; arithmetic reuses the groupby shapes.
JOIN_DATA_SIZE = MERGE_DATA_SIZE
ARITHMETIC_DATA_SIZE = GROUPBY_DATA_SIZE


class TimeGroupBy:
    """Time single-key groupby aggregations (sum/mean/count)."""

    param_names = ["impl", "data_type", "data_size"]
    params = [
        ["modin", "pandas"],
        ["int"],
        GROUPBY_DATA_SIZE,
    ]

    def setup(self, impl, data_type, data_size):
        # asv re-runs setup per parameter combination, so every timing
        # starts from a freshly generated frame of the requested shape.
        rows, cols = data_size[0], data_size[1]
        self.df = generate_dataframe(impl, data_type, rows, cols, RAND_LOW, RAND_HIGH)

    def time_groupby_sum(self, impl, data_type, data_size):
        key = self.df.columns[0]
        self.df.groupby(by=key).sum()

    def time_groupby_mean(self, impl, data_type, data_size):
        key = self.df.columns[0]
        self.df.groupby(by=key).mean()

    def time_groupby_count(self, impl, data_type, data_size):
        key = self.df.columns[0]
        self.df.groupby(by=key).count()


class TimeJoin:
    """Time DataFrame.join across every how/sort combination."""

    param_names = ["impl", "data_type", "data_size", "how", "sort"]
    params = [
        ["modin", "pandas"],
        ["int"],
        JOIN_DATA_SIZE,
        ["left", "right", "outer", "inner"],
        [False, True],
    ]

    def setup(self, impl, data_type, data_size, how, sort):
        # data_size packs both operand shapes: (rows1, cols1, rows2, cols2).
        left_rows, left_cols, right_rows, right_cols = data_size
        self.df1 = generate_dataframe(
            impl, data_type, left_rows, left_cols, RAND_LOW, RAND_HIGH
        )
        self.df2 = generate_dataframe(
            impl, data_type, right_rows, right_cols, RAND_LOW, RAND_HIGH
        )

    def time_join(self, impl, data_type, data_size, how, sort):
        # lsuffix disambiguates the overlapping column names of the two frames.
        join_key = self.df1.columns[0]
        self.df1.join(self.df2, on=join_key, how=how, lsuffix="left_", sort=sort)


class TimeMerge:
    """Time DataFrame.merge across every how/sort combination."""

    param_names = ["impl", "data_type", "data_size", "how", "sort"]
    params = [
        ["modin", "pandas"],
        ["int"],
        MERGE_DATA_SIZE,
        ["left", "right", "outer", "inner"],
        [False, True],
    ]

    def setup(self, impl, data_type, data_size, how, sort):
        # data_size packs both operand shapes: (rows1, cols1, rows2, cols2).
        left_rows, left_cols, right_rows, right_cols = data_size
        self.df1 = generate_dataframe(
            impl, data_type, left_rows, left_cols, RAND_LOW, RAND_HIGH
        )
        self.df2 = generate_dataframe(
            impl, data_type, right_rows, right_cols, RAND_LOW, RAND_HIGH
        )

    def time_merge(self, impl, data_type, data_size, how, sort):
        merge_key = self.df1.columns[0]
        self.df1.merge(self.df2, on=merge_key, how=how, sort=sort)


class TimeArithmetic:
    """Time axis-wise reductions and apply on a single frame."""

    param_names = ["impl", "data_type", "data_size", "axis"]
    params = [
        ["modin", "pandas"],
        ["int"],
        ARITHMETIC_DATA_SIZE,
        [0, 1],
    ]

    def setup(self, impl, data_type, data_size, axis):
        # A fresh frame per parameter combination; axis only matters at call time.
        rows, cols = data_size[0], data_size[1]
        self.df = generate_dataframe(impl, data_type, rows, cols, RAND_LOW, RAND_HIGH)

    def time_sum(self, impl, data_type, data_size, axis):
        self.df.sum(axis=axis)

    def time_median(self, impl, data_type, data_size, axis):
        self.df.median(axis=axis)

    def time_nunique(self, impl, data_type, data_size, axis):
        self.df.nunique(axis=axis)

    def time_apply(self, impl, data_type, data_size, axis):
        self.df.apply(lambda frame: frame.sum(), axis=axis)
Loading

0 comments on commit 7458746

Please sign in to comment.