Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#2479: integrate asv #2484

Merged
merged 7 commits into from
Dec 3, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,62 @@ jobs:
- shell: bash -l {0}
run: bash <(curl -s https://codecov.io/bash)

# Smoke-runs the asv benchmark suite (one quick pass) whenever a change
# touches asv_bench/, and fails the job if any benchmark reports "failed".
test-asv-benchmarks:
  needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
  runs-on: ubuntu-latest
  env:
    MODIN_ENGINE: ray
    MODIN_MEMORY: 1000000000
    # The benchmarks read the dataset size through modin.config.TestDatasetSize.
    # NOTE(review): modin.config resolves settings from MODIN_-prefixed
    # environment variables (as with MODIN_ENGINE / MODIN_MEMORY above), so the
    # bare `TestDatasetSize` name was never picked up - confirm against
    # modin/config/envvars.py.
    MODIN_TEST_DATASET_SIZE: small
  name: test-asv-benchmarks
  steps:
    - uses: actions/checkout@v2
      with:
        fetch-depth: 1
    - name: Cache pip
      uses: actions/cache@v1
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
    - uses: conda-incubator/setup-miniconda@v2
      with:
        activate-environment: modin
        environment-file: environment.yml
        python-version: 3.7
        channel-priority: strict
        use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
    - name: Conda environment
      shell: bash -l {0}
      run: |
        conda info
        conda list

    - name: Running benchmarks
      shell: bash -l {0}
      # The benchmarks only run when the diff against upstream/master touches
      # asv_bench/. The exit status of `asv run` is lost in the pipeline, so
      # failures are detected by grepping the captured log instead.
      run: |
        pip install -e .
        cd asv_bench
        asv check -E existing
        git remote add upstream https://github.com/modin-project/modin.git
        git fetch upstream
        if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
            asv machine --yes
            asv run --quick --show-stderr --python=same --launch-method=spawn | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log
            if grep "failed" benchmarks.log > /dev/null ; then
                exit 1
            fi
        else
            echo "Benchmarks did not run, no changes detected"
        fi
      if: always()

    - name: Publish benchmarks artifact
      uses: actions/upload-artifact@master
      with:
        name: Benchmarks log
        path: asv_bench/benchmarks.log
      # Upload the log only when an earlier step failed.
      if: failure()

test-all:
needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
runs-on: ubuntu-latest
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,7 @@ cscope.out
# Dask workspace
dask-worker-space/
node_modules

# Asv stuff
asv_bench/.asv/
asv_bench/modin/
159 changes: 159 additions & 0 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,

// The name of the project being benchmarked
"project": "modin",

// The project's homepage
"project_url": "https://modin.readthedocs.io/",

// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",

// The Python project's subdirectory in your repo. If missing or
// the empty string, the project is assumed to be located at the root
// of the repository.
// "repo_subdir": "",

// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
//
// "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
// "build_command": [
// "python setup.py build",
// "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
// ],

// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
// "branches": ["master"], // for git
// "branches": ["default"], // for mercurial

// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
// "dvcs": "git",

// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",

// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
//"install_timeout": 600,

// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",

// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["3.7"],

// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge", "defaults"],

// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
// list or empty string indicates to just test against the default
// (latest) version. null indicates that the package is not to be
// installed. If the package to be tested is only available from
// PyPI, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed via
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
"matrix": {
"pandas": ["1.1.4"],
"packaging": [""],
"pip+ray": ["1.0.1"],
"pyarrow": ["1.0"]
},
// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux', 'win32', 'cygwin', 'darwin'.
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "six": null}, // don't run without six on conda
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "numpy": "1.8"},
// // additional env if run on windows+conda
// {"sys_platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
// ],

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
// "benchmark_dir": "benchmarks",

// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",

// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",

// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html",

// The number of characters to retain in the commit hashes.
// "hash_length": 8,

// `asv` will cache results of the recent builds in each
// environment, making them faster to install next time. This is
// the number of builds to keep, per environment.
// "build_cache_size": 2,

// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// },

// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// },
}
14 changes: 14 additions & 0 deletions asv_bench/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Modin benchmarks"""
136 changes: 136 additions & 0 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import modin.pandas as pd
from modin.config import TestDatasetSize
from .utils import generate_dataframe, RAND_LOW, RAND_HIGH

# Set the partition count on the modin.pandas module to a fixed value for every
# benchmark defined in this file.
# NOTE(review): presumably hard-coded so results do not depend on the host's
# core count - confirm; making it configurable is still an open question.
pd.DEFAULT_NPARTITIONS = 4
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to make pd.DEFAULT_NPARTITIONS a parameter, so that the same benchmarks can be run with different settings. That way we could see how they scale with the number of CPU cores.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you.

To avoid introducing new Modin environment variables, I suppose we can use MODIN_CPUS to adjust the number of partitions. What do you think of this solution?


# Frame sizes fed to the benchmark classes below. Entries in
# GROUPBY_DATA_SIZE are pairs (one frame); entries in MERGE_DATA_SIZE are
# 4-tuples whose first and last two items size the two frames being combined.
# The large grid is used only when the TestDatasetSize config value is "Big";
# any other value selects a single small size.
if TestDatasetSize.get() != "Big":
    GROUPBY_DATA_SIZE = [
        (2000, 100),
    ]
    MERGE_DATA_SIZE = [
        (2000, 100, 2000, 100),
    ]
else:
    GROUPBY_DATA_SIZE = [
        (5000, 5000),
        (10, 1_000_000),
        (1_000_000, 10),
    ]
    MERGE_DATA_SIZE = [
        (5000, 5000, 5000, 5000),
        (10, 1_000_000, 10, 1_000_000),
        (1_000_000, 10, 1_000_000, 10),
    ]

# Arithmetic benchmarks reuse the groupby sizes; join reuses the merge sizes.
ARITHMETIC_DATA_SIZE = GROUPBY_DATA_SIZE
JOIN_DATA_SIZE = MERGE_DATA_SIZE


class TimeGroupBy:
    """Benchmarks that group a frame by its first column and aggregate.

    The frame is built in ``setup`` with ``generate_dataframe``; each
    ``time_*`` method performs one groupby followed by a single reduction.
    """

    # Parameter names line up positionally with the entries of ``params``.
    param_names = ["impl", "data_type", "data_size"]
    params = [
        ["modin", "pandas"],
        ["int"],
        GROUPBY_DATA_SIZE,
    ]

    def setup(self, impl, data_type, data_size):
        """Build ``self.df`` from the first two entries of ``data_size``."""
        # presumably (rows, columns) - confirm against utils.generate_dataframe
        dim0, dim1 = data_size[0], data_size[1]
        self.df = generate_dataframe(
            impl, data_type, dim0, dim1, RAND_LOW, RAND_HIGH
        )

    def _group_by_first_column(self):
        """Return ``self.df`` grouped by its first column."""
        return self.df.groupby(by=self.df.columns[0])

    def time_groupby_sum(self, impl, data_type, data_size):
        """Group by the first column and sum each group."""
        grouped = self._group_by_first_column()
        grouped.sum()

    def time_groupby_mean(self, impl, data_type, data_size):
        """Group by the first column and average each group."""
        grouped = self._group_by_first_column()
        grouped.mean()

    def time_groupby_count(self, impl, data_type, data_size):
        """Group by the first column and count entries per group."""
        grouped = self._group_by_first_column()
        grouped.count()


class TimeJoin:
    """Benchmark ``DataFrame.join`` across join types and sort settings.

    Two frames are built in ``setup`` with ``generate_dataframe``; the join is
    keyed on the first column of the left frame.
    """

    # Parameter names line up positionally with the entries of ``params``.
    param_names = ["impl", "data_type", "data_size", "how", "sort"]
    params = [
        ["modin", "pandas"],
        ["int"],
        JOIN_DATA_SIZE,
        ["left", "right", "outer", "inner"],
        [False, True],
    ]

    def setup(self, impl, data_type, data_size, how, sort):
        """Build both operands.

        ``data_size[0:2]`` sizes ``self.df1`` and ``data_size[2:4]`` sizes
        ``self.df2``.
        """
        left_dim0, left_dim1 = data_size[0], data_size[1]
        self.df1 = generate_dataframe(
            impl, data_type, left_dim0, left_dim1, RAND_LOW, RAND_HIGH
        )
        right_dim0, right_dim1 = data_size[2], data_size[3]
        self.df2 = generate_dataframe(
            impl, data_type, right_dim0, right_dim1, RAND_LOW, RAND_HIGH
        )

    def time_join(self, impl, data_type, data_size, how, sort):
        """Join ``self.df2`` onto ``self.df1`` using ``how`` and ``sort``."""
        join_options = {
            "on": self.df1.columns[0],
            "how": how,
            "lsuffix": "left_",
            "sort": sort,
        }
        self.df1.join(self.df2, **join_options)


class TimeMerge:
    """Benchmark ``DataFrame.merge`` across merge types and sort settings.

    Two frames are built in ``setup`` with ``generate_dataframe``; the merge is
    keyed on the first column label of the left frame.
    """

    # Parameter names line up positionally with the entries of ``params``.
    param_names = ["impl", "data_type", "data_size", "how", "sort"]
    params = [
        ["modin", "pandas"],
        ["int"],
        MERGE_DATA_SIZE,
        ["left", "right", "outer", "inner"],
        [False, True],
    ]

    def setup(self, impl, data_type, data_size, how, sort):
        """Build both operands.

        ``data_size[0:2]`` sizes ``self.df1`` and ``data_size[2:4]`` sizes
        ``self.df2``.
        """
        left_dim0, left_dim1 = data_size[0], data_size[1]
        self.df1 = generate_dataframe(
            impl, data_type, left_dim0, left_dim1, RAND_LOW, RAND_HIGH
        )
        right_dim0, right_dim1 = data_size[2], data_size[3]
        self.df2 = generate_dataframe(
            impl, data_type, right_dim0, right_dim1, RAND_LOW, RAND_HIGH
        )

    def time_merge(self, impl, data_type, data_size, how, sort):
        """Merge ``self.df1`` with ``self.df2`` using ``how`` and ``sort``."""
        merge_options = {
            "on": self.df1.columns[0],
            "how": how,
            "sort": sort,
        }
        self.df1.merge(self.df2, **merge_options)


class TimeArithmetic:
    """Benchmark whole-frame reductions and ``apply`` along either axis.

    The frame is built in ``setup`` with ``generate_dataframe``; every
    ``time_*`` method forwards the ``axis`` parameter to the operation.
    """

    # Parameter names line up positionally with the entries of ``params``.
    param_names = ["impl", "data_type", "data_size", "axis"]
    params = [
        ["modin", "pandas"],
        ["int"],
        ARITHMETIC_DATA_SIZE,
        [0, 1],
    ]

    def setup(self, impl, data_type, data_size, axis):
        """Build ``self.df`` from the first two entries of ``data_size``."""
        # presumably (rows, columns) - confirm against utils.generate_dataframe
        dim0, dim1 = data_size[0], data_size[1]
        self.df = generate_dataframe(
            impl, data_type, dim0, dim1, RAND_LOW, RAND_HIGH
        )

    def time_sum(self, impl, data_type, data_size, axis):
        """Sum the frame along ``axis``."""
        frame = self.df
        frame.sum(axis=axis)

    def time_median(self, impl, data_type, data_size, axis):
        """Take the median of the frame along ``axis``."""
        frame = self.df
        frame.median(axis=axis)

    def time_nunique(self, impl, data_type, data_size, axis):
        """Count distinct values along ``axis``."""
        frame = self.df
        frame.nunique(axis=axis)

    def time_apply(self, impl, data_type, data_size, axis):
        """Apply a summing callable along ``axis`` through ``DataFrame.apply``."""
        frame = self.df
        frame.apply(lambda chunk: chunk.sum(), axis=axis)
Loading