diff --git a/.binstar.yml b/.binstar.yml
index 6f7c2c5ba4c7a..c70add11c55b0 100644
--- a/.binstar.yml
+++ b/.binstar.yml
@@ -1,22 +1,21 @@
package: pandas
user: jreback
-platform:
- #- osx-64
- #- linux-32
- - linux-64
- - win-64
- #- win-32
-
-engine:
- #- python=2.6
- - python=2.7
- #- python=3.3
- #- python=3.4
+install:
+ - conda config --add channels pandas
before_script:
- python -V
+platform:
+ - linux-64
+ #- linux-32
+ - osx-64
+ #- win-32
+ - win-64
+engine:
+ - python=2.7
+ #- python=3.4
script:
- conda build conda.recipe --quiet
@@ -27,12 +26,3 @@ build_targets: conda
notifications:
email:
recipients: ['jeff@reback.net']
-
----
-platform: win-32
-engine: python=2.6
-exclude: true
----
-platform: win-64
-engine: python=2.6
-exclude: true
diff --git a/.gitattributes b/.gitattributes
index 0ef16e42a0660..736fa09d070fe 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -13,3 +13,4 @@
*.dta binary
*.xls binary
*.xlsx binary
+pandas/_version.py export-subst
diff --git a/.gitignore b/.gitignore
index e8b557d68ac39..d33df2df6e548 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@
.idea
.vagrant
.noseids
+.ipynb_checkpoints
# Compiled source #
###################
@@ -41,6 +42,8 @@ doc/_build
dist
# Egg metadata
*.egg-info
+.eggs
+
# tox testing tool
.tox
# rope
@@ -76,9 +79,9 @@ scikits
*.c
*.cpp
-# Things specific to this project #
-###################################
-pandas/version.py
+# Performance Testing #
+#######################
+asv_bench/
# Documentation generated files #
#################################
diff --git a/.travis.yml b/.travis.yml
index 246154310a50f..b867601ba0b96 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -86,13 +86,6 @@ matrix:
- CLIPBOARD=xsel
- BUILD_TYPE=conda
- JOB_NAME: "34_slow"
- - python: 3.2
- env:
- - NOSE_ARGS="not slow and not network and not disabled"
- - FULL_DEPS=true
- - CLIPBOARD_GUI=qt4
- - BUILD_TYPE=pydata
- - JOB_NAME: "32_nslow"
- python: 2.7
env:
- EXPERIMENTAL=true
@@ -103,13 +96,6 @@ matrix:
- BUILD_TYPE=pydata
- PANDAS_TESTING_MODE="deprecate"
allow_failures:
- - python: 3.2
- env:
- - NOSE_ARGS="not slow and not network and not disabled"
- - FULL_DEPS=true
- - CLIPBOARD_GUI=qt4
- - BUILD_TYPE=pydata
- - JOB_NAME: "32_nslow"
- python: 2.7
env:
- NOSE_ARGS="slow and not network and not disabled"
diff --git a/MANIFEST.in b/MANIFEST.in
index 69174f7f05b98..2d26fbfd6adaf 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -24,3 +24,5 @@ global-exclude *.png
# recursive-include doc/source *
# recursive-include doc/sphinxext *
# recursive-include LICENSES *
+include versioneer.py
+include pandas/_version.py
diff --git a/README.md b/README.md
index bba31fef7a939..947dfc5928249 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,45 @@
# pandas: powerful Python data analysis toolkit
-[![Build Status](https://travis-ci.org/pydata/pandas.svg?branch=master)](https://travis-ci.org/pydata/pandas)
-[![Join the chat at
-https://gitter.im/pydata/pandas](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+
+
+ Latest Release |
+ |
+
+
+ Package Status |
+ |
+
+
+ License |
+ |
+
+
+ Build Status |
+
+
+
+
+ |
+
+
+ Conda |
+
+
+
+
+ |
+
+
+ PyPI |
+
+
+
+
+ |
+
+
+
+[![https://gitter.im/pydata/pandas](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
## What is it
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
new file mode 100644
index 0000000000000..760db2086b125
--- /dev/null
+++ b/asv_bench/asv.conf.json
@@ -0,0 +1,66 @@
+{
+ // The version of the config file format. Do not change, unless
+ // you know what you are doing.
+ "version": 1,
+
+ // The name of the project being benchmarked
+ "project": "pandas",
+
+ // The project's homepage
+ "project_url": "http://pandas.pydata.org/",
+
+ // The URL of the source code repository for the project being
+ // benchmarked
+ "repo": "..",
+
+ // The tool to use to create environments. May be "conda",
+ // "virtualenv" or other value depending on the plugins in use.
+ // If missing or the empty string, the tool will be automatically
+ // determined by looking for tools on the PATH environment
+ // variable.
+ "environment_type": "conda",
+
+ // the base URL to show a commit for the project.
+ "show_commit_url": "https://github.com/pydata/pandas/commit/",
+
+ // The Pythons you'd like to test against. If not provided, defaults
+ // to the current version of Python used to run `asv`.
+ // "pythons": ["2.7", "3.4"],
+ "pythons": ["2.7"],
+
+ // The matrix of dependencies to test. Each key is the name of a
+ // package (in PyPI) and the values are version numbers. An empty
+ // list indicates to just test against the default (latest)
+ // version.
+ "matrix": {
+ // To run against multiple versions, replace with
+ // "numpy": ["1.7", "1.9"],
+ "numpy": [],
+ "Cython": [],
+ "matplotlib": [],
+ "sqlalchemy": [],
+ "scipy": [],
+ "numexpr": [],
+ "pytables": [],
+ },
+
+ // The directory (relative to the current directory) that benchmarks are
+ // stored in. If not provided, defaults to "benchmarks"
+ // "benchmark_dir": "benchmarks",
+
+ // The directory (relative to the current directory) to cache the Python
+ // environments in. If not provided, defaults to "env"
+ // "env_dir": "env",
+
+
+ // The directory (relative to the current directory) that raw benchmark
+ // results are stored in. If not provided, defaults to "results".
+ // "results_dir": "results",
+
+ // The directory (relative to the current directory) that the html tree
+ // should be written to. If not provided, defaults to "html".
+ // "html_dir": "html",
+
+ // The number of characters to retain in the commit hashes.
+ // "hash_length": 8
+}
diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py
new file mode 100644
index 0000000000000..ecb91923dc663
--- /dev/null
+++ b/asv_bench/benchmarks/attrs_caching.py
@@ -0,0 +1,23 @@
+from pandas_vb_common import *
+
+
+class getattr_dataframe_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(10, 6))
+ self.cur_index = self.df.index
+
+ def time_getattr_dataframe_index(self):
+ self.foo = self.df.index
+
+
+class setattr_dataframe_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(10, 6))
+ self.cur_index = self.df.index
+
+ def time_setattr_dataframe_index(self):
+ self.df.index = self.cur_index
\ No newline at end of file
diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
new file mode 100644
index 0000000000000..13976014ec6f1
--- /dev/null
+++ b/asv_bench/benchmarks/binary_ops.py
@@ -0,0 +1,236 @@
+from pandas_vb_common import *
+import pandas.computation.expressions as expr
+
+
+class frame_add(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+
+ def time_frame_add(self):
+ (self.df + self.df2)
+
+
+class frame_add_no_ne(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ expr.set_use_numexpr(False)
+
+ def time_frame_add_no_ne(self):
+ (self.df + self.df2)
+
+ def teardown(self):
+ expr.set_use_numexpr(True)
+
+
+class frame_add_st(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_frame_add_st(self):
+ (self.df + self.df2)
+
+ def teardown(self):
+ expr.set_numexpr_threads()
+
+
+class frame_float_div(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 1000))
+ self.df2 = DataFrame(np.random.randn(1000, 1000))
+
+ def time_frame_float_div(self):
+ (self.df // self.df2)
+
+
+class frame_float_div_by_zero(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 1000))
+
+ def time_frame_float_div_by_zero(self):
+ (self.df / 0)
+
+
+class frame_float_floor_by_zero(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 1000))
+
+ def time_frame_float_floor_by_zero(self):
+ (self.df // 0)
+
+
+class frame_float_mod(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 1000))
+ self.df2 = DataFrame(np.random.randn(1000, 1000))
+
+ def time_frame_float_mod(self):
+ (self.df / self.df2)
+
+
+class frame_int_div_by_zero(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000)))
+
+ def time_frame_int_div_by_zero(self):
+ (self.df / 0)
+
+
+class frame_int_mod(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000)))
+ self.df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000)))
+
+ def time_frame_int_mod(self):
+ (self.df / self.df2)
+
+
+class frame_mult(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+
+ def time_frame_mult(self):
+ (self.df * self.df2)
+
+
+class frame_mult_no_ne(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ expr.set_use_numexpr(False)
+
+ def time_frame_mult_no_ne(self):
+ (self.df * self.df2)
+
+ def teardown(self):
+ expr.set_use_numexpr(True)
+
+
+class frame_mult_st(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_frame_mult_st(self):
+ (self.df * self.df2)
+
+ def teardown(self):
+ expr.set_numexpr_threads()
+
+
+class frame_multi_and(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+
+ def time_frame_multi_and(self):
+ self.df[((self.df > 0) & (self.df2 > 0))]
+
+
+class frame_multi_and_no_ne(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ expr.set_use_numexpr(False)
+
+ def time_frame_multi_and_no_ne(self):
+ self.df[((self.df > 0) & (self.df2 > 0))]
+
+ def teardown(self):
+ expr.set_use_numexpr(True)
+
+
+class frame_multi_and_st(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_frame_multi_and_st(self):
+ self.df[((self.df > 0) & (self.df2 > 0))]
+
+ def teardown(self):
+ expr.set_numexpr_threads()
+
+
+class series_timestamp_compare(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.halfway = ((self.N // 2) - 1)
+ self.s = Series(date_range('20010101', periods=self.N, freq='T'))
+ self.ts = self.s[self.halfway]
+
+ def time_series_timestamp_compare(self):
+ (self.s <= self.ts)
+
+
+class timestamp_ops_diff1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.s = Series(date_range('20010101', periods=self.N, freq='s'))
+
+ def time_timestamp_ops_diff1(self):
+ self.s.diff()
+
+
+class timestamp_ops_diff2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.s = Series(date_range('20010101', periods=self.N, freq='s'))
+
+ def time_timestamp_ops_diff2(self):
+ (self.s - self.s.shift())
+
+
+class timestamp_series_compare(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.halfway = ((self.N // 2) - 1)
+ self.s = Series(date_range('20010101', periods=self.N, freq='T'))
+ self.ts = self.s[self.halfway]
+
+ def time_timestamp_series_compare(self):
+ (self.ts >= self.s)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
new file mode 100644
index 0000000000000..34caef221a340
--- /dev/null
+++ b/asv_bench/benchmarks/categoricals.py
@@ -0,0 +1,11 @@
+from pandas_vb_common import *
+
+
+class concat_categorical(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = pd.Series((list('aabbcd') * 1000000)).astype('category')
+
+ def time_concat_categorical(self):
+ concat([self.s, self.s])
\ No newline at end of file
diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py
new file mode 100644
index 0000000000000..b48211b3db83e
--- /dev/null
+++ b/asv_bench/benchmarks/ctors.py
@@ -0,0 +1,52 @@
+from pandas_vb_common import *
+
+
+class frame_constructor_ndarray(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.arr = np.random.randn(100, 100)
+
+ def time_frame_constructor_ndarray(self):
+ DataFrame(self.arr)
+
+
+class ctor_index_array_string(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.array(['foo', 'bar', 'baz'], dtype=object)
+
+ def time_ctor_index_array_string(self):
+ Index(self.data)
+
+
+class series_constructor_ndarray(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(100)
+ self.index = Index(np.arange(100))
+
+ def time_series_constructor_ndarray(self):
+ Series(self.data, index=self.index)
+
+
+class dtindex_from_series_ctor(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000))
+
+ def time_dtindex_from_series_ctor(self):
+ DatetimeIndex(self.s)
+
+
+class index_from_series_ctor(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000))
+
+ def time_index_from_series_ctor(self):
+ Index(self.s)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py
new file mode 100644
index 0000000000000..397312355aa47
--- /dev/null
+++ b/asv_bench/benchmarks/eval.py
@@ -0,0 +1,239 @@
+from pandas_vb_common import *
+import pandas.computation.expressions as expr
+import pandas as pd
+
+
+class eval_frame_add_all_threads(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_add_all_threads(self):
+ pd.eval('df + df2 + df3 + df4')
+
+
+class eval_frame_add_one_thread(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_eval_frame_add_one_thread(self):
+ pd.eval('df + df2 + df3 + df4')
+
+
+class eval_frame_add_python(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_add_python(self):
+ pd.eval('df + df2 + df3 + df4', engine='python')
+
+
+class eval_frame_add_python_one_thread(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_eval_frame_add_python_one_thread(self):
+ pd.eval('df + df2 + df3 + df4', engine='python')
+
+
+class eval_frame_and_all_threads(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_and_all_threads(self):
+ pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
+
+
+class eval_frame_and_python_one_thread(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_eval_frame_and_python_one_thread(self):
+ pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
+
+
+class eval_frame_and_python(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_and_python(self):
+ pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
+
+
+class eval_frame_chained_cmp_all_threads(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_chained_cmp_all_threads(self):
+ pd.eval('df < df2 < df3 < df4')
+
+
+class eval_frame_chained_cmp_python_one_thread(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_eval_frame_chained_cmp_python_one_thread(self):
+ pd.eval('df < df2 < df3 < df4', engine='python')
+
+
+class eval_frame_chained_cmp_python(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_chained_cmp_python(self):
+ pd.eval('df < df2 < df3 < df4', engine='python')
+
+
+class eval_frame_mult_all_threads(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_mult_all_threads(self):
+ pd.eval('df * df2 * df3 * df4')
+
+
+class eval_frame_mult_one_thread(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_eval_frame_mult_one_thread(self):
+ pd.eval('df * df2 * df3 * df4')
+
+
+class eval_frame_mult_python(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+
+ def time_eval_frame_mult_python(self):
+ pd.eval('df * df2 * df3 * df4', engine='python')
+
+
+class eval_frame_mult_python_one_thread(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(20000, 100))
+ self.df2 = DataFrame(np.random.randn(20000, 100))
+ self.df3 = DataFrame(np.random.randn(20000, 100))
+ self.df4 = DataFrame(np.random.randn(20000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_eval_frame_mult_python_one_thread(self):
+ pd.eval('df * df2 * df3 * df4', engine='python')
+
+
+class query_datetime_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.halfway = ((self.N // 2) - 1)
+ self.index = date_range('20010101', periods=self.N, freq='T')
+ self.s = Series(self.index)
+ self.ts = self.s.iloc[self.halfway]
+ self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index)
+
+ def time_query_datetime_index(self):
+ self.df.query('index < @ts')
+
+
+class query_datetime_series(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.halfway = ((self.N // 2) - 1)
+ self.index = date_range('20010101', periods=self.N, freq='T')
+ self.s = Series(self.index)
+ self.ts = self.s.iloc[self.halfway]
+ self.df = DataFrame({'dates': self.s.values, })
+
+ def time_query_datetime_series(self):
+ self.df.query('dates < @ts')
+
+
+class query_with_boolean_selection(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.halfway = ((self.N // 2) - 1)
+ self.index = date_range('20010101', periods=self.N, freq='T')
+ self.s = Series(self.index)
+ self.ts = self.s.iloc[self.halfway]
+ self.N = 1000000
+ self.df = DataFrame({'a': np.random.randn(self.N), })
+ self.min_val = self.df['a'].min()
+ self.max_val = self.df['a'].max()
+
+ def time_query_with_boolean_selection(self):
+ self.df.query('(a >= @min_val) & (a <= @max_val)')
\ No newline at end of file
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
new file mode 100644
index 0000000000000..2cb337e0e6b9d
--- /dev/null
+++ b/asv_bench/benchmarks/frame_ctor.py
@@ -0,0 +1,1706 @@
+from pandas_vb_common import *
+try:
+ from pandas.tseries.offsets import *
+except:
+ from pandas.core.datetools import *
+
+
+class frame_ctor_dtindex_BDayx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BDay(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BDayx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BDayx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BDay(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BDayx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BMonthBeginx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BMonthBegin(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BMonthBeginx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BMonthBeginx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BMonthBegin(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BMonthBeginx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BMonthEndx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BMonthEnd(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BMonthEndx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BMonthEndx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BMonthEnd(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BMonthEndx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BQuarterBeginx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BQuarterBegin(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BQuarterBeginx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BQuarterBeginx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BQuarterBegin(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BQuarterBeginx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BQuarterEndx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BQuarterEnd(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BQuarterEndx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BQuarterEndx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BQuarterEnd(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BQuarterEndx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BYearBeginx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BYearBegin(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BYearBeginx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BYearBeginx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(BYearBegin(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_BYearBeginx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BYearEndx1(object):  # benchmark: DataFrame ctor from dict of Series on a BYearEnd(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(BYearEnd(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_BYearEndx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BYearEndx2(object):  # benchmark: DataFrame ctor from dict of Series on a BYearEnd(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(BYearEnd(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_BYearEndx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BusinessDayx1(object):  # benchmark: DataFrame ctor from dict of Series on a BusinessDay(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(BusinessDay(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_BusinessDayx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BusinessDayx2(object):  # benchmark: DataFrame ctor from dict of Series on a BusinessDay(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(BusinessDay(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_BusinessDayx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BusinessHourx1(object):  # benchmark: DataFrame ctor from dict of Series on a BusinessHour(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(BusinessHour(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_BusinessHourx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_BusinessHourx2(object):  # benchmark: DataFrame ctor from dict of Series on a BusinessHour(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(BusinessHour(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_BusinessHourx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CBMonthBeginx1(object):  # benchmark: DataFrame ctor from dict of Series on a CBMonthBegin(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CBMonthBegin(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CBMonthBeginx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CBMonthBeginx2(object):  # benchmark: DataFrame ctor from dict of Series on a CBMonthBegin(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CBMonthBegin(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CBMonthBeginx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CBMonthEndx1(object):  # benchmark: DataFrame ctor from dict of Series on a CBMonthEnd(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CBMonthEnd(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CBMonthEndx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CBMonthEndx2(object):  # benchmark: DataFrame ctor from dict of Series on a CBMonthEnd(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CBMonthEnd(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CBMonthEndx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CDayx1(object):  # benchmark: DataFrame ctor from dict of Series on a CDay(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CDay(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CDayx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CDayx2(object):  # benchmark: DataFrame ctor from dict of Series on a CDay(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CDay(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CDayx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CustomBusinessDayx1(object):  # benchmark: DataFrame ctor from dict of Series on a CustomBusinessDay(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CustomBusinessDay(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CustomBusinessDayx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_CustomBusinessDayx2(object):  # benchmark: DataFrame ctor from dict of Series on a CustomBusinessDay(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(CustomBusinessDay(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_CustomBusinessDayx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_DateOffsetx1(object):  # benchmark: DataFrame ctor from dict of Series on a DateOffset(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(DateOffset(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_DateOffsetx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_DateOffsetx2(object):  # benchmark: DataFrame ctor from dict of Series on a DateOffset(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(DateOffset(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_DateOffsetx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Dayx1(object):  # benchmark: DataFrame ctor from dict of Series on a Day(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Day(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Dayx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Dayx2(object):  # benchmark: DataFrame ctor from dict of Series on a Day(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Day(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Dayx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Easterx1(object):  # benchmark: DataFrame ctor from dict of Series on an Easter(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Easter(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Easterx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Easterx2(object):  # benchmark: DataFrame ctor from dict of Series on an Easter(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Easter(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Easterx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253Quarterx1__variation_last(object):  # benchmark: DataFrame ctor on a FY5253Quarter(1, variation='last') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253Quarterx1__variation_last(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(object):  # benchmark: DataFrame ctor on a FY5253Quarter(1, variation='nearest') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253Quarterx2__variation_last(object):  # benchmark: DataFrame ctor on a FY5253Quarter(2, variation='last') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253Quarterx2__variation_last(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(object):  # benchmark: DataFrame ctor on a FY5253Quarter(2, variation='nearest') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253x1__variation_last(object):  # benchmark: DataFrame ctor on a FY5253(1, variation='last') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253x1__variation_last(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253x1__variation_nearest(object):  # benchmark: DataFrame ctor on a FY5253(1, variation='nearest') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253x1__variation_nearest(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253x2__variation_last(object):  # benchmark: DataFrame ctor on a FY5253(2, variation='last') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253x2__variation_last(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_FY5253x2__variation_nearest(object):  # benchmark: DataFrame ctor on a FY5253(2, variation='nearest') index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_FY5253x2__variation_nearest(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Hourx1(object):  # benchmark: DataFrame ctor from dict of Series on an Hour(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Hour(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Hourx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Hourx2(object):  # benchmark: DataFrame ctor from dict of Series on an Hour(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Hour(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Hourx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_LastWeekOfMonthx1(object):  # benchmark: DataFrame ctor from dict of Series on a LastWeekOfMonth(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(LastWeekOfMonth(1, **{'week': 1, 'weekday': 1, }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_LastWeekOfMonthx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_LastWeekOfMonthx2(object):  # benchmark: DataFrame ctor from dict of Series on a LastWeekOfMonth(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(LastWeekOfMonth(2, **{'week': 1, 'weekday': 1, }))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_LastWeekOfMonthx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Microx1(object):  # benchmark: DataFrame ctor from dict of Series on a Micro(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Micro(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Microx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Microx2(object):  # benchmark: DataFrame ctor from dict of Series on a Micro(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Micro(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Microx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Millix1(object):  # benchmark: DataFrame ctor from dict of Series on a Milli(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Milli(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Millix1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Millix2(object):  # benchmark: DataFrame ctor from dict of Series on a Milli(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Milli(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Millix2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Minutex1(object):  # benchmark: DataFrame ctor from dict of Series on a Minute(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Minute(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Minutex1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Minutex2(object):  # benchmark: DataFrame ctor from dict of Series on a Minute(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(Minute(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_Minutex2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_MonthBeginx1(object):  # benchmark: DataFrame ctor from dict of Series on a MonthBegin(1) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(MonthBegin(1, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_MonthBeginx1(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_MonthBeginx2(object):  # benchmark: DataFrame ctor from dict of Series on a MonthBegin(2) index
+    goal_time = 0.2  # target seconds per timing sample
+
+    def setup(self):  # build index (<=1000 periods, capped below Timestamp.max), frame, and column dict
+
+        def get_period_count(start_date, off):
+            self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+            if (self.ten_offsets_in_days == 0):  # sub-daily offset: 10 steps advance < 1 day
+                return 1000
+            else:
+                return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+        def get_index_for_offset(off):
+            self.start_date = Timestamp('1/1/1900')
+            return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+        self.idx = get_index_for_offset(MonthBegin(2, **{}))
+        self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+        self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+    def time_frame_ctor_dtindex_MonthBeginx2(self):  # timed: DataFrame construction only
+        DataFrame(self.d)
+
+
+class frame_ctor_dtindex_MonthEndx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(MonthEnd(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_MonthEndx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_MonthEndx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(MonthEnd(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_MonthEndx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Nanox1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(Nano(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_Nanox1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Nanox2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(Nano(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_Nanox2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_QuarterBeginx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(QuarterBegin(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_QuarterBeginx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_QuarterBeginx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(QuarterBegin(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_QuarterBeginx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_QuarterEndx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(QuarterEnd(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_QuarterEndx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_QuarterEndx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(QuarterEnd(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_QuarterEndx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Secondx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(Second(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_Secondx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Secondx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(Second(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_Secondx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_WeekOfMonthx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(WeekOfMonth(1, **{'week': 1, 'weekday': 1, }))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_WeekOfMonthx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_WeekOfMonthx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(WeekOfMonth(2, **{'week': 1, 'weekday': 1, }))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_WeekOfMonthx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Weekx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(Week(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_Weekx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_Weekx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(Week(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_Weekx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_YearBeginx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(YearBegin(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_YearBeginx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_YearBeginx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(YearBegin(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_YearBeginx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_YearEndx1(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(YearEnd(1, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_YearEndx1(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_dtindex_YearEndx2(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_period_count(start_date, off):
+ self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days
+ if (self.ten_offsets_in_days == 0):
+ return 1000
+ else:
+ return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000)
+
+ def get_index_for_offset(off):
+ self.start_date = Timestamp('1/1/1900')
+ return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off)
+ self.idx = get_index_for_offset(YearEnd(2, **{}))
+ self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx)
+ self.d = dict([(col, self.df[col]) for col in self.df.columns])
+
+ def time_frame_ctor_dtindex_YearEndx2(self):
+ DataFrame(self.d)
+
+
+class frame_ctor_list_of_dict(object):
+ goal_time = 0.2
+
+ def setup(self):
+ (N, K) = (5000, 50)
+ self.index = tm.makeStringIndex(N)
+ self.columns = tm.makeStringIndex(K)
+ self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
+ try:
+ self.data = self.frame.to_dict()
+ except:
+ self.data = self.frame.toDict()
+ self.some_dict = self.data.values()[0]
+ self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values]
+
+ def time_frame_ctor_list_of_dict(self):
+ DataFrame(self.dict_list)
+
+
+class frame_ctor_nested_dict(object):
+ goal_time = 0.2
+
+ def setup(self):
+ (N, K) = (5000, 50)
+ self.index = tm.makeStringIndex(N)
+ self.columns = tm.makeStringIndex(K)
+ self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
+ try:
+ self.data = self.frame.to_dict()
+ except:
+ self.data = self.frame.toDict()
+ self.some_dict = self.data.values()[0]
+ self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values]
+
+ def time_frame_ctor_nested_dict(self):
+ DataFrame(self.data)
+
+
+class frame_ctor_nested_dict_int64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = dict(((i, dict(((j, float(j)) for j in xrange(100)))) for i in xrange(2000)))
+
+ def time_frame_ctor_nested_dict_int64(self):
+ DataFrame(self.data)
+
+
+class frame_from_series(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)])
+ self.s = Series(randn(10000), index=self.mi)
+
+ def time_frame_from_series(self):
+ DataFrame(self.s)
+
+
+class frame_get_numeric_data(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 25))
+ self.df['foo'] = 'bar'
+ self.df['bar'] = 'baz'
+ self.df = self.df.consolidate()
+
+ def time_frame_get_numeric_data(self):
+ self.df._get_numeric_data()
+
+
+class series_ctor_from_dict(object):
+ goal_time = 0.2
+
+ def setup(self):
+ (N, K) = (5000, 50)
+ self.index = tm.makeStringIndex(N)
+ self.columns = tm.makeStringIndex(K)
+ self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
+ try:
+ self.data = self.frame.to_dict()
+ except:
+ self.data = self.frame.toDict()
+ self.some_dict = self.data.values()[0]
+ self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values]
+
+ def time_series_ctor_from_dict(self):
+ Series(self.some_dict)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
new file mode 100644
index 0000000000000..2bd51201b45ca
--- /dev/null
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -0,0 +1,936 @@
+from pandas_vb_common import *
+
+
+class frame_apply_axis_1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 100))
+
+ def time_frame_apply_axis_1(self):
+ self.df.apply((lambda x: (x + 1)), axis=1)
+
+
+class frame_apply_lambda_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 100))
+
+ def time_frame_apply_lambda_mean(self):
+ self.df.apply((lambda x: x.sum()))
+
+
+class frame_apply_np_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 100))
+
+ def time_frame_apply_np_mean(self):
+ self.df.apply(np.mean)
+
+
+class frame_apply_pass_thru(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 100))
+
+ def time_frame_apply_pass_thru(self):
+ self.df.apply((lambda x: x))
+
+
+class frame_apply_ref_by_name(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+
+ def time_frame_apply_ref_by_name(self):
+ self.df.apply((lambda x: (x['A'] + x['B'])), axis=1)
+
+
+class frame_apply_user_func(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.arange(1028.0))
+ self.df = DataFrame({i: self.s for i in range(1028)})
+
+ def time_frame_apply_user_func(self):
+ self.df.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)]))
+
+
+class frame_assign_timeseries_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = date_range('1/1/2000', periods=100000, freq='D')
+ self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx)
+
+ def f(x):
+ self.x = self.x.copy()
+ self.x['date'] = self.x.index
+
+ def time_frame_assign_timeseries_index(self):
+ f(self.df)
+
+
+class frame_boolean_row_select(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 100))
+ self.bool_arr = np.zeros(10000, dtype=bool)
+ self.bool_arr[:1000] = True
+
+ def time_frame_boolean_row_select(self):
+ self.df[self.bool_arr]
+
+
+class frame_count_level_axis0_mixed_dtypes_multi(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df['foo'] = 'bar'
+ self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+ self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+ def time_frame_count_level_axis0_mixed_dtypes_multi(self):
+ self.df.count(axis=0, level=1)
+
+
+class frame_count_level_axis0_multi(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+ self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+ def time_frame_count_level_axis0_multi(self):
+ self.df.count(axis=0, level=1)
+
+
+class frame_count_level_axis1_mixed_dtypes_multi(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df['foo'] = 'bar'
+ self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+ self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+ def time_frame_count_level_axis1_mixed_dtypes_multi(self):
+ self.df.count(axis=1, level=1)
+
+
+class frame_count_level_axis1_multi(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+ self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+ def time_frame_count_level_axis1_multi(self):
+ self.df.count(axis=1, level=1)
+
+
+class frame_dropna_axis0_all(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+
+ def time_frame_dropna_axis0_all(self):
+ self.df.dropna(how='all', axis=0)
+
+
+class frame_dropna_axis0_all_mixed_dtypes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df['foo'] = 'bar'
+
+ def time_frame_dropna_axis0_all_mixed_dtypes(self):
+ self.df.dropna(how='all', axis=0)
+
+
+class frame_dropna_axis0_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+
+ def time_frame_dropna_axis0_any(self):
+ self.df.dropna(how='any', axis=0)
+
+
+class frame_dropna_axis0_any_mixed_dtypes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df['foo'] = 'bar'
+
+ def time_frame_dropna_axis0_any_mixed_dtypes(self):
+ self.df.dropna(how='any', axis=0)
+
+
+class frame_dropna_axis1_all(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+
+ def time_frame_dropna_axis1_all(self):
+ self.df.dropna(how='all', axis=1)
+
+
+class frame_dropna_axis1_all_mixed_dtypes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df['foo'] = 'bar'
+
+ def time_frame_dropna_axis1_all_mixed_dtypes(self):
+ self.df.dropna(how='all', axis=1)
+
+
+class frame_dropna_axis1_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+
+ def time_frame_dropna_axis1_any(self):
+ self.df.dropna(how='any', axis=1)
+
+
+class frame_dropna_axis1_any_mixed_dtypes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(10000, 1000)
+ self.df = DataFrame(self.data)
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
+ self.df['foo'] = 'bar'
+
+ def time_frame_dropna_axis1_any_mixed_dtypes(self):
+ self.df.dropna(how='any', axis=1)
+
+
+class frame_dtypes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 1000))
+
+ def time_frame_dtypes(self):
+ self.df.dtypes
+
+
+class frame_duplicated(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = (1 << 20)
+ self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64))
+ self.xs = np.random.randn((self.n // 64)).round(2)
+ self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), })
+
+ def time_frame_duplicated(self):
+ self.df.duplicated()
+
+
+class frame_fancy_lookup(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
+ self.df['foo'] = 'bar'
+ self.row_labels = list(self.df.index[::10])[:900]
+ self.col_labels = (list(self.df.columns) * 100)
+ self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object')
+ self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object')
+
+ def time_frame_fancy_lookup(self):
+ self.df.lookup(self.row_labels, self.col_labels)
+
+
+class frame_fancy_lookup_all(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
+ self.df['foo'] = 'bar'
+ self.row_labels = list(self.df.index[::10])[:900]
+ self.col_labels = (list(self.df.columns) * 100)
+ self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object')
+ self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object')
+
+ def time_frame_fancy_lookup_all(self):
+ self.df.lookup(self.row_labels_all, self.col_labels_all)
+
+
+class frame_fillna_inplace(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 100))
+ self.df.values[::2] = np.nan
+
+ def time_frame_fillna_inplace(self):
+ self.df.fillna(0, inplace=True)
+
+
+class frame_float_equal(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_pair(frame):
+ self.df = frame
+ self.df2 = self.df.copy()
+ self.df2.ix[((-1), (-1))] = np.nan
+ return (self.df, self.df2)
+
+ def test_equal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df)
+
+ def test_unequal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df2)
+ self.float_df = DataFrame(np.random.randn(1000, 1000))
+ self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+ self.nonunique_cols = self.object_df.copy()
+ self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+ self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+ def time_frame_float_equal(self):
+ test_equal('float_df')
+
+
+class frame_float_unequal(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_pair(frame):
+ self.df = frame
+ self.df2 = self.df.copy()
+ self.df2.ix[((-1), (-1))] = np.nan
+ return (self.df, self.df2)
+
+ def test_equal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df)
+
+ def test_unequal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df2)
+ self.float_df = DataFrame(np.random.randn(1000, 1000))
+ self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+ self.nonunique_cols = self.object_df.copy()
+ self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+ self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+ def time_frame_float_unequal(self):
+ test_unequal('float_df')
+
+
+class frame_from_records_generator(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_data(n=100000):
+ return ((x, (x * 20), (x * 100)) for x in xrange(n))
+
+ def time_frame_from_records_generator(self):
+ self.df = DataFrame.from_records(get_data())
+
+
+class frame_from_records_generator_nrows(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def get_data(n=100000):
+ return ((x, (x * 20), (x * 100)) for x in xrange(n))
+
+ def time_frame_from_records_generator_nrows(self):
+ self.df = DataFrame.from_records(get_data(), nrows=1000)
+
+
+class frame_get_dtype_counts(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pandas.DataFrame(np.random.randn(10, 10000))
+
+ def time_frame_get_dtype_counts(self):
+ self.df.get_dtype_counts()
+
+
+class frame_getitem_single_column(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 1000))
+ self.df2 = DataFrame(randn(3000, 1), columns=['A'])
+ self.df3 = DataFrame(randn(3000, 1))
+
+ def f():
+ if hasattr(self.df, '_item_cache'):
+ self.df._item_cache.clear()
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def g():
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def h():
+ for i in xrange(10000):
+ self.df2['A']
+
+ def j():
+ for i in xrange(10000):
+ self.df3[0]
+
+ def time_frame_getitem_single_column(self):
+ h()
+
+
+class frame_getitem_single_column2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 1000))
+ self.df2 = DataFrame(randn(3000, 1), columns=['A'])
+ self.df3 = DataFrame(randn(3000, 1))
+
+ def f():
+ if hasattr(self.df, '_item_cache'):
+ self.df._item_cache.clear()
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def g():
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def h():
+ for i in xrange(10000):
+ self.df2['A']
+
+ def j():
+ for i in xrange(10000):
+ self.df3[0]
+
+ def time_frame_getitem_single_column2(self):
+ j()
+
+
+class frame_html_repr_trunc_mi(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.nrows = 10000
+ self.data = randn(self.nrows, 10)
+ self.idx = MultiIndex.from_arrays(np.tile(randn(3, (self.nrows / 100)), 100))
+ self.df = DataFrame(self.data, index=self.idx)
+
+ def time_frame_html_repr_trunc_mi(self):
+ self.df._repr_html_()
+
+
+class frame_html_repr_trunc_si(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.nrows = 10000
+ self.data = randn(self.nrows, 10)
+ self.idx = randn(self.nrows)
+ self.df = DataFrame(self.data, index=self.idx)
+
+ def time_frame_html_repr_trunc_si(self):
+ self.df._repr_html_()
+
+
+class frame_insert_100_columns_begin(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000
+
+ def f(K=100):
+ self.df = DataFrame(index=range(self.N))
+ self.new_col = np.random.randn(self.N)
+ for i in range(K):
+ self.df.insert(0, i, self.new_col)
+
+ def time_frame_insert_100_columns_begin(self):
+ f()
+
+
+class frame_insert_500_columns_end(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000
+
+ def f(K=500):
+ self.df = DataFrame(index=range(self.N))
+ self.new_col = np.random.randn(self.N)
+ for i in range(K):
+ self.df[i] = self.new_col
+
+ def time_frame_insert_500_columns_end(self):
+ f()
+
+
+class frame_interpolate(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 100))
+ self.df.values[::2] = np.nan
+
+ def time_frame_interpolate(self):
+ self.df.interpolate()
+
+
+class frame_interpolate_some_good(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), })
+ self.df.loc[1::5, 'A'] = np.nan
+ self.df.loc[1::5, 'C'] = np.nan
+
+ def time_frame_interpolate_some_good(self):
+ self.df.interpolate()
+
+
+class frame_interpolate_some_good_infer(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), })
+ self.df.loc[1::5, 'A'] = np.nan
+ self.df.loc[1::5, 'C'] = np.nan
+
+ def time_frame_interpolate_some_good_infer(self):
+ self.df.interpolate(downcast='infer')
+
+
+class frame_isnull(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(1000, 1000)
+ self.df = DataFrame(self.data)
+
+ def time_frame_isnull(self):
+ isnull(self.df)
+
+
+class frame_iteritems(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 1000))
+ self.df2 = DataFrame(randn(3000, 1), columns=['A'])
+ self.df3 = DataFrame(randn(3000, 1))
+
+ def f():
+ if hasattr(self.df, '_item_cache'):
+ self.df._item_cache.clear()
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def g():
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def h():
+ for i in xrange(10000):
+ self.df2['A']
+
+ def j():
+ for i in xrange(10000):
+ self.df3[0]
+
+ def time_frame_iteritems(self):
+ f()
+
+
+class frame_iteritems_cached(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 1000))
+ self.df2 = DataFrame(randn(3000, 1), columns=['A'])
+ self.df3 = DataFrame(randn(3000, 1))
+
+ def f():
+ if hasattr(self.df, '_item_cache'):
+ self.df._item_cache.clear()
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def g():
+ for (name, col) in self.df.iteritems():
+ pass
+
+ def h():
+ for i in xrange(10000):
+ self.df2['A']
+
+ def j():
+ for i in xrange(10000):
+ self.df3[0]
+
+ def time_frame_iteritems_cached(self):
+ g()
+
+
+class frame_mask_bools(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(1000, 500)
+ self.df = DataFrame(self.data)
+ self.df = self.df.where((self.df > 0))
+ self.bools = (self.df > 0)
+ self.mask = isnull(self.df)
+
+ def time_frame_mask_bools(self):
+ self.bools.mask(self.mask)
+
+
+class frame_mask_floats(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(1000, 500)
+ self.df = DataFrame(self.data)
+ self.df = self.df.where((self.df > 0))
+ self.bools = (self.df > 0)
+ self.mask = isnull(self.df)
+
+ def time_frame_mask_floats(self):
+ self.bools.astype(float).mask(self.mask)
+
+
+class frame_nonunique_equal(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_pair(frame):
+ self.df = frame
+ self.df2 = self.df.copy()
+ self.df2.ix[((-1), (-1))] = np.nan
+ return (self.df, self.df2)
+
+ def test_equal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df)
+
+ def test_unequal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df2)
+ self.float_df = DataFrame(np.random.randn(1000, 1000))
+ self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+ self.nonunique_cols = self.object_df.copy()
+ self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+ self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+ def time_frame_nonunique_equal(self):
+ test_equal('nonunique_cols')
+
+
+class frame_nonunique_unequal(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_pair(frame):
+ self.df = frame
+ self.df2 = self.df.copy()
+ self.df2.ix[((-1), (-1))] = np.nan
+ return (self.df, self.df2)
+
+ def test_equal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df)
+
+ def test_unequal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df2)
+ self.float_df = DataFrame(np.random.randn(1000, 1000))
+ self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+ self.nonunique_cols = self.object_df.copy()
+ self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+ self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+ def time_frame_nonunique_unequal(self):
+ test_unequal('nonunique_cols')
+
+
+class frame_object_equal(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_pair(frame):
+ self.df = frame
+ self.df2 = self.df.copy()
+ self.df2.ix[((-1), (-1))] = np.nan
+ return (self.df, self.df2)
+
+ def test_equal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df)
+
+ def test_unequal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df2)
+ self.float_df = DataFrame(np.random.randn(1000, 1000))
+ self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+ self.nonunique_cols = self.object_df.copy()
+ self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+ self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+ def time_frame_object_equal(self):
+ test_equal('object_df')
+
+
+class frame_object_unequal(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_pair(frame):
+ self.df = frame
+ self.df2 = self.df.copy()
+ self.df2.ix[((-1), (-1))] = np.nan
+ return (self.df, self.df2)
+
+ def test_equal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df)
+
+ def test_unequal(name):
+ (self.df, self.df2) = pairs[name]
+ return self.df.equals(self.df2)
+ self.float_df = DataFrame(np.random.randn(1000, 1000))
+ self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+ self.nonunique_cols = self.object_df.copy()
+ self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+ self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+ def time_frame_object_unequal(self):
+ test_unequal('object_df')
+
+
+class frame_reindex_axis0(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 10000))
+ self.idx = np.arange(4000, 7000)
+
+ def time_frame_reindex_axis0(self):
+ self.df.reindex(self.idx)
+
+
+class frame_reindex_axis1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 10000))
+ self.idx = np.arange(4000, 7000)
+
+ def time_frame_reindex_axis1(self):
+ self.df.reindex(columns=self.idx)
+
+
+class frame_reindex_both_axes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 10000))
+ self.idx = np.arange(4000, 7000)
+
+ def time_frame_reindex_both_axes(self):
+ self.df.reindex(index=self.idx, columns=self.idx)
+
+
+class frame_reindex_both_axes_ix(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(10000, 10000))
+ self.idx = np.arange(4000, 7000)
+
+ def time_frame_reindex_both_axes_ix(self):
+ self.df.ix[(self.idx, self.idx)]
+
+
+class frame_reindex_upcast(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), 1: randint(0, 1000, 1000).astype(np.int16), 2: randint(0, 1000, 1000).astype(np.int32), 3: randint(0, 1000, 1000).astype(np.int64), }[randint(0, 4)]) for c in range(1000)]))
+
+ def time_frame_reindex_upcast(self):
+ self.df.reindex(permutation(range(1200)))
+
+
+class frame_repr_tall(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pandas.DataFrame(np.random.randn(10000, 10))
+
+ def time_frame_repr_tall(self):
+ repr(self.df)
+
+
+class frame_repr_wide(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pandas.DataFrame(np.random.randn(10, 10000))
+
+ def time_frame_repr_wide(self):
+ repr(self.df)
+
+
+class frame_shift_axis0(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.rand(10000, 500))
+
+ def time_frame_shift_axis0(self):
+ self.df.shift(1, axis=0)
+
+
+class frame_shift_axis_1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.rand(10000, 500))
+
+ def time_frame_shift_axis_1(self):
+ self.df.shift(1, axis=1)
+
+
+class frame_to_html_mixed(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.nrows = 500
+ self.df = DataFrame(randn(self.nrows, 10))
+ self.df[0] = period_range('2000', '2010', self.nrows)
+ self.df[1] = range(self.nrows)
+
+ def time_frame_to_html_mixed(self):
+ self.df.to_html()
+
+
+class frame_to_string_floats(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(100, 10))
+
+ def time_frame_to_string_floats(self):
+ self.df.to_string()
+
+
+class frame_xs_col(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(1, 100000))
+
+ def time_frame_xs_col(self):
+ self.df.xs(50000, axis=1)
+
+
+class frame_xs_row(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(randn(100000, 1))
+
+ def time_frame_xs_row(self):
+ self.df.xs(50000)
+
+
+class series_string_vector_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series((['abcdefg', np.nan] * 500000))
+
+ def time_series_string_vector_slice(self):
+ self.s.str[:5]
\ No newline at end of file
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
new file mode 100644
index 0000000000000..b0486617a52af
--- /dev/null
+++ b/asv_bench/benchmarks/gil.py
@@ -0,0 +1,267 @@
+from pandas_vb_common import *
+from pandas.core import common as com
+from pandas.util.testing import test_parallel
+
+
+class nogil_groupby_count_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].count()
+
+ def time_nogil_groupby_count_2(self):
+ pg2()
+
+
+class nogil_groupby_last_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].last()
+
+ def time_nogil_groupby_last_2(self):
+ pg2()
+
+
+class nogil_groupby_max_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].max()
+
+ def time_nogil_groupby_max_2(self):
+ pg2()
+
+
+class nogil_groupby_mean_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].mean()
+
+ def time_nogil_groupby_mean_2(self):
+ pg2()
+
+
+class nogil_groupby_min_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].min()
+
+ def time_nogil_groupby_min_2(self):
+ pg2()
+
+
+class nogil_groupby_prod_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].prod()
+
+ def time_nogil_groupby_prod_2(self):
+ pg2()
+
+
+class nogil_groupby_sum_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].sum()
+
+ def time_nogil_groupby_sum_2(self):
+ pg2()
+
+
+class nogil_groupby_sum_4(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ def f():
+ self.df.groupby('key')['data'].sum()
+
+ def g2():
+ for i in range(2):
+ f()
+
+ def g4():
+ for i in range(4):
+ f()
+
+ def g8():
+ for i in range(8):
+ f()
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ f()
+
+ @test_parallel(num_threads=4)
+ def pg4():
+ f()
+
+ @test_parallel(num_threads=8)
+ def pg8():
+ f()
+
+ def time_nogil_groupby_sum_4(self):
+ pg4()
+
+
+class nogil_groupby_sum_8(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ def f():
+ self.df.groupby('key')['data'].sum()
+
+ def g2():
+ for i in range(2):
+ f()
+
+ def g4():
+ for i in range(4):
+ f()
+
+ def g8():
+ for i in range(8):
+ f()
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ f()
+
+ @test_parallel(num_threads=4)
+ def pg4():
+ f()
+
+ @test_parallel(num_threads=8)
+ def pg8():
+ f()
+
+ def time_nogil_groupby_sum_8(self):
+ pg8()
+
+
+class nogil_groupby_var_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+ @test_parallel(num_threads=2)
+ def pg2():
+ self.df.groupby('key')['data'].var()
+
+ def time_nogil_groupby_var_2(self):
+ pg2()
+
+
+class nogil_take1d_float64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+ self.N = 10000000.0
+ self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), })
+ self.indexer = np.arange(100, (len(self.df) - 100))
+
+ @test_parallel(num_threads=2)
+ def take_1d_pg2_int64():
+ com.take_1d(self.df.int64.values, self.indexer)
+
+ @test_parallel(num_threads=2)
+ def take_1d_pg2_float64():
+ com.take_1d(self.df.float64.values, self.indexer)
+
+ def time_nogil_take1d_float64(self):
+ take_1d_pg2_int64()
+
+
+class nogil_take1d_int64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.ngroups = 1000
+ np.random.seed(1234)
+ self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+ self.N = 10000000.0
+ self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), })
+ self.indexer = np.arange(100, (len(self.df) - 100))
+
+ @test_parallel(num_threads=2)
+ def take_1d_pg2_int64():
+ com.take_1d(self.df.int64.values, self.indexer)
+
+ @test_parallel(num_threads=2)
+ def take_1d_pg2_float64():
+ com.take_1d(self.df.float64.values, self.indexer)
+
+ def time_nogil_take1d_int64(self):
+ take_1d_pg2_float64()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
new file mode 100644
index 0000000000000..4f1f4e46b4a31
--- /dev/null
+++ b/asv_bench/benchmarks/groupby.py
@@ -0,0 +1,1683 @@
+from pandas_vb_common import *
+from itertools import product
+from string import ascii_letters, digits
+
+
+class groupby_agg_builtins1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(27182)
+ self.n = 100000
+ self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+
+ def time_groupby_agg_builtins1(self):
+ self.df.groupby('jim').agg([sum, min, max])
+
+
+class groupby_agg_builtins2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(27182)
+ self.n = 100000
+ self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+
+ def time_groupby_agg_builtins2(self):
+ self.df.groupby(['jim', 'joe']).agg([sum, min, max])
+
+
+class groupby_apply_dict_return(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(1000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.f = (lambda x: {'first': x.values[0], 'last': x.values[(-1)], })
+
+ def time_groupby_apply_dict_return(self):
+ self.data.groupby(self.labels).apply(self.f)
+
+
+class groupby_dt_size(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 100000
+ self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+ self.dates = (np.datetime64('now') + self.offsets)
+ self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+
+ def time_groupby_dt_size(self):
+ self.df.groupby(['dates']).size()
+
+
+class groupby_dt_timegrouper_size(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 100000
+ self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+ self.dates = (np.datetime64('now') + self.offsets)
+ self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+
+ def time_groupby_dt_timegrouper_size(self):
+ self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
+
+
+class groupby_first_datetimes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), })
+
+ def time_groupby_first_datetimes(self):
+ self.df.groupby('b').first()
+
+
+class groupby_first_float32(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_first_float32(self):
+ self.data2.groupby(self.labels).first()
+
+
+class groupby_first_float64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_first_float64(self):
+ self.data.groupby(self.labels).first()
+
+
+class groupby_first_object(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+
+ def time_groupby_first_object(self):
+ self.df.groupby('b').first()
+
+
+class groupby_frame_apply(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.labels = np.random.randint(0, 2000, size=self.N)
+ self.labels2 = np.random.randint(0, 3, size=self.N)
+ self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), })
+
+ def f(g):
+ return 1
+
+ def time_groupby_frame_apply(self):
+ self.df.groupby(['key', 'key2']).apply(f)
+
+
+class groupby_frame_apply_overhead(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.labels = np.random.randint(0, 2000, size=self.N)
+ self.labels2 = np.random.randint(0, 3, size=self.N)
+ self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), })
+
+ def f(g):
+ return 1
+
+ def time_groupby_frame_apply_overhead(self):
+ self.df.groupby('key').apply(f)
+
+
+class groupby_frame_cython_many_columns(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.random.randint(0, 100, size=1000)
+ self.df = DataFrame(randn(1000, 1000))
+
+ def time_groupby_frame_cython_many_columns(self):
+ self.df.groupby(self.labels).sum()
+
+
+class groupby_frame_median(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(100000, 2)
+ self.labels = np.random.randint(0, 1000, size=100000)
+ self.df = DataFrame(self.data)
+
+ def time_groupby_frame_median(self):
+ self.df.groupby(self.labels).median()
+
+
+class groupby_frame_nth_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randint(1, 100, (10000, 2)))
+
+ def time_groupby_frame_nth_any(self):
+ self.df.groupby(0).nth(0, dropna='any')
+
+
+class groupby_frame_nth_none(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randint(1, 100, (10000, 2)))
+
+ def time_groupby_frame_nth_none(self):
+ self.df.groupby(0).nth(0)
+
+
+class groupby_frame_singlekey_integer(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(100000, 1)
+ self.labels = np.random.randint(0, 1000, size=100000)
+ self.df = DataFrame(self.data)
+
+ def time_groupby_frame_singlekey_integer(self):
+ self.df.groupby(self.labels).sum()
+
+
+class groupby_indices(object):
+ goal_time = 0.2
+
+ def setup(self):
+ try:
+ self.rng = date_range('1/1/2000', '12/31/2005', freq='H')
+ (year, month, day) = (self.rng.year, self.rng.month, self.rng.day)
+ except:
+ self.rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour())
+ self.year = self.rng.map((lambda x: x.year))
+ self.month = self.rng.map((lambda x: x.month))
+ self.day = self.rng.map((lambda x: x.day))
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+
+ def time_groupby_indices(self):
+ len(self.ts.groupby([self.year, self.month, self.day]))
+
+
+class groupby_int64_overflow(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.arr = np.random.randint(((-1) << 12), (1 << 12), ((1 << 17), 5))
+ self.i = np.random.choice(len(self.arr), (len(self.arr) * 5))
+ self.arr = np.vstack((self.arr, self.arr[self.i]))
+ self.i = np.random.permutation(len(self.arr))
+ self.arr = self.arr[self.i]
+ self.df = DataFrame(self.arr, columns=list('abcde'))
+ (self.df['jim'], self.df['joe']) = (np.random.randn(2, len(self.df)) * 10)
+
+ def time_groupby_int64_overflow(self):
+ self.df.groupby(list('abcde')).max()
+
+
+class groupby_int_count(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 10000
+ self.df = DataFrame({'key1': randint(0, 500, size=self.n), 'key2': randint(0, 100, size=self.n), 'ints': randint(0, 1000, size=self.n), 'ints2': randint(0, 1000, size=self.n), })
+
+ def time_groupby_int_count(self):
+ self.df.groupby(['key1', 'key2']).count()
+
+
+class groupby_last_datetimes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), })
+
+ def time_groupby_last_datetimes(self):
+ self.df.groupby('b').last()
+
+
+class groupby_last_float32(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_last_float32(self):
+ self.data2.groupby(self.labels).last()
+
+
+class groupby_last_float64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_last_float64(self):
+ self.data.groupby(self.labels).last()
+
+
+class groupby_last_object(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+
+ def time_groupby_last_object(self):
+ self.df.groupby('b').last()
+
+
+class groupby_multi_count(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 10000
+ self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+ self.dates = (np.datetime64('now') + self.offsets)
+ self.dates[(np.random.rand(self.n) > 0.5)] = np.datetime64('nat')
+ self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
+ self.value2 = np.random.randn(self.n)
+ self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
+ self.obj = tm.choice(list('ab'), size=self.n).astype(object)
+ self.obj[(np.random.randn(self.n) > 0.5)] = np.nan
+ self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'dates': self.dates, 'value2': self.value2, 'value3': np.random.randn(self.n), 'ints': np.random.randint(0, 1000, size=self.n), 'obj': self.obj, 'offsets': self.offsets, })
+
+ def time_groupby_multi_count(self):
+ self.df.groupby(['key1', 'key2']).count()
+
+
+class groupby_multi_cython(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.ngroups = 100
+
+ def get_test_data(ngroups=100, n=self.N):
+ self.unique_groups = range(self.ngroups)
+ self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object)
+ if (len(self.arr) < n):
+ self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object)
+ random.shuffle(self.arr)
+ return self.arr
+ self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
+
+ def f():
+ self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum()))
+ self.simple_series = Series(np.random.randn(self.N))
+ self.key1 = self.df['key1']
+
+ def time_groupby_multi_cython(self):
+ self.df.groupby(['key1', 'key2']).sum()
+
+
+class groupby_multi_different_functions(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
+ self.fac2 = np.array(['one', 'two'], dtype='O')
+ self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
+
+ def time_groupby_multi_different_functions(self):
+ self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum', })
+
+
+class groupby_multi_different_numpy_functions(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
+ self.fac2 = np.array(['one', 'two'], dtype='O')
+ self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
+
+ def time_groupby_multi_different_numpy_functions(self):
+ self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum, })
+
+
+class groupby_multi_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = (((5 * 7) * 11) * (1 << 9))
+ self.alpha = list(map(''.join, product((ascii_letters + digits), repeat=4)))
+ self.f = (lambda k: np.repeat(np.random.choice(self.alpha, (self.n // k)), k))
+ self.df = DataFrame({'a': self.f(11), 'b': self.f(7), 'c': self.f(5), 'd': self.f(1), })
+ self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
+ self.i = np.random.permutation(len(self.df))
+ self.df = self.df.iloc[self.i].reset_index(drop=True).copy()
+
+ def time_groupby_multi_index(self):
+ self.df.groupby(list('abcd')).max()
+
+
+class groupby_multi_python(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.ngroups = 100
+
+ def get_test_data(ngroups=100, n=self.N):
+ self.unique_groups = range(self.ngroups)
+ self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object)
+ if (len(self.arr) < n):
+ self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object)
+ random.shuffle(self.arr)
+ return self.arr
+ self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
+
+ def f():
+ self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum()))
+ self.simple_series = Series(np.random.randn(self.N))
+ self.key1 = self.df['key1']
+
+ def time_groupby_multi_python(self):
+ self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum()))
+
+
+class groupby_multi_series_op(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.ngroups = 100
+
+ def get_test_data(ngroups=100, n=self.N):
+ self.unique_groups = range(self.ngroups)
+ self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object)
+ if (len(self.arr) < n):
+ self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object)
+ random.shuffle(self.arr)
+ return self.arr
+ self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
+
+ def f():
+ self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum()))
+ self.simple_series = Series(np.random.randn(self.N))
+ self.key1 = self.df['key1']
+
+ def time_groupby_multi_series_op(self):
+ self.df.groupby(['key1', 'key2'])['data1'].agg(np.std)
+
+
+class groupby_multi_size(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 100000
+ self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+ self.dates = (np.datetime64('now') + self.offsets)
+ self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+
+ def time_groupby_multi_size(self):
+ self.df.groupby(['key1', 'key2']).size()
+
+
+class groupby_ngroups_10000_all(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_all(self):
+ self.df.groupby('value')['timestamp'].all()
+
+
+class groupby_ngroups_10000_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_any(self):
+ self.df.groupby('value')['timestamp'].any()
+
+
+class groupby_ngroups_10000_count(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_count(self):
+ self.df.groupby('value')['timestamp'].count()
+
+
+class groupby_ngroups_10000_cumcount(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_cumcount(self):
+ self.df.groupby('value')['timestamp'].cumcount()
+
+
+class groupby_ngroups_10000_cummax(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_cummax(self):
+ self.df.groupby('value')['timestamp'].cummax()
+
+
+class groupby_ngroups_10000_cummin(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_cummin(self):
+ self.df.groupby('value')['timestamp'].cummin()
+
+
+class groupby_ngroups_10000_cumprod(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_cumprod(self):
+ self.df.groupby('value')['timestamp'].cumprod()
+
+
+class groupby_ngroups_10000_cumsum(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_cumsum(self):
+ self.df.groupby('value')['timestamp'].cumsum()
+
+
+class groupby_ngroups_10000_describe(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_describe(self):
+ self.df.groupby('value')['timestamp'].describe()
+
+
+class groupby_ngroups_10000_diff(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_diff(self):
+ self.df.groupby('value')['timestamp'].diff()
+
+
+class groupby_ngroups_10000_first(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_first(self):
+ self.df.groupby('value')['timestamp'].first()
+
+
+class groupby_ngroups_10000_head(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_head(self):
+ self.df.groupby('value')['timestamp'].head()
+
+
+class groupby_ngroups_10000_last(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_last(self):
+ self.df.groupby('value')['timestamp'].last()
+
+
+class groupby_ngroups_10000_mad(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_mad(self):
+ self.df.groupby('value')['timestamp'].mad()
+
+
+class groupby_ngroups_10000_max(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_max(self):
+ self.df.groupby('value')['timestamp'].max()
+
+
+class groupby_ngroups_10000_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_mean(self):
+ self.df.groupby('value')['timestamp'].mean()
+
+
+class groupby_ngroups_10000_median(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_median(self):
+ self.df.groupby('value')['timestamp'].median()
+
+
+class groupby_ngroups_10000_min(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_min(self):
+ self.df.groupby('value')['timestamp'].min()
+
+
+class groupby_ngroups_10000_nunique(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_nunique(self):
+ self.df.groupby('value')['timestamp'].nunique()
+
+
+class groupby_ngroups_10000_pct_change(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_pct_change(self):
+ self.df.groupby('value')['timestamp'].pct_change()
+
+
+class groupby_ngroups_10000_prod(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_prod(self):
+ self.df.groupby('value')['timestamp'].prod()
+
+
+class groupby_ngroups_10000_rank(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_rank(self):
+ self.df.groupby('value')['timestamp'].rank()
+
+
+class groupby_ngroups_10000_sem(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_sem(self):
+ self.df.groupby('value')['timestamp'].sem()
+
+
+class groupby_ngroups_10000_size(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_size(self):
+ self.df.groupby('value')['timestamp'].size()
+
+
+class groupby_ngroups_10000_skew(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_skew(self):
+ self.df.groupby('value')['timestamp'].skew()
+
+
+class groupby_ngroups_10000_std(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_std(self):
+ self.df.groupby('value')['timestamp'].std()
+
+
+class groupby_ngroups_10000_sum(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_sum(self):
+ self.df.groupby('value')['timestamp'].sum()
+
+
+class groupby_ngroups_10000_tail(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_tail(self):
+ self.df.groupby('value')['timestamp'].tail()
+
+
+class groupby_ngroups_10000_unique(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_unique(self):
+ self.df.groupby('value')['timestamp'].unique()
+
+
+class groupby_ngroups_10000_value_counts(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_value_counts(self):
+ self.df.groupby('value')['timestamp'].value_counts()
+
+
+class groupby_ngroups_10000_var(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 10000
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_10000_var(self):
+ self.df.groupby('value')['timestamp'].var()
+
+
+class groupby_ngroups_100_all(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_all(self):
+ self.df.groupby('value')['timestamp'].all()
+
+
+class groupby_ngroups_100_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_any(self):
+ self.df.groupby('value')['timestamp'].any()
+
+
+class groupby_ngroups_100_count(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_count(self):
+ self.df.groupby('value')['timestamp'].count()
+
+
+class groupby_ngroups_100_cumcount(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_cumcount(self):
+ self.df.groupby('value')['timestamp'].cumcount()
+
+
+class groupby_ngroups_100_cummax(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_cummax(self):
+ self.df.groupby('value')['timestamp'].cummax()
+
+
+class groupby_ngroups_100_cummin(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_cummin(self):
+ self.df.groupby('value')['timestamp'].cummin()
+
+
+class groupby_ngroups_100_cumprod(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_cumprod(self):
+ self.df.groupby('value')['timestamp'].cumprod()
+
+
+class groupby_ngroups_100_cumsum(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_cumsum(self):
+ self.df.groupby('value')['timestamp'].cumsum()
+
+
+class groupby_ngroups_100_describe(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_describe(self):
+ self.df.groupby('value')['timestamp'].describe()
+
+
+class groupby_ngroups_100_diff(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_diff(self):
+ self.df.groupby('value')['timestamp'].diff()
+
+
+class groupby_ngroups_100_first(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_first(self):
+ self.df.groupby('value')['timestamp'].first()
+
+
+class groupby_ngroups_100_head(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_head(self):
+ self.df.groupby('value')['timestamp'].head()
+
+
+class groupby_ngroups_100_last(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_last(self):
+ self.df.groupby('value')['timestamp'].last()
+
+
+class groupby_ngroups_100_mad(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_mad(self):
+ self.df.groupby('value')['timestamp'].mad()
+
+
+class groupby_ngroups_100_max(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_max(self):
+ self.df.groupby('value')['timestamp'].max()
+
+
+class groupby_ngroups_100_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_mean(self):
+ self.df.groupby('value')['timestamp'].mean()
+
+
+class groupby_ngroups_100_median(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_median(self):
+ self.df.groupby('value')['timestamp'].median()
+
+
+class groupby_ngroups_100_min(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_min(self):
+ self.df.groupby('value')['timestamp'].min()
+
+
+class groupby_ngroups_100_nunique(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_nunique(self):
+ self.df.groupby('value')['timestamp'].nunique()
+
+
+class groupby_ngroups_100_pct_change(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_pct_change(self):
+ self.df.groupby('value')['timestamp'].pct_change()
+
+
+class groupby_ngroups_100_prod(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_prod(self):
+ self.df.groupby('value')['timestamp'].prod()
+
+
+class groupby_ngroups_100_rank(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_rank(self):
+ self.df.groupby('value')['timestamp'].rank()
+
+
+class groupby_ngroups_100_sem(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_sem(self):
+ self.df.groupby('value')['timestamp'].sem()
+
+
+class groupby_ngroups_100_size(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_size(self):
+ self.df.groupby('value')['timestamp'].size()
+
+
+class groupby_ngroups_100_skew(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_skew(self):
+ self.df.groupby('value')['timestamp'].skew()
+
+
+class groupby_ngroups_100_std(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_std(self):
+ self.df.groupby('value')['timestamp'].std()
+
+
+class groupby_ngroups_100_sum(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_sum(self):
+ self.df.groupby('value')['timestamp'].sum()
+
+
+class groupby_ngroups_100_tail(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_tail(self):
+ self.df.groupby('value')['timestamp'].tail()
+
+
+class groupby_ngroups_100_unique(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_unique(self):
+ self.df.groupby('value')['timestamp'].unique()
+
+
+class groupby_ngroups_100_value_counts(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_value_counts(self):
+ self.df.groupby('value')['timestamp'].value_counts()
+
+
+class groupby_ngroups_100_var(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.ngroups = 100
+ self.size = (self.ngroups * 2)
+ self.rng = np.arange(self.ngroups)
+ self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+
+ def time_groupby_ngroups_100_var(self):
+ self.df.groupby('value')['timestamp'].var()
+
+
+class groupby_nth_datetimes_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), })
+
+ def time_groupby_nth_datetimes_any(self):
+ self.df.groupby('b').nth(0, dropna='all')
+
+
+class groupby_nth_datetimes_none(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), })
+
+ def time_groupby_nth_datetimes_none(self):
+ self.df.groupby('b').nth(0)
+
+
+class groupby_nth_float32_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_nth_float32_any(self):
+ self.data2.groupby(self.labels).nth(0, dropna='all')
+
+
+class groupby_nth_float32_none(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_nth_float32_none(self):
+ self.data2.groupby(self.labels).nth(0)
+
+
+class groupby_nth_float64_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_nth_float64_any(self):
+ self.data.groupby(self.labels).nth(0, dropna='all')
+
+
+class groupby_nth_float64_none(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.labels = np.arange(10000).repeat(10)
+ self.data = Series(randn(len(self.labels)))
+ self.data[::3] = np.nan
+ self.data[1::3] = np.nan
+ self.data2 = Series(randn(len(self.labels)), dtype='float32')
+ self.data2[::3] = np.nan
+ self.data2[1::3] = np.nan
+ self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+ def time_groupby_nth_float64_none(self):
+ self.data.groupby(self.labels).nth(0)
+
+
+class groupby_nth_object_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+
+ def time_groupby_nth_object_any(self):
+ self.df.groupby('b').nth(0, dropna='any')
+
+
+class groupby_nth_object_none(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+
+ def time_groupby_nth_object_none(self):
+ self.df.groupby('b').nth(0)
+
+
+class groupby_pivot_table(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
+ self.fac2 = np.array(['one', 'two'], dtype='O')
+ self.ind1 = np.random.randint(0, 3, size=100000)
+ self.ind2 = np.random.randint(0, 2, size=100000)
+ self.df = DataFrame({'key1': self.fac1.take(self.ind1), 'key2': self.fac2.take(self.ind2), 'key3': self.fac2.take(self.ind2), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
+
+ def time_groupby_pivot_table(self):
+ self.df.pivot_table(index='key1', columns=['key2', 'key3'])
+
+
+class groupby_series_nth_any(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randint(1, 100, (10000, 2)))
+
+ def time_groupby_series_nth_any(self):
+ self.df[1].groupby(self.df[0]).nth(0, dropna='any')
+
+
+class groupby_series_nth_none(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randint(1, 100, (10000, 2)))
+
+ def time_groupby_series_nth_none(self):
+ self.df[1].groupby(self.df[0]).nth(0)
+
+
+class groupby_series_simple_cython(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.ngroups = 100
+
+ def get_test_data(ngroups=100, n=self.N):
+ self.unique_groups = range(self.ngroups)
+ self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object)
+ if (len(self.arr) < n):
+ self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object)
+ random.shuffle(self.arr)
+ return self.arr
+ self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
+
+ def f():
+ self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum()))
+ self.simple_series = Series(np.random.randn(self.N))
+ self.key1 = self.df['key1']
+
+ def time_groupby_series_simple_cython(self):
+ self.df.groupby('key1').rank(pct=True)
+
+
+class groupby_simple_compress_timing(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = np.random.randn(1000000, 2)
+ self.labels = np.random.randint(0, 1000, size=1000000)
+ self.df = DataFrame(self.data)
+
+ def time_groupby_simple_compress_timing(self):
+ self.df.groupby(self.labels).mean()
+
+
+class groupby_sum_booleans(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500
+ self.df = DataFrame({'ii': range(self.N), 'bb': [True for x in range(self.N)], })
+
+ def time_groupby_sum_booleans(self):
+ self.df.groupby('ii').sum()
+
+
+class groupby_sum_multiindex(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 50
+ self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B'])
+
+ def time_groupby_sum_multiindex(self):
+ self.df.groupby(level=[0, 1]).sum()
+
+
+class groupby_transform(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n_dates = 400
+ self.n_securities = 250
+ self.n_columns = 3
+ self.share_na = 0.1
+ self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B')
+ self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates))
+ self.secid_min = int('10000000', 16)
+ self.secid_max = int('F0000000', 16)
+ self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1))
+ self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step))
+ self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], labels=[[i for i in xrange(self.n_dates) for _ in xrange(self.n_securities)], (range(self.n_securities) * self.n_dates)], names=['date', 'security_id'])
+ self.n_data = len(self.data_index)
+ self.columns = Index(['factor{}'.format(i) for i in xrange(1, (self.n_columns + 1))])
+ self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns)
+ self.step = int((self.n_data * self.share_na))
+ for column_index in xrange(self.n_columns):
+ self.index = column_index
+ while (self.index < self.n_data):
+ self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan)
+ self.index += self.step
+ self.f_fillna = (lambda x: x.fillna(method='pad'))
+
+ def time_groupby_transform(self):
+ self.data.groupby(level='security_id').transform(self.f_fillna)
+
+
+class groupby_transform_multi_key1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(2718281)
+ self.n = 20000
+ self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+
+ def time_groupby_transform_multi_key1(self):
+ self.df.groupby(['jim', 'joe'])['jolie'].transform('max')
+
+
+class groupby_transform_multi_key2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(2718281)
+ self.n = 20000
+ self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+ self.df['jim'] = self.df['joe']
+
+ def time_groupby_transform_multi_key2(self):
+ self.df.groupby(['jim', 'joe'])['jolie'].transform('max')
+
+
+class groupby_transform_multi_key3(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(2718281)
+ self.n = 200000
+ self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+
+ def time_groupby_transform_multi_key3(self):
+ self.df.groupby(['jim', 'joe'])['jolie'].transform('max')
+
+
+class groupby_transform_multi_key4(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(2718281)
+ self.n = 200000
+ self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+ self.df['jim'] = self.df['joe']
+
+ def time_groupby_transform_multi_key4(self):
+ self.df.groupby(['jim', 'joe'])['jolie'].transform('max')
+
+
+class groupby_transform_series(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(0)
+ self.N = 120000
+ self.N_TRANSITIONS = 1400
+ self.transition_points = np.random.permutation(np.arange(self.N))[:self.N_TRANSITIONS]
+ self.transition_points.sort()
+ self.transitions = np.zeros((self.N,), dtype=np.bool)
+ self.transitions[self.transition_points] = True
+ self.g = self.transitions.cumsum()
+ self.df = DataFrame({'signal': np.random.rand(self.N), })
+
+ def time_groupby_transform_series(self):
+ self.df['signal'].groupby(self.g).transform(np.mean)
+
+
+class groupby_transform_series2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(0)
+ self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), })
+
+ def time_groupby_transform_series2(self):
+ self.df.groupby('id')['val'].transform(np.mean)
+
+
+class groupby_transform_ufunc(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n_dates = 400
+ self.n_securities = 250
+ self.n_columns = 3
+ self.share_na = 0.1
+ self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B')
+ self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates))
+ self.secid_min = int('10000000', 16)
+ self.secid_max = int('F0000000', 16)
+ self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1))
+ self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step))
+ self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], labels=[[i for i in xrange(self.n_dates) for _ in xrange(self.n_securities)], (range(self.n_securities) * self.n_dates)], names=['date', 'security_id'])
+ self.n_data = len(self.data_index)
+ self.columns = Index(['factor{}'.format(i) for i in xrange(1, (self.n_columns + 1))])
+ self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns)
+ self.step = int((self.n_data * self.share_na))
+ for column_index in xrange(self.n_columns):
+ self.index = column_index
+ while (self.index < self.n_data):
+ self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan)
+ self.index += self.step
+ self.f_fillna = (lambda x: x.fillna(method='pad'))
+
+ def time_groupby_transform_ufunc(self):
+ self.data.groupby(level='date').transform(np.max)
+
+
+class series_value_counts_int64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.randint(0, 1000, size=100000))
+
+ def time_series_value_counts_int64(self):
+ self.s.value_counts()
+
+
+class series_value_counts_strings(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.K = 1000
+ self.N = 100000
+ self.uniques = tm.makeStringIndex(self.K).values
+ self.s = Series(np.tile(self.uniques, (self.N // self.K)))
+
+ def time_series_value_counts_strings(self):
+ self.s.value_counts()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py
new file mode 100644
index 0000000000000..9e36f735f8608
--- /dev/null
+++ b/asv_bench/benchmarks/hdfstore_bench.py
@@ -0,0 +1,351 @@
+from pandas_vb_common import *
+import os
+
+
+class query_store_table(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = date_range('1/1/2000', periods=25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.append('df12', self.df)
+
+ def time_query_store_table(self):
+ self.store.select('df12', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])])
+
+ def teardown(self):
+ self.store.close()
+
+
+class query_store_table_wide(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = date_range('1/1/2000', periods=25000)
+ self.df = DataFrame(np.random.randn(25000, 100), index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.append('df11', self.df)
+
+ def time_query_store_table_wide(self):
+ self.store.select('df11', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])])
+
+ def teardown(self):
+ self.store.close()
+
+
+class read_store(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = tm.makeStringIndex(25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.put('df1', self.df)
+
+ def time_read_store(self):
+ self.store.get('df1')
+
+ def teardown(self):
+ self.store.close()
+
+
+class read_store_mixed(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = tm.makeStringIndex(25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.put('df3', self.df)
+
+ def time_read_store_mixed(self):
+ self.store.get('df3')
+
+ def teardown(self):
+ self.store.close()
+
+
+class read_store_table(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = tm.makeStringIndex(25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.append('df7', self.df)
+
+ def time_read_store_table(self):
+ self.store.select('df7')
+
+ def teardown(self):
+ self.store.close()
+
+
+class read_store_table_mixed(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 10000
+ self.index = tm.makeStringIndex(self.N)
+ self.df = DataFrame({'float1': randn(self.N), 'float2': randn(self.N), 'string1': (['foo'] * self.N), 'bool1': ([True] * self.N), 'int1': np.random.randint(0, self.N, size=self.N), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.append('df5', self.df)
+
+ def time_read_store_table_mixed(self):
+ self.store.select('df5')
+
+ def teardown(self):
+ self.store.close()
+
+
+class read_store_table_panel(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in xrange(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in xrange(25)])
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.append('p1', self.p)
+
+ def time_read_store_table_panel(self):
+ self.store.select('p1')
+
+ def teardown(self):
+ self.store.close()
+
+
+class read_store_table_wide(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.df = DataFrame(np.random.randn(25000, 100))
+ remove(self.f)
+ self.store = HDFStore(self.f)
+ self.store.append('df9', self.df)
+
+ def time_read_store_table_wide(self):
+ self.store.select('df9')
+
+ def teardown(self):
+ self.store.close()
+
+
+class write_store(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = tm.makeStringIndex(25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+
+ def time_write_store(self):
+ self.store.put('df2', self.df)
+
+ def teardown(self):
+ self.store.close()
+
+
+class write_store_mixed(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = tm.makeStringIndex(25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+
+ def time_write_store_mixed(self):
+ self.store.put('df4', self.df)
+
+ def teardown(self):
+ self.store.close()
+
+
+class write_store_table(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = tm.makeStringIndex(25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+
+ def time_write_store_table(self):
+ self.store.append('df8', self.df)
+
+ def teardown(self):
+ self.store.close()
+
+
+class write_store_table_dc(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.df = DataFrame(np.random.randn(10000, 10), columns=[('C%03d' % i) for i in xrange(10)])
+ remove(self.f)
+ self.store = HDFStore(self.f)
+
+ def time_write_store_table_dc(self):
+ self.store.append('df15', self.df, data_columns=True)
+
+ def teardown(self):
+ self.store.close()
+
+
+class write_store_table_mixed(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.index = tm.makeStringIndex(25000)
+ self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 25000, size=25000), }, index=self.index)
+ remove(self.f)
+ self.store = HDFStore(self.f)
+
+ def time_write_store_table_mixed(self):
+ self.store.append('df6', self.df)
+
+ def teardown(self):
+ self.store.close()
+
+
+class write_store_table_panel(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in xrange(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in xrange(25)])
+ remove(self.f)
+ self.store = HDFStore(self.f)
+
+ def time_write_store_table_panel(self):
+ self.store.append('p2', self.p)
+
+ def teardown(self):
+ self.store.close()
+
+
+class write_store_table_wide(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.h5'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.df = DataFrame(np.random.randn(25000, 100))
+ remove(self.f)
+ self.store = HDFStore(self.f)
+
+ def time_write_store_table_wide(self):
+ self.store.append('df10', self.df)
+
+ def teardown(self):
+ self.store.close()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
new file mode 100644
index 0000000000000..9c181c92195ea
--- /dev/null
+++ b/asv_bench/benchmarks/index_object.py
@@ -0,0 +1,292 @@
+from pandas_vb_common import *
+
+
+class datetime_index_intersection(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=10000, freq='T')
+ self.rng2 = self.rng[:(-1)]
+
+ def time_datetime_index_intersection(self):
+ self.rng.intersection(self.rng2)
+
+
+class datetime_index_repr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.dr = pd.date_range('20000101', freq='D', periods=100000)
+
+ def time_datetime_index_repr(self):
+ self.dr._is_dates_only
+
+
+class datetime_index_union(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=10000, freq='T')
+ self.rng2 = self.rng[:(-1)]
+
+ def time_datetime_index_union(self):
+ self.rng.union(self.rng2)
+
+
+class index_datetime_intersection(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute())
+ if (self.rng.dtype == object):
+ self.rng = self.rng.view(Index)
+ else:
+ self.rng = self.rng.asobject
+ self.rng2 = self.rng[:(-1)]
+
+ def time_index_datetime_intersection(self):
+ self.rng.intersection(self.rng2)
+
+
+class index_datetime_union(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute())
+ if (self.rng.dtype == object):
+ self.rng = self.rng.view(Index)
+ else:
+ self.rng = self.rng.asobject
+ self.rng2 = self.rng[:(-1)]
+
+ def time_index_datetime_union(self):
+ self.rng.union(self.rng2)
+
+
+class index_float64_boolean_indexer(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeFloatIndex(1000000)
+ self.mask = ((np.arange(self.idx.size) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_float64_boolean_indexer(self):
+ self.idx[self.mask]
+
+
+class index_float64_boolean_series_indexer(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeFloatIndex(1000000)
+ self.mask = ((np.arange(self.idx.size) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_float64_boolean_series_indexer(self):
+ self.idx[self.series_mask]
+
+
+class index_float64_construct(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.baseidx = np.arange(1000000.0)
+
+ def time_index_float64_construct(self):
+ Index(self.baseidx)
+
+
+class index_float64_div(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeFloatIndex(1000000)
+ self.mask = ((np.arange(self.idx.size) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_float64_div(self):
+ (self.idx / 2)
+
+
+class index_float64_get(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeFloatIndex(1000000)
+ self.mask = ((np.arange(self.idx.size) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_float64_get(self):
+ self.idx[1]
+
+
+class index_float64_mul(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeFloatIndex(1000000)
+ self.mask = ((np.arange(self.idx.size) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_float64_mul(self):
+ (self.idx * 2)
+
+
+class index_float64_slice_indexer_basic(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeFloatIndex(1000000)
+ self.mask = ((np.arange(self.idx.size) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_float64_slice_indexer_basic(self):
+ self.idx[:(-1)]
+
+
+class index_float64_slice_indexer_even(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeFloatIndex(1000000)
+ self.mask = ((np.arange(self.idx.size) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_float64_slice_indexer_even(self):
+ self.idx[::2]
+
+
+class index_int64_intersection(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.options = np.arange(self.N)
+ self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+ self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+
+ def time_index_int64_intersection(self):
+ self.left.intersection(self.right)
+
+
+class index_int64_union(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ self.options = np.arange(self.N)
+ self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+ self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+
+ def time_index_int64_union(self):
+ self.left.union(self.right)
+
+
+class index_str_boolean_indexer(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeStringIndex(1000000)
+ self.mask = ((np.arange(1000000) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_str_boolean_indexer(self):
+ self.idx[self.mask]
+
+
+class index_str_boolean_series_indexer(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeStringIndex(1000000)
+ self.mask = ((np.arange(1000000) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_str_boolean_series_indexer(self):
+ self.idx[self.series_mask]
+
+
+class index_str_slice_indexer_basic(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeStringIndex(1000000)
+ self.mask = ((np.arange(1000000) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_str_slice_indexer_basic(self):
+ self.idx[:(-1)]
+
+
+class index_str_slice_indexer_even(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.idx = tm.makeStringIndex(1000000)
+ self.mask = ((np.arange(1000000) % 3) == 0)
+ self.series_mask = Series(self.mask)
+
+ def time_index_str_slice_indexer_even(self):
+ self.idx[::2]
+
+
+class multiindex_duplicated(object):
+ goal_time = 0.2
+
+ def setup(self):
+ (n, k) = (200, 5000)
+ self.levels = [np.arange(n), tm.makeStringIndex(n).values, (1000 + np.arange(n))]
+ self.labels = [np.random.choice(n, (k * n)) for lev in self.levels]
+ self.mi = MultiIndex(levels=self.levels, labels=self.labels)
+
+ def time_multiindex_duplicated(self):
+ self.mi.duplicated()
+
+
+class multiindex_from_product(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.iterables = [tm.makeStringIndex(10000), xrange(20)]
+
+ def time_multiindex_from_product(self):
+ MultiIndex.from_product(self.iterables)
+
+
+class multiindex_sortlevel_int64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = ((((3 * 5) * 7) * 11) * (1 << 10))
+ (low, high) = (((-1) << 12), (1 << 12))
+ self.f = (lambda k: np.repeat(np.random.randint(low, high, (self.n // k)), k))
+ self.i = np.random.permutation(self.n)
+ self.mi = MultiIndex.from_arrays([self.f(11), self.f(7), self.f(5), self.f(3), self.f(1)])[self.i]
+
+ def time_multiindex_sortlevel_int64(self):
+ self.mi.sortlevel()
+
+
+class multiindex_with_datetime_level_full(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.level1 = range(1000)
+ self.level2 = date_range(start='1/1/2012', periods=100)
+ self.mi = MultiIndex.from_product([self.level1, self.level2])
+
+ def time_multiindex_with_datetime_level_full(self):
+ self.mi.copy().values
+
+
+class multiindex_with_datetime_level_sliced(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.level1 = range(1000)
+ self.level2 = date_range(start='1/1/2012', periods=100)
+ self.mi = MultiIndex.from_product([self.level1, self.level2])
+
+ def time_multiindex_with_datetime_level_sliced(self):
+ self.mi[:10].values
\ No newline at end of file
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
new file mode 100644
index 0000000000000..e76a87ab881c9
--- /dev/null
+++ b/asv_bench/benchmarks/indexing.py
@@ -0,0 +1,458 @@
+from pandas_vb_common import *
+import pandas.computation.expressions as expr
+
+
+class dataframe_getitem_scalar(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(1000)
+ self.columns = tm.makeStringIndex(30)
+ self.df = DataFrame(np.random.rand(1000, 30), index=self.index, columns=self.columns)
+ self.idx = self.index[100]
+ self.col = self.columns[10]
+
+ def time_dataframe_getitem_scalar(self):
+ self.df[self.col][self.idx]
+
+
+class datamatrix_getitem_scalar(object):
+ goal_time = 0.2
+
+ def setup(self):
+ try:
+ self.klass = DataMatrix
+ except:
+ self.klass = DataFrame
+ self.index = tm.makeStringIndex(1000)
+ self.columns = tm.makeStringIndex(30)
+ self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns)
+ self.idx = self.index[100]
+ self.col = self.columns[10]
+
+ def time_datamatrix_getitem_scalar(self):
+ self.df[self.col][self.idx]
+
+
+class series_get_value(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(1000)
+ self.s = Series(np.random.rand(1000), index=self.index)
+ self.idx = self.index[100]
+
+ def time_series_get_value(self):
+ self.s.get_value(self.idx)
+
+
+class time_series_getitem_scalar(object):
+ goal_time = 0.2
+
+ def setup(self):
+ tm.N = 1000
+ self.ts = tm.makeTimeSeries()
+ self.dt = self.ts.index[500]
+
+ def time_time_series_getitem_scalar(self):
+ self.ts[self.dt]
+
+
+class frame_iloc_big(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(dict(A=(['foo'] * 1000000)))
+
+ def time_frame_iloc_big(self):
+ self.df.iloc[:100, 0]
+
+
+class frame_iloc_dups(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), })
+ self.idx = (np.array(range(30)) * 99)
+ self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), })
+ self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)])
+
+ def time_frame_iloc_dups(self):
+ self.df2.iloc[self.idx]
+
+
+class frame_loc_dups(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), })
+ self.idx = (np.array(range(30)) * 99)
+ self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), })
+ self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)])
+
+ def time_frame_loc_dups(self):
+ self.df2.loc[self.idx]
+
+
+class frame_xs_mi_ix(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)])
+ self.s = Series(np.random.randn(1000000), index=self.mi)
+ self.df = DataFrame(self.s)
+
+ def time_frame_xs_mi_ix(self):
+ self.df.ix[999]
+
+
+class indexing_dataframe_boolean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(50000, 100))
+ self.df2 = DataFrame(np.random.randn(50000, 100))
+
+ def time_indexing_dataframe_boolean(self):
+ (self.df > self.df2)
+
+
+class indexing_dataframe_boolean_no_ne(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(50000, 100))
+ self.df2 = DataFrame(np.random.randn(50000, 100))
+ expr.set_use_numexpr(False)
+
+ def time_indexing_dataframe_boolean_no_ne(self):
+ (self.df > self.df2)
+
+ def teardown(self):
+ expr.set_use_numexpr(True)
+
+
+class indexing_dataframe_boolean_rows(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
+ self.indexer = (self.df['B'] > 0)
+ self.obj_indexer = self.indexer.astype('O')
+
+ def time_indexing_dataframe_boolean_rows(self):
+ self.df[self.indexer]
+
+
+class indexing_dataframe_boolean_rows_object(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
+ self.indexer = (self.df['B'] > 0)
+ self.obj_indexer = self.indexer.astype('O')
+
+ def time_indexing_dataframe_boolean_rows_object(self):
+ self.df[self.obj_indexer]
+
+
+class indexing_dataframe_boolean_st(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(50000, 100))
+ self.df2 = DataFrame(np.random.randn(50000, 100))
+ expr.set_numexpr_threads(1)
+
+ def time_indexing_dataframe_boolean_st(self):
+ (self.df > self.df2)
+
+ def teardown(self):
+ expr.set_numexpr_threads()
+
+
+class indexing_frame_get_value(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(1000)
+ self.columns = tm.makeStringIndex(30)
+ self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns)
+ self.idx = self.index[100]
+ self.col = self.columns[10]
+
+ def time_indexing_frame_get_value(self):
+ self.df.get_value(self.idx, self.col)
+
+
+class indexing_frame_get_value_ix(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(1000)
+ self.columns = tm.makeStringIndex(30)
+ self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns)
+ self.idx = self.index[100]
+ self.col = self.columns[10]
+
+ def time_indexing_frame_get_value_ix(self):
+ self.df.ix[(self.idx, self.col)]
+
+
+class indexing_panel_subset(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.p = Panel(np.random.randn(100, 100, 100))
+ self.inds = range(0, 100, 10)
+
+ def time_indexing_panel_subset(self):
+ self.p.ix[(self.inds, self.inds, self.inds)]
+
+
+class multiindex_slicers(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(1234)
+ self.idx = pd.IndexSlice
+ self.n = 100000
+ self.mdt = pandas.DataFrame()
+ self.mdt['A'] = np.random.choice(range(10000, 45000, 1000), self.n)
+ self.mdt['B'] = np.random.choice(range(10, 400), self.n)
+ self.mdt['C'] = np.random.choice(range(1, 150), self.n)
+ self.mdt['D'] = np.random.choice(range(10000, 45000), self.n)
+ self.mdt['x'] = np.random.choice(range(400), self.n)
+ self.mdt['y'] = np.random.choice(range(25), self.n)
+ self.test_A = 25000
+ self.test_B = 25
+ self.test_C = 40
+ self.test_D = 35000
+ self.eps_A = 5000
+ self.eps_B = 5
+ self.eps_C = 5
+ self.eps_D = 5000
+ self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
+
+ def time_multiindex_slicers(self):
+ self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]
+
+
+class series_getitem_array(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_getitem_array(self):
+ self.s[np.arange(10000)]
+
+
+class series_getitem_label_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(1000000)
+ self.s = Series(np.random.rand(1000000), index=self.index)
+ self.lbl = self.s.index[800000]
+
+ def time_series_getitem_label_slice(self):
+ self.s[:self.lbl]
+
+
+class series_getitem_list_like(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_getitem_list_like(self):
+ self.s[[800000]]
+
+
+class series_getitem_pos_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(1000000)
+ self.s = Series(np.random.rand(1000000), index=self.index)
+
+ def time_series_getitem_pos_slice(self):
+ self.s[:800000]
+
+
+class series_getitem_scalar(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_getitem_scalar(self):
+ self.s[800000]
+
+
+class series_getitem_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_getitem_slice(self):
+ self.s[:800000]
+
+
+class series_iloc_array(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_iloc_array(self):
+ self.s.iloc[np.arange(10000)]
+
+
+class series_iloc_list_like(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_iloc_list_like(self):
+ self.s.iloc[[800000]]
+
+
+class series_iloc_scalar(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_iloc_scalar(self):
+ self.s.iloc[800000]
+
+
+class series_iloc_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_iloc_slice(self):
+ self.s.iloc[:800000]
+
+
+class series_ix_array(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_ix_array(self):
+ self.s.ix[np.arange(10000)]
+
+
+class series_ix_list_like(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_ix_list_like(self):
+ self.s.ix[[800000]]
+
+
+class series_ix_scalar(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_ix_scalar(self):
+ self.s.ix[800000]
+
+
+class series_ix_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_ix_slice(self):
+ self.s.ix[:800000]
+
+
+class series_loc_array(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_loc_array(self):
+ self.s.loc[np.arange(10000)]
+
+
+class series_loc_list_like(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_loc_list_like(self):
+ self.s.loc[[800000]]
+
+
+class series_loc_scalar(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_loc_scalar(self):
+ self.s.loc[800000]
+
+
+class series_loc_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.rand(1000000))
+
+ def time_series_loc_slice(self):
+ self.s.loc[:800000]
+
+
+class series_xs_mi_ix(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)])
+ self.s = Series(np.random.randn(1000000), index=self.mi)
+
+ def time_series_xs_mi_ix(self):
+ self.s.ix[999]
+
+
+class sort_level_one(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.a = np.repeat(np.arange(100), 1000)
+ self.b = np.tile(np.arange(1000), 100)
+ self.midx = MultiIndex.from_arrays([self.a, self.b])
+ self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
+
+ def time_sort_level_one(self):
+ self.midx.sortlevel(1)
+
+
+class sort_level_zero(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.a = np.repeat(np.arange(100), 1000)
+ self.b = np.tile(np.arange(1000), 100)
+ self.midx = MultiIndex.from_arrays([self.a, self.b])
+ self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
+
+ def time_sort_level_zero(self):
+ self.midx.sortlevel(0)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
new file mode 100644
index 0000000000000..2addc810a218f
--- /dev/null
+++ b/asv_bench/benchmarks/inference.py
@@ -0,0 +1,138 @@
+from pandas_vb_common import *
+import pandas as pd
+
+
+class dtype_infer_datetime64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_datetime64(self):
+ (self.df_datetime64['A'] - self.df_datetime64['B'])
+
+
+class dtype_infer_float32(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_float32(self):
+ (self.df_float32['A'] + self.df_float32['B'])
+
+
+class dtype_infer_float64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_float64(self):
+ (self.df_float64['A'] + self.df_float64['B'])
+
+
+class dtype_infer_int32(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_int32(self):
+ (self.df_int32['A'] + self.df_int32['B'])
+
+
+class dtype_infer_int64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_int64(self):
+ (self.df_int64['A'] + self.df_int64['B'])
+
+
+class dtype_infer_timedelta64_1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_timedelta64_1(self):
+ (self.df_timedelta64['A'] + self.df_timedelta64['B'])
+
+
+class dtype_infer_timedelta64_2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_timedelta64_2(self):
+ (self.df_timedelta64['A'] + self.df_timedelta64['A'])
+
+
+class dtype_infer_uint32(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 500000
+ self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+ self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+ self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+ self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+ self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+ self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+ self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+ def time_dtype_infer_uint32(self):
+ (self.df_uint32['A'] + self.df_uint32['B'])
\ No newline at end of file
diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py
new file mode 100644
index 0000000000000..9eee932de8b7c
--- /dev/null
+++ b/asv_bench/benchmarks/io_bench.py
@@ -0,0 +1,135 @@
+from pandas_vb_common import *
+from pandas import concat, Timestamp
+from StringIO import StringIO
+
+
+class frame_to_csv(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(3000, 30))
+
+ def time_frame_to_csv(self):
+ self.df.to_csv('__test__.csv')
+
+
+class frame_to_csv2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame({'A': range(50000), })
+ self.df['B'] = (self.df.A + 1.0)
+ self.df['C'] = (self.df.A + 2.0)
+ self.df['D'] = (self.df.A + 3.0)
+
+ def time_frame_to_csv2(self):
+ self.df.to_csv('__test__.csv')
+
+
+class frame_to_csv_date_formatting(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=1000)
+ self.data = DataFrame(self.rng, index=self.rng)
+
+ def time_frame_to_csv_date_formatting(self):
+ self.data.to_csv('__test__.csv', date_format='%Y%m%d')
+
+
+class frame_to_csv_mixed(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def create_cols(name):
+ return [('%s%03d' % (name, i)) for i in xrange(5)]
+ self.df_float = DataFrame(np.random.randn(5000, 5), dtype='float64', columns=create_cols('float'))
+ self.df_int = DataFrame(np.random.randn(5000, 5), dtype='int64', columns=create_cols('int'))
+ self.df_bool = DataFrame(True, index=self.df_float.index, columns=create_cols('bool'))
+ self.df_object = DataFrame('foo', index=self.df_float.index, columns=create_cols('object'))
+ self.df_dt = DataFrame(Timestamp('20010101'), index=self.df_float.index, columns=create_cols('date'))
+ self.df_float.ix[30:500, 1:3] = np.nan
+ self.df = concat([self.df_float, self.df_int, self.df_bool, self.df_object, self.df_dt], axis=1)
+
+ def time_frame_to_csv_mixed(self):
+ self.df.to_csv('__test__.csv')
+
+
+class read_csv_infer_datetime_format_custom(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=1000)
+ self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%m/%d/%Y %H:%M:%S.%f'))))
+
+ def time_read_csv_infer_datetime_format_custom(self):
+ read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
+
+
+class read_csv_infer_datetime_format_iso8601(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=1000)
+ self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))))
+
+ def time_read_csv_infer_datetime_format_iso8601(self):
+ read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
+
+
+class read_csv_infer_datetime_format_ymd(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=1000)
+ self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y%m%d'))))
+
+ def time_read_csv_infer_datetime_format_ymd(self):
+ read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
+
+
+class read_csv_skiprows(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(20000)
+ self.df = DataFrame({'float1': randn(20000), 'float2': randn(20000), 'string1': (['foo'] * 20000), 'bool1': ([True] * 20000), 'int1': np.random.randint(0, 200000, size=20000), }, index=self.index)
+ self.df.to_csv('__test__.csv')
+
+ def time_read_csv_skiprows(self):
+ read_csv('__test__.csv', skiprows=10000)
+
+
+class read_csv_standard(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(10000)
+ self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+ self.df.to_csv('__test__.csv')
+
+ def time_read_csv_standard(self):
+ read_csv('__test__.csv')
+
+
+class read_parse_dates_iso8601(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=1000)
+ self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))))
+
+ def time_read_parse_dates_iso8601(self):
+ read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'])
+
+
+class write_csv_standard(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = tm.makeStringIndex(10000)
+ self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+
+ def time_write_csv_standard(self):
+ self.df.to_csv('__test__.csv')
\ No newline at end of file
diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py
new file mode 100644
index 0000000000000..e75e691b61c96
--- /dev/null
+++ b/asv_bench/benchmarks/io_sql.py
@@ -0,0 +1,215 @@
+from pandas_vb_common import *
+from sqlalchemy import create_engine
+import sqlite3
+import sqlalchemy
+
+
+class sql_datetime_read_and_parse_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df['datetime_string'] = self.df['datetime'].map(str)
+ self.df.to_sql('test_type', self.engine, if_exists='replace')
+ self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+ def time_sql_datetime_read_and_parse_sqlalchemy(self):
+ read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string'])
+
+
+class sql_datetime_read_as_native_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df['datetime_string'] = self.df['datetime'].map(str)
+ self.df.to_sql('test_type', self.engine, if_exists='replace')
+ self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+ def time_sql_datetime_read_as_native_sqlalchemy(self):
+ read_sql_table('test_type', self.engine, columns=['datetime'])
+
+
+class sql_datetime_write_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df.loc[1000:3000, 'float'] = np.nan
+
+ def time_sql_datetime_write_sqlalchemy(self):
+ self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace')
+
+
+class sql_float_read_query_fallback(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df['datetime_string'] = self.df['datetime'].map(str)
+ self.df.to_sql('test_type', self.engine, if_exists='replace')
+ self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+ def time_sql_float_read_query_fallback(self):
+ read_sql_query('SELECT float FROM test_type', self.con)
+
+
+class sql_float_read_query_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df['datetime_string'] = self.df['datetime'].map(str)
+ self.df.to_sql('test_type', self.engine, if_exists='replace')
+ self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+ def time_sql_float_read_query_sqlalchemy(self):
+ read_sql_query('SELECT float FROM test_type', self.engine)
+
+
+class sql_float_read_table_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df['datetime_string'] = self.df['datetime'].map(str)
+ self.df.to_sql('test_type', self.engine, if_exists='replace')
+ self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+ def time_sql_float_read_table_sqlalchemy(self):
+ read_sql_table('test_type', self.engine, columns=['float'])
+
+
+class sql_float_write_fallback(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df.loc[1000:3000, 'float'] = np.nan
+
+ def time_sql_float_write_fallback(self):
+ self.df[['float']].to_sql('test_float', self.con, if_exists='replace')
+
+
+class sql_float_write_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df.loc[1000:3000, 'float'] = np.nan
+
+ def time_sql_float_write_sqlalchemy(self):
+ self.df[['float']].to_sql('test_float', self.engine, if_exists='replace')
+
+
+class sql_read_query_fallback(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.index = tm.makeStringIndex(10000)
+ self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+ self.df.to_sql('test2', self.engine, if_exists='replace')
+ self.df.to_sql('test2', self.con, if_exists='replace')
+
+ def time_sql_read_query_fallback(self):
+ read_sql_query('SELECT * FROM test2', self.con)
+
+
+class sql_read_query_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.index = tm.makeStringIndex(10000)
+ self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+ self.df.to_sql('test2', self.engine, if_exists='replace')
+ self.df.to_sql('test2', self.con, if_exists='replace')
+
+ def time_sql_read_query_sqlalchemy(self):
+ read_sql_query('SELECT * FROM test2', self.engine)
+
+
+class sql_read_table_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.index = tm.makeStringIndex(10000)
+ self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+ self.df.to_sql('test2', self.engine, if_exists='replace')
+ self.df.to_sql('test2', self.con, if_exists='replace')
+
+ def time_sql_read_table_sqlalchemy(self):
+ read_sql_table('test2', self.engine)
+
+
+class sql_string_write_fallback(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df.loc[1000:3000, 'float'] = np.nan
+
+ def time_sql_string_write_fallback(self):
+ self.df[['string']].to_sql('test_string', self.con, if_exists='replace')
+
+
+class sql_string_write_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+ self.df.loc[1000:3000, 'float'] = np.nan
+
+ def time_sql_string_write_sqlalchemy(self):
+ self.df[['string']].to_sql('test_string', self.engine, if_exists='replace')
+
+
+class sql_write_fallback(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.index = tm.makeStringIndex(10000)
+ self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+
+ def time_sql_write_fallback(self):
+ self.df.to_sql('test1', self.con, if_exists='replace')
+
+
+class sql_write_sqlalchemy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.engine = create_engine('sqlite:///:memory:')
+ self.con = sqlite3.connect(':memory:')
+ self.index = tm.makeStringIndex(10000)
+ self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+
+ def time_sql_write_sqlalchemy(self):
+ self.df.to_sql('test1', self.engine, if_exists='replace')
\ No newline at end of file
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
new file mode 100644
index 0000000000000..08ae439e8fd5d
--- /dev/null
+++ b/asv_bench/benchmarks/join_merge.py
@@ -0,0 +1,359 @@
+from pandas_vb_common import *
+
+
+class append_frame_single_homogenous(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
+ self.df2 = self.df1.copy()
+ self.df2.index = np.arange(10000, 20000)
+ self.mdf1 = self.df1.copy()
+ self.mdf1['obj1'] = 'bar'
+ self.mdf1['obj2'] = 'bar'
+ self.mdf1['int1'] = 5
+ try:
+ self.mdf1.consolidate(inplace=True)
+ except:
+ pass
+ self.mdf2 = self.mdf1.copy()
+ self.mdf2.index = self.df2.index
+
+ def time_append_frame_single_homogenous(self):
+ self.df1.append(self.df2)
+
+
+class append_frame_single_mixed(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
+ self.df2 = self.df1.copy()
+ self.df2.index = np.arange(10000, 20000)
+ self.mdf1 = self.df1.copy()
+ self.mdf1['obj1'] = 'bar'
+ self.mdf1['obj2'] = 'bar'
+ self.mdf1['int1'] = 5
+ try:
+ self.mdf1.consolidate(inplace=True)
+ except:
+ pass
+ self.mdf2 = self.mdf1.copy()
+ self.mdf2.index = self.df2.index
+
+ def time_append_frame_single_mixed(self):
+ self.mdf1.append(self.mdf2)
+
+
+class concat_empty_frames1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s'))
+ self.empty = pd.DataFrame()
+
+ def time_concat_empty_frames1(self):
+ concat([self.df, self.empty])
+
+
+class concat_empty_frames2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s'))
+ self.empty = pd.DataFrame()
+
+ def time_concat_empty_frames2(self):
+ concat([self.empty, self.df])
+
+
+class concat_series_axis1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 1000
+ self.indices = tm.makeStringIndex(1000)
+ self.s = Series(self.n, index=self.indices)
+ self.pieces = [self.s[i:(- i)] for i in range(1, 10)]
+ self.pieces = (self.pieces * 50)
+
+ def time_concat_series_axis1(self):
+ concat(self.pieces, axis=1)
+
+
+class concat_small_frames(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pd.DataFrame(randn(5, 4))
+
+ def time_concat_small_frames(self):
+ concat(([self.df] * 1000))
+
+
+class i8merge(object):
+ goal_time = 0.2
+
+ def setup(self):
+ (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20))
+ self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG'))
+ self.left['left'] = self.left.sum(axis=1)
+ self.i = np.random.permutation(len(self.left))
+ self.right = self.left.iloc[self.i].copy()
+ self.right.columns = (self.right.columns[:(-1)].tolist() + ['right'])
+ self.right.index = np.arange(len(self.right))
+ self.right['right'] *= (-1)
+
+ def time_i8merge(self):
+ merge(self.left, self.right, how='outer')
+
+
+class join_dataframe_index_multi(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.level1 = tm.makeStringIndex(10).values
+ self.level2 = tm.makeStringIndex(1000).values
+ self.label1 = np.arange(10).repeat(1000)
+ self.label2 = np.tile(np.arange(1000), 10)
+ self.key1 = np.tile(self.level1.take(self.label1), 10)
+ self.key2 = np.tile(self.level2.take(self.label2), 10)
+ self.shuf = np.arange(100000)
+ random.shuffle(self.shuf)
+ try:
+ self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2])
+ self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
+ except:
+ pass
+ try:
+ self.DataFrame = DataMatrix
+ except:
+ pass
+ self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
+ self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
+ self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
+ self.df_shuf = self.df.reindex(self.df.index[self.shuf])
+
+ def time_join_dataframe_index_multi(self):
+ self.df.join(self.df_multi, on=['key1', 'key2'])
+
+
+class join_dataframe_index_single_key_bigger(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.level1 = tm.makeStringIndex(10).values
+ self.level2 = tm.makeStringIndex(1000).values
+ self.label1 = np.arange(10).repeat(1000)
+ self.label2 = np.tile(np.arange(1000), 10)
+ self.key1 = np.tile(self.level1.take(self.label1), 10)
+ self.key2 = np.tile(self.level2.take(self.label2), 10)
+ self.shuf = np.arange(100000)
+ random.shuffle(self.shuf)
+ try:
+ self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2])
+ self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
+ except:
+ pass
+ try:
+ self.DataFrame = DataMatrix
+ except:
+ pass
+ self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
+ self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
+ self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
+ self.df_shuf = self.df.reindex(self.df.index[self.shuf])
+
+ def time_join_dataframe_index_single_key_bigger(self):
+ self.df.join(self.df_key2, on='key2')
+
+
+class join_dataframe_index_single_key_bigger_sort(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.level1 = tm.makeStringIndex(10).values
+ self.level2 = tm.makeStringIndex(1000).values
+ self.label1 = np.arange(10).repeat(1000)
+ self.label2 = np.tile(np.arange(1000), 10)
+ self.key1 = np.tile(self.level1.take(self.label1), 10)
+ self.key2 = np.tile(self.level2.take(self.label2), 10)
+ self.shuf = np.arange(100000)
+ random.shuffle(self.shuf)
+ try:
+ self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2])
+ self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
+ except:
+ pass
+ try:
+ self.DataFrame = DataMatrix
+ except:
+ pass
+ self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
+ self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
+ self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
+ self.df_shuf = self.df.reindex(self.df.index[self.shuf])
+
+ def time_join_dataframe_index_single_key_bigger_sort(self):
+ self.df_shuf.join(self.df_key2, on='key2', sort=True)
+
+
+class join_dataframe_index_single_key_small(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.level1 = tm.makeStringIndex(10).values
+ self.level2 = tm.makeStringIndex(1000).values
+ self.label1 = np.arange(10).repeat(1000)
+ self.label2 = np.tile(np.arange(1000), 10)
+ self.key1 = np.tile(self.level1.take(self.label1), 10)
+ self.key2 = np.tile(self.level2.take(self.label2), 10)
+ self.shuf = np.arange(100000)
+ random.shuffle(self.shuf)
+ try:
+ self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2])
+ self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
+ except:
+ pass
+ try:
+ self.DataFrame = DataMatrix
+ except:
+ pass
+ self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
+ self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
+ self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
+ self.df_shuf = self.df.reindex(self.df.index[self.shuf])
+
+ def time_join_dataframe_index_single_key_small(self):
+ self.df.join(self.df_key1, on='key1')
+
+
+class join_dataframe_integer_2key(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), })
+ self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), })
+ self.df3 = self.df[:5000]
+
+ def time_join_dataframe_integer_2key(self):
+ merge(self.df, self.df3)
+
+
+class join_dataframe_integer_key(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), })
+ self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), })
+ self.df3 = self.df[:5000]
+
+ def time_join_dataframe_integer_key(self):
+ merge(self.df, self.df2, on='key1')
+
+
+class join_non_unique_equal(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T')
+ self.daily_dates = self.date_index.to_period('D').to_timestamp('S', 'S')
+ self.fracofday = (self.date_index.view(np.ndarray) - self.daily_dates.view(np.ndarray))
+ self.fracofday = (self.fracofday.astype('timedelta64[ns]').astype(np.float64) / 86400000000000.0)
+ self.fracofday = TimeSeries(self.fracofday, self.daily_dates)
+ self.index = date_range(self.date_index.min().to_period('A').to_timestamp('D', 'S'), self.date_index.max().to_period('A').to_timestamp('D', 'E'), freq='D')
+ self.temp = TimeSeries(1.0, self.index)
+
+ def time_join_non_unique_equal(self):
+ (self.fracofday * self.temp[self.fracofday.index])
+
+
+class left_outer_join_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ np.random.seed(2718281)
+ self.n = 50000
+ self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe'])
+ self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie')
+
+ def time_left_outer_join_index(self):
+ self.left.join(self.right, on='jim')
+
+
+class merge_2intkey_nosort(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.indices = tm.makeStringIndex(self.N).values
+ self.indices2 = tm.makeStringIndex(self.N).values
+ self.key = np.tile(self.indices[:8000], 10)
+ self.key2 = np.tile(self.indices2[:8000], 10)
+ self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), })
+ self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), })
+
+ def time_merge_2intkey_nosort(self):
+ merge(self.left, self.right, sort=False)
+
+
+class merge_2intkey_sort(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.indices = tm.makeStringIndex(self.N).values
+ self.indices2 = tm.makeStringIndex(self.N).values
+ self.key = np.tile(self.indices[:8000], 10)
+ self.key2 = np.tile(self.indices2[:8000], 10)
+ self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), })
+ self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), })
+
+ def time_merge_2intkey_sort(self):
+ merge(self.left, self.right, sort=True)
+
+
+class series_align_int64_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 1000000
+
+ def sample(values, k):
+ self.sampler = np.random.permutation(len(values))
+ return values.take(self.sampler[:k])
+ self.sz = 500000
+ self.rng = np.arange(0, 10000000000000, 10000000)
+ self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng)
+ self.idx1 = np.sort(sample(self.stamps, self.sz))
+ self.idx2 = np.sort(sample(self.stamps, self.sz))
+ self.ts1 = Series(np.random.randn(self.sz), self.idx1)
+ self.ts2 = Series(np.random.randn(self.sz), self.idx2)
+
+ def time_series_align_int64_index(self):
+ (self.ts1 + self.ts2)
+
+
+class series_align_left_monotonic(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 1000000
+
+ def sample(values, k):
+ self.sampler = np.random.permutation(len(values))
+ return values.take(self.sampler[:k])
+ self.sz = 500000
+ self.rng = np.arange(0, 10000000000000, 10000000)
+ self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng)
+ self.idx1 = np.sort(sample(self.stamps, self.sz))
+ self.idx2 = np.sort(sample(self.stamps, self.sz))
+ self.ts1 = Series(np.random.randn(self.sz), self.idx1)
+ self.ts2 = Series(np.random.randn(self.sz), self.idx2)
+
+ def time_series_align_left_monotonic(self):
+ self.ts1.align(self.ts2, join='left')
\ No newline at end of file
diff --git a/asv_bench/benchmarks/miscellaneous.py b/asv_bench/benchmarks/miscellaneous.py
new file mode 100644
index 0000000000000..b9c02c85fb096
--- /dev/null
+++ b/asv_bench/benchmarks/miscellaneous.py
@@ -0,0 +1,30 @@
+from pandas_vb_common import *
+from pandas.util.decorators import cache_readonly
+
+
+class match_strings(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.uniques = tm.makeStringIndex(1000).values
+ self.all = self.uniques.repeat(10)
+
+ def time_match_strings(self):
+ match(self.all, self.uniques)
+
+
+class misc_cache_readonly(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+
+ class Foo:
+
+ @cache_readonly
+ def prop(self):
+ return 5
+ self.obj = Foo()
+
+ def time_misc_cache_readonly(self):
+ self.obj.prop
\ No newline at end of file
diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py
new file mode 100644
index 0000000000000..81fa7c2238d16
--- /dev/null
+++ b/asv_bench/benchmarks/packers.py
@@ -0,0 +1,857 @@
+from numpy.random import randint
+import pandas as pd
+from collections import OrderedDict
+from pandas.compat import BytesIO
+import sqlite3
+from pandas_vb_common import *
+import os
+from sqlalchemy import create_engine
+import numpy as np
+from random import randrange
+from pandas.core import common as com
+
+
+class packers_read_csv(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df.to_csv(self.f)
+
+ def time_packers_read_csv(self):
+ pd.read_csv(self.f)
+
+
+class packers_read_excel(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.bio = BytesIO()
+ self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
+ self.df[:2000].to_excel(self.writer)
+ self.writer.save()
+
+ def time_packers_read_excel(self):
+ self.bio.seek(0)
+ pd.read_excel(self.bio)
+
+
+class packers_read_hdf_store(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df2.to_hdf(self.f, 'df')
+
+ def time_packers_read_hdf_store(self):
+ pd.read_hdf(self.f, 'df')
+
+
+class packers_read_hdf_table(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df2.to_hdf(self.f, 'df', format='table')
+
+ def time_packers_read_hdf_table(self):
+ pd.read_hdf(self.f, 'df')
+
+
+class packers_read_json(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df.to_json(self.f, orient='split')
+ self.df.index = np.arange(self.N)
+
+ def time_packers_read_json(self):
+ pd.read_json(self.f, orient='split')
+
+
+class packers_read_json_date_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df.to_json(self.f, orient='split')
+
+ def time_packers_read_json_date_index(self):
+ pd.read_json(self.f, orient='split')
+
+
+class packers_read_pack(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df2.to_msgpack(self.f)
+
+ def time_packers_read_pack(self):
+ pd.read_msgpack(self.f)
+
+
+class packers_read_pickle(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df2.to_pickle(self.f)
+
+ def time_packers_read_pickle(self):
+ pd.read_pickle(self.f)
+
+
+class packers_read_sql(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.engine = create_engine('sqlite:///:memory:')
+ self.df2.to_sql('table', self.engine, if_exists='replace')
+
+ def time_packers_read_sql(self):
+ pd.read_sql_table('table', self.engine)
+
+
+class packers_read_stata(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df.to_stata(self.f, {'index': 'tc', })
+
+ def time_packers_read_stata(self):
+ pd.read_stata(self.f)
+
+
+class packers_read_stata_with_validation(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)]
+ self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)]
+ self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)]
+ self.df['float32_'] = np.array(randn(self.N), dtype=np.float32)
+ self.df.to_stata(self.f, {'index': 'tc', })
+
+ def time_packers_read_stata_with_validation(self):
+ pd.read_stata(self.f)
+
+
+class packers_write_csv(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+
+ def time_packers_write_csv(self):
+ self.df.to_csv(self.f)
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_excel_openpyxl(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.bio = BytesIO()
+
+ def time_packers_write_excel_openpyxl(self):
+ self.bio.seek(0)
+ self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl')
+ self.df[:2000].to_excel(self.writer)
+ self.writer.save()
+
+
+class packers_write_excel_xlsxwriter(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.bio = BytesIO()
+
+ def time_packers_write_excel_xlsxwriter(self):
+ self.bio.seek(0)
+ self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
+ self.df[:2000].to_excel(self.writer)
+ self.writer.save()
+
+
+class packers_write_excel_xlwt(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.bio = BytesIO()
+
+ def time_packers_write_excel_xlwt(self):
+ self.bio.seek(0)
+ self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt')
+ self.df[:2000].to_excel(self.writer)
+ self.writer.save()
+
+
+class packers_write_hdf_store(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+
+ def time_packers_write_hdf_store(self):
+ self.df2.to_hdf(self.f, 'df')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_hdf_table(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+
+ def time_packers_write_hdf_table(self):
+ self.df2.to_hdf(self.f, 'df', table=True)
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_json(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df.index = np.arange(self.N)
+
+ def time_packers_write_json(self):
+ self.df.to_json(self.f, orient='split')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_json_T(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df.index = np.arange(self.N)
+
+ def time_packers_write_json_T(self):
+ self.df.to_json(self.f, orient='columns')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_json_date_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+
+ def time_packers_write_json_date_index(self):
+ self.df.to_json(self.f, orient='split')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_json_mixed_delta_int_tstamp(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.cols = [(lambda i: ('{0}_timedelta'.format(i), [pd.Timedelta(('%d seconds' % randrange(1000000.0))) for _ in range(self.N)])), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_timestamp'.format(i), [pd.Timestamp((1418842918083256000 + randrange(1000000000.0, 1e+18, 200))) for _ in range(self.N)]))]
+ self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index)
+
+ def time_packers_write_json_mixed_delta_int_tstamp(self):
+ self.df_mixed.to_json(self.f, orient='split')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_json_mixed_float_int(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))]
+ self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index)
+
+ def time_packers_write_json_mixed_float_int(self):
+ self.df_mixed.to_json(self.f, orient='index')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_json_mixed_float_int_T(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))]
+ self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index)
+
+ def time_packers_write_json_mixed_float_int_T(self):
+ self.df_mixed.to_json(self.f, orient='columns')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_json_mixed_float_int_str(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))]
+ self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index)
+
+ def time_packers_write_json_mixed_float_int_str(self):
+ self.df_mixed.to_json(self.f, orient='split')
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_pack(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+
+ def time_packers_write_pack(self):
+ self.df2.to_msgpack(self.f)
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_pickle(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+
+ def time_packers_write_pickle(self):
+ self.df2.to_pickle(self.f)
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_sql(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.engine = create_engine('sqlite:///:memory:')
+
+ def time_packers_write_sql(self):
+ self.df2.to_sql('table', self.engine, if_exists='replace')
+
+
+class packers_write_stata(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df.to_stata(self.f, {'index': 'tc', })
+
+ def time_packers_write_stata(self):
+ self.df.to_stata(self.f, {'index': 'tc', })
+
+ def teardown(self):
+ remove(self.f)
+
+
+class packers_write_stata_with_validation(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.f = '__test__.msg'
+
+ def remove(f):
+ try:
+ os.remove(self.f)
+ except:
+ pass
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.N = 100000
+ self.C = 5
+ self.index = date_range('20000101', periods=self.N, freq='H')
+ self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+ self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+ remove(self.f)
+ self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)]
+ self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)]
+ self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)]
+ self.df['float32_'] = np.array(randn(self.N), dtype=np.float32)
+ self.df.to_stata(self.f, {'index': 'tc', })
+
+ def time_packers_write_stata_with_validation(self):
+ self.df.to_stata(self.f, {'index': 'tc', })
+
+ def teardown(self):
+ remove(self.f)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
new file mode 120000
index 0000000000000..6e2e449a4c00a
--- /dev/null
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -0,0 +1 @@
+../../vb_suite/pandas_vb_common.py
\ No newline at end of file
diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py
new file mode 100644
index 0000000000000..c755cb122a0bf
--- /dev/null
+++ b/asv_bench/benchmarks/panel_ctor.py
@@ -0,0 +1,64 @@
+from pandas_vb_common import *
+
+
+class panel_from_dict_all_different_indexes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data_frames = {}
+ self.start = datetime(1990, 1, 1)
+ self.end = datetime(2012, 1, 1)
+ for x in xrange(100):
+ self.end += timedelta(days=1)
+ self.dr = np.asarray(date_range(self.start, self.end))
+ self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr)
+ self.data_frames[x] = self.df
+
+ def time_panel_from_dict_all_different_indexes(self):
+ Panel.from_dict(self.data_frames)
+
+
+class panel_from_dict_equiv_indexes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data_frames = {}
+ for x in xrange(100):
+ self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1)))
+ self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr)
+ self.data_frames[x] = self.df
+
+ def time_panel_from_dict_equiv_indexes(self):
+ Panel.from_dict(self.data_frames)
+
+
+class panel_from_dict_same_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1)))
+ self.data_frames = {}
+ for x in xrange(100):
+ self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr)
+ self.data_frames[x] = self.df
+
+ def time_panel_from_dict_same_index(self):
+ Panel.from_dict(self.data_frames)
+
+
+class panel_from_dict_two_different_indexes(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data_frames = {}
+ self.start = datetime(1990, 1, 1)
+ self.end = datetime(2012, 1, 1)
+ for x in xrange(100):
+ if (x == 50):
+ self.end += timedelta(days=1)
+ self.dr = np.asarray(date_range(self.start, self.end))
+ self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr)
+ self.data_frames[x] = self.df
+
+ def time_panel_from_dict_two_different_indexes(self):
+ Panel.from_dict(self.data_frames)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py
new file mode 100644
index 0000000000000..4145b68dca997
--- /dev/null
+++ b/asv_bench/benchmarks/panel_methods.py
@@ -0,0 +1,56 @@
+from pandas_vb_common import *
+
+
+class panel_pct_change_items(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = date_range(start='2000', freq='D', periods=1000)
+ self.panel = Panel(np.random.randn(100, len(self.index), 1000))
+
+ def time_panel_pct_change_items(self):
+ self.panel.pct_change(1, axis='items')
+
+
+class panel_pct_change_major(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = date_range(start='2000', freq='D', periods=1000)
+ self.panel = Panel(np.random.randn(100, len(self.index), 1000))
+
+ def time_panel_pct_change_major(self):
+ self.panel.pct_change(1, axis='major')
+
+
+class panel_pct_change_minor(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = date_range(start='2000', freq='D', periods=1000)
+ self.panel = Panel(np.random.randn(100, len(self.index), 1000))
+
+ def time_panel_pct_change_minor(self):
+ self.panel.pct_change(1, axis='minor')
+
+
+class panel_shift(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = date_range(start='2000', freq='D', periods=1000)
+ self.panel = Panel(np.random.randn(100, len(self.index), 1000))
+
+ def time_panel_shift(self):
+ self.panel.shift(1)
+
+
+class panel_shift_minor(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = date_range(start='2000', freq='D', periods=1000)
+ self.panel = Panel(np.random.randn(100, len(self.index), 1000))
+
+ def time_panel_shift_minor(self):
+ self.panel.shift(1, axis='minor')
\ No newline at end of file
diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
new file mode 100644
index 0000000000000..46167dc2bb33c
--- /dev/null
+++ b/asv_bench/benchmarks/parser_vb.py
@@ -0,0 +1,109 @@
+from cStringIO import StringIO
+from pandas_vb_common import *
+import os
+from pandas import read_csv, read_table
+
+
+class read_csv_comment2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = ['A,B,C']
+ self.data = (self.data + (['1,2,3 # comment'] * 100000))
+ self.data = '\n'.join(self.data)
+
+ def time_read_csv_comment2(self):
+ read_csv(StringIO(self.data), comment='#')
+
+
+class read_csv_default_converter(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
+ self.data = (self.data * 200)
+
+ def time_read_csv_default_converter(self):
+ read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)
+
+
+class read_csv_precise_converter(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
+ self.data = (self.data * 200)
+
+ def time_read_csv_precise_converter(self):
+ read_csv(StringIO(self.data), sep=',', header=None, float_precision='high')
+
+
+class read_csv_roundtrip_converter(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
+ self.data = (self.data * 200)
+
+ def time_read_csv_roundtrip_converter(self):
+ read_csv(StringIO(self.data), sep=',', header=None, float_precision='round_trip')
+
+
+class read_csv_thou_vb(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 8
+ self.format = (lambda x: '{:,}'.format(x))
+ self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K))))
+ self.df = self.df.applymap(self.format)
+ self.df.to_csv('test.csv', sep='|')
+
+ def time_read_csv_thou_vb(self):
+ read_csv('test.csv', sep='|', thousands=',')
+
+ def teardown(self):
+ os.remove('test.csv')
+
+
+class read_csv_vb(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 8
+ self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K))))
+ self.df.to_csv('test.csv', sep='|')
+
+ def time_read_csv_vb(self):
+ read_csv('test.csv', sep='|')
+
+ def teardown(self):
+ os.remove('test.csv')
+
+
+class read_table_multiple_date(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 8
+ self.data = 'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n '
+ self.data = (self.data * 200)
+
+ def time_read_table_multiple_date(self):
+ read_table(StringIO(self.data), sep=',', header=None, parse_dates=[[1, 2], [1, 3]])
+
+
+class read_table_multiple_date_baseline(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 8
+ self.data = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n '
+ self.data = (self.data * 200)
+
+ def time_read_table_multiple_date_baseline(self):
+ read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
\ No newline at end of file
diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
new file mode 100644
index 0000000000000..d1df1b429c656
--- /dev/null
+++ b/asv_bench/benchmarks/plotting.py
@@ -0,0 +1,19 @@
+from pandas_vb_common import *
+try:
+ from pandas import date_range
+except ImportError:
+
+ def date_range(start=None, end=None, periods=None, freq=None):
+ return DatetimeIndex(start, end, periods=periods, offset=freq)
+
+
+class plot_timeseries_period(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 2000
+ self.M = 5
+ self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N))
+
+ def time_plot_timeseries_period(self):
+ self.df.plot()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
new file mode 100644
index 0000000000000..d6fbd0d31c389
--- /dev/null
+++ b/asv_bench/benchmarks/reindex.py
@@ -0,0 +1,384 @@
+from pandas_vb_common import *
+from random import shuffle
+
+
+class dataframe_reindex(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq=datetools.Minute())
+ self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, columns=range(10))
+ self.df['foo'] = 'bar'
+ self.rng2 = Index(self.rng[::2])
+
+ def time_dataframe_reindex(self):
+ self.df.reindex(self.rng2)
+
+
+class frame_drop_dup_inplace(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 10
+ self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+ self.col_array_list = list(self.df.values.T)
+
+ def time_frame_drop_dup_inplace(self):
+ self.df.drop_duplicates(['key1', 'key2'], inplace=True)
+
+
+class frame_drop_dup_na_inplace(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 10
+ self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+ self.col_array_list = list(self.df.values.T)
+ self.df.ix[:10000, :] = np.nan
+
+ def time_frame_drop_dup_na_inplace(self):
+ self.df.drop_duplicates(['key1', 'key2'], inplace=True)
+
+
+class frame_drop_duplicates(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 10
+ self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+ self.col_array_list = list(self.df.values.T)
+
+ def time_frame_drop_duplicates(self):
+ self.df.drop_duplicates(['key1', 'key2'])
+
+
+class frame_drop_duplicates_na(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 10
+ self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+ self.col_array_list = list(self.df.values.T)
+ self.df.ix[:10000, :] = np.nan
+
+ def time_frame_drop_duplicates_na(self):
+ self.df.drop_duplicates(['key1', 'key2'])
+
+
+class frame_fillna_many_columns_pad(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.values = np.random.randn(1000, 1000)
+ self.values[::2] = np.nan
+ self.df = DataFrame(self.values)
+
+ def time_frame_fillna_many_columns_pad(self):
+ self.df.fillna(method='pad')
+
+
+class frame_reindex_columns(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(index=range(10000), data=np.random.rand(10000, 30), columns=range(30))
+
+ def time_frame_reindex_columns(self):
+ self.df.reindex(columns=self.df.columns[1:5])
+
+
+class frame_sort_index_by_columns(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 10
+ self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+ self.col_array_list = list(self.df.values.T)
+
+ def time_frame_sort_index_by_columns(self):
+ self.df.sort_index(by=['key1', 'key2'])
+
+
+class lib_fast_zip(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 10
+ self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+ self.col_array_list = list(self.df.values.T)
+
+ def time_lib_fast_zip(self):
+ lib.fast_zip(self.col_array_list)
+
+
+class lib_fast_zip_fillna(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 10000
+ self.K = 10
+ self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+ self.col_array_list = list(self.df.values.T)
+ self.df.ix[:10000, :] = np.nan
+
+ def time_lib_fast_zip_fillna(self):
+ lib.fast_zip_fillna(self.col_array_list)
+
+
+class reindex_daterange_backfill(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+ self.ts2 = self.ts[::2]
+ self.ts3 = self.ts2.reindex(self.ts.index)
+ self.ts4 = self.ts3.astype('float32')
+
+ def pad(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='pad')
+ except:
+ source_series.reindex(target_index, fillMethod='pad')
+
+ def backfill(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='backfill')
+ except:
+ source_series.reindex(target_index, fillMethod='backfill')
+
+ def time_reindex_daterange_backfill(self):
+ backfill(self.ts2, self.ts.index)
+
+
+class reindex_daterange_pad(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+ self.ts2 = self.ts[::2]
+ self.ts3 = self.ts2.reindex(self.ts.index)
+ self.ts4 = self.ts3.astype('float32')
+
+ def pad(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='pad')
+ except:
+ source_series.reindex(target_index, fillMethod='pad')
+
+ def backfill(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='backfill')
+ except:
+ source_series.reindex(target_index, fillMethod='backfill')
+
+ def time_reindex_daterange_pad(self):
+ pad(self.ts2, self.ts.index)
+
+
+class reindex_fillna_backfill(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+ self.ts2 = self.ts[::2]
+ self.ts3 = self.ts2.reindex(self.ts.index)
+ self.ts4 = self.ts3.astype('float32')
+
+ def pad(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='pad')
+ except:
+ source_series.reindex(target_index, fillMethod='pad')
+
+ def backfill(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='backfill')
+ except:
+ source_series.reindex(target_index, fillMethod='backfill')
+
+ def time_reindex_fillna_backfill(self):
+ self.ts3.fillna(method='backfill')
+
+
+class reindex_fillna_backfill_float32(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+ self.ts2 = self.ts[::2]
+ self.ts3 = self.ts2.reindex(self.ts.index)
+ self.ts4 = self.ts3.astype('float32')
+
+ def pad(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='pad')
+ except:
+ source_series.reindex(target_index, fillMethod='pad')
+
+ def backfill(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='backfill')
+ except:
+ source_series.reindex(target_index, fillMethod='backfill')
+
+ def time_reindex_fillna_backfill_float32(self):
+ self.ts4.fillna(method='backfill')
+
+
+class reindex_fillna_pad(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+ self.ts2 = self.ts[::2]
+ self.ts3 = self.ts2.reindex(self.ts.index)
+ self.ts4 = self.ts3.astype('float32')
+
+ def pad(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='pad')
+ except:
+ source_series.reindex(target_index, fillMethod='pad')
+
+ def backfill(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='backfill')
+ except:
+ source_series.reindex(target_index, fillMethod='backfill')
+
+ def time_reindex_fillna_pad(self):
+ self.ts3.fillna(method='pad')
+
+
+class reindex_fillna_pad_float32(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+ self.ts2 = self.ts[::2]
+ self.ts3 = self.ts2.reindex(self.ts.index)
+ self.ts4 = self.ts3.astype('float32')
+
+ def pad(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='pad')
+ except:
+ source_series.reindex(target_index, fillMethod='pad')
+
+ def backfill(source_series, target_index):
+ try:
+ source_series.reindex(target_index, method='backfill')
+ except:
+ source_series.reindex(target_index, fillMethod='backfill')
+
+ def time_reindex_fillna_pad_float32(self):
+ self.ts4.fillna(method='pad')
+
+
+class reindex_frame_level_align(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ random.shuffle(self.index.values)
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
+
+ def time_reindex_frame_level_align(self):
+ self.df.align(self.df_level, level=1, copy=False)
+
+
+class reindex_frame_level_reindex(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ random.shuffle(self.index.values)
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
+
+ def time_reindex_frame_level_reindex(self):
+ self.df_level.reindex(self.df.index, level=1)
+
+
+class reindex_multiindex(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000
+ self.K = 20
+ self.level1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+ self.level2 = np.tile(tm.makeStringIndex(self.K).values, self.N)
+ self.index = MultiIndex.from_arrays([self.level1, self.level2])
+ self.s1 = Series(np.random.randn((self.N * self.K)), index=self.index)
+ self.s2 = self.s1[::2]
+
+ def time_reindex_multiindex(self):
+ self.s1.reindex(self.s2.index)
+
+
+class series_align_irregular_string(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = 50000
+ self.indices = tm.makeStringIndex(self.n)
+
+ def sample(values, k):
+ self.sampler = np.arange(len(values))
+ shuffle(self.sampler)
+ return values.take(self.sampler[:k])
+ self.subsample_size = 40000
+ self.x = Series(np.random.randn(50000), self.indices)
+ self.y = Series(np.random.randn(self.subsample_size), index=sample(self.indices, self.subsample_size))
+
+ def time_series_align_irregular_string(self):
+ (self.x + self.y)
+
+
+class series_drop_duplicates_int(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.randint(0, 1000, size=10000))
+ self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
+
+ def time_series_drop_duplicates_int(self):
+ self.s.drop_duplicates()
+
+
+class series_drop_duplicates_string(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.randint(0, 1000, size=10000))
+ self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
+
+ def time_series_drop_duplicates_string(self):
+ self.s2.drop_duplicates()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
new file mode 100644
index 0000000000000..9b78c287c5ad4
--- /dev/null
+++ b/asv_bench/benchmarks/replace.py
@@ -0,0 +1,48 @@
+from pandas_vb_common import *
+from pandas.compat import range
+from datetime import timedelta
+
+
+class replace_fillna(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ try:
+ self.rng = date_range('1/1/2000', periods=self.N, freq='min')
+ except NameError:
+ self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute())
+ self.date_range = DateRange
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+ def time_replace_fillna(self):
+ self.ts.fillna(0.0, inplace=True)
+
+
+class replace_large_dict(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.n = (10 ** 6)
+ self.start_value = (10 ** 5)
+ self.to_rep = dict(((i, (self.start_value + i)) for i in range(self.n)))
+ self.s = Series(np.random.randint(self.n, size=(10 ** 3)))
+
+ def time_replace_large_dict(self):
+ self.s.replace(self.to_rep, inplace=True)
+
+
+class replace_replacena(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 1000000
+ try:
+ self.rng = date_range('1/1/2000', periods=self.N, freq='min')
+ except NameError:
+ self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute())
+ self.date_range = DateRange
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+ def time_replace_replacena(self):
+ self.ts.replace(np.nan, 0.0, inplace=True)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
new file mode 100644
index 0000000000000..b4081957af97b
--- /dev/null
+++ b/asv_bench/benchmarks/reshape.py
@@ -0,0 +1,76 @@
+from pandas_vb_common import *
+from pandas.core.reshape import melt
+
+
+class melt_dataframe(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
+ self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
+ self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C'])
+ self.df['id1'] = np.random.randint(0, 10, 10000)
+ self.df['id2'] = np.random.randint(100, 1000, 10000)
+
+ def time_melt_dataframe(self):
+ melt(self.df, id_vars=['id1', 'id2'])
+
+
+class reshape_pivot_time_series(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
+ self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
+
+ def unpivot(frame):
+ (N, K) = frame.shape
+ self.data = {'value': frame.values.ravel('F'), 'variable': np.asarray(frame.columns).repeat(N), 'date': np.tile(np.asarray(frame.index), K), }
+ return DataFrame(self.data, columns=['date', 'variable', 'value'])
+ self.index = date_range('1/1/2000', periods=10000, freq='h')
+ self.df = DataFrame(randn(10000, 50), index=self.index, columns=range(50))
+ self.pdf = unpivot(self.df)
+ self.f = (lambda : self.pdf.pivot('date', 'variable', 'value'))
+
+ def time_reshape_pivot_time_series(self):
+ self.f()
+
+
+class reshape_stack_simple(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
+ self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
+ self.udf = self.df.unstack(1)
+
+ def time_reshape_stack_simple(self):
+ self.udf.stack()
+
+
+class reshape_unstack_simple(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
+ self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
+
+ def time_reshape_unstack_simple(self):
+ self.df.unstack(1)
+
+
+class unstack_sparse_keyspace(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
+ self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
+ self.NUM_ROWS = 1000
+ for iter in range(10):
+ self.df = DataFrame({'A': np.random.randint(50, size=self.NUM_ROWS), 'B': np.random.randint(50, size=self.NUM_ROWS), 'C': np.random.randint((-10), 10, size=self.NUM_ROWS), 'D': np.random.randint((-10), 10, size=self.NUM_ROWS), 'E': np.random.randint(10, size=self.NUM_ROWS), 'F': np.random.randn(self.NUM_ROWS), })
+ self.idf = self.df.set_index(['A', 'B', 'C', 'D', 'E'])
+ if (len(self.idf.index.unique()) == self.NUM_ROWS):
+ break
+
+ def time_unstack_sparse_keyspace(self):
+ self.idf.unstack()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
new file mode 100644
index 0000000000000..9cd61c741dae1
--- /dev/null
+++ b/asv_bench/benchmarks/series_methods.py
@@ -0,0 +1,74 @@
+from pandas_vb_common import *
+
+
+class series_isin_int64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s1 = Series(np.random.randn(10000))
+ self.s2 = Series(np.random.randint(1, 10, 10000))
+ self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+ self.values = [1, 2]
+ self.s4 = self.s3.astype('object')
+
+ def time_series_isin_int64(self):
+ self.s3.isin(self.values)
+
+
+class series_isin_object(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s1 = Series(np.random.randn(10000))
+ self.s2 = Series(np.random.randint(1, 10, 10000))
+ self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+ self.values = [1, 2]
+ self.s4 = self.s3.astype('object')
+
+ def time_series_isin_object(self):
+ self.s4.isin(self.values)
+
+
+class series_nlargest1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s1 = Series(np.random.randn(10000))
+ self.s2 = Series(np.random.randint(1, 10, 10000))
+ self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+ self.values = [1, 2]
+ self.s4 = self.s3.astype('object')
+
+ def time_series_nlargest1(self):
+ self.s1.nlargest(3, take_last=True)
+ self.s1.nlargest(3, take_last=False)
+
+
+class series_nlargest2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s1 = Series(np.random.randn(10000))
+ self.s2 = Series(np.random.randint(1, 10, 10000))
+ self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+ self.values = [1, 2]
+ self.s4 = self.s3.astype('object')
+
+ def time_series_nlargest2(self):
+ self.s2.nlargest(3, take_last=True)
+ self.s2.nlargest(3, take_last=False)
+
+
+class series_nsmallest2(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s1 = Series(np.random.randn(10000))
+ self.s2 = Series(np.random.randint(1, 10, 10000))
+ self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+ self.values = [1, 2]
+ self.s4 = self.s3.astype('object')
+
+ def time_series_nsmallest2(self):
+ self.s2.nsmallest(3, take_last=True)
+ self.s2.nsmallest(3, take_last=False)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
new file mode 100644
index 0000000000000..dbf35f5e40f55
--- /dev/null
+++ b/asv_bench/benchmarks/sparse.py
@@ -0,0 +1,55 @@
+from pandas_vb_common import *
+import scipy.sparse
+import pandas.sparse.series
+from pandas.core.sparse import SparseSeries, SparseDataFrame
+from pandas.core.sparse import SparseDataFrame
+
+
+class sparse_series_to_frame(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.K = 50
+ self.N = 50000
+ self.rng = np.asarray(date_range('1/1/2000', periods=self.N, freq='T'))
+ self.series = {}
+ for i in range(1, (self.K + 1)):
+ self.data = np.random.randn(self.N)[:(- i)]
+ self.this_rng = self.rng[:(- i)]
+ self.data[100:] = np.nan
+ self.series[i] = SparseSeries(self.data, index=self.this_rng)
+
+ def time_sparse_series_to_frame(self):
+ SparseDataFrame(self.series)
+
+
+class sparse_frame_constructor(object):
+ goal_time = 0.2
+
+ def time_sparse_frame_constructor(self):
+ SparseDataFrame(columns=np.arange(100), index=np.arange(1000))
+
+
+class sparse_series_from_coo(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100))
+
+ def time_sparse_series_from_coo(self):
+ self.ss = pandas.sparse.series.SparseSeries.from_coo(self.A)
+
+
+class sparse_series_to_coo(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = pd.Series(([np.nan] * 10000))
+ self.s[0] = 3.0
+ self.s[100] = (-1.0)
+ self.s[999] = 12.1
+ self.s.index = pd.MultiIndex.from_product((range(10), range(10), range(10), range(10)))
+ self.ss = self.s.to_sparse()
+
+ def time_sparse_series_to_coo(self):
+ self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
new file mode 100644
index 0000000000000..98e2bbfce1a44
--- /dev/null
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -0,0 +1,236 @@
+from pandas_vb_common import *
+
+
+class stat_ops_frame_mean_float_axis_0(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_mean_float_axis_0(self):
+ self.df.mean()
+
+
+class stat_ops_frame_mean_float_axis_1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_mean_float_axis_1(self):
+ self.df.mean(1)
+
+
+class stat_ops_frame_mean_int_axis_0(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_mean_int_axis_0(self):
+ self.dfi.mean()
+
+
+class stat_ops_frame_mean_int_axis_1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_mean_int_axis_1(self):
+ self.dfi.mean(1)
+
+
+class stat_ops_frame_sum_float_axis_0(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_sum_float_axis_0(self):
+ self.df.sum()
+
+
+class stat_ops_frame_sum_float_axis_1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_sum_float_axis_1(self):
+ self.df.sum(1)
+
+
+class stat_ops_frame_sum_int_axis_0(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_sum_int_axis_0(self):
+ self.dfi.sum()
+
+
+class stat_ops_frame_sum_int_axis_1(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(100000, 4))
+ self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape))
+
+ def time_stat_ops_frame_sum_int_axis_1(self):
+ self.dfi.sum(1)
+
+
+class stat_ops_level_frame_sum(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ random.shuffle(self.index.values)
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
+
+ def time_stat_ops_level_frame_sum(self):
+ self.df.sum(level=1)
+
+
+class stat_ops_level_frame_sum_multiple(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ random.shuffle(self.index.values)
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
+
+ def time_stat_ops_level_frame_sum_multiple(self):
+ self.df.sum(level=[0, 1])
+
+
+class stat_ops_level_series_sum(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ random.shuffle(self.index.values)
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
+
+ def time_stat_ops_level_series_sum(self):
+ self.df[1].sum(level=1)
+
+
+class stat_ops_level_series_sum_multiple(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
+ random.shuffle(self.index.values)
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
+
+ def time_stat_ops_level_series_sum_multiple(self):
+ self.df[1].sum(level=[0, 1])
+
+
+class stat_ops_series_std(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.s = Series(np.random.randn(100000), index=np.arange(100000))
+ self.s[::2] = np.nan
+
+ def time_stat_ops_series_std(self):
+ self.s.std()
+
+
+class stats_corr_spearman(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(1000, 30))
+
+ def time_stats_corr_spearman(self):
+ self.df.corr(method='spearman')
+
+
+class stats_rank2d_axis0_average(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(5000, 50))
+
+ def time_stats_rank2d_axis0_average(self):
+ self.df.rank()
+
+
+class stats_rank2d_axis1_average(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.df = DataFrame(np.random.randn(5000, 50))
+
+ def time_stats_rank2d_axis1_average(self):
+ self.df.rank(1)
+
+
+class stats_rank_average(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
+ self.s = Series(self.values)
+
+ def time_stats_rank_average(self):
+ self.s.rank()
+
+
+class stats_rank_average_int(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.values = np.random.randint(0, 100000, size=200000)
+ self.s = Series(self.values)
+
+ def time_stats_rank_average_int(self):
+ self.s.rank()
+
+
+class stats_rank_pct_average(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
+ self.s = Series(self.values)
+
+ def time_stats_rank_pct_average(self):
+ self.s.rank(pct=True)
+
+
+class stats_rank_pct_average_old(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
+ self.s = Series(self.values)
+
+ def time_stats_rank_pct_average_old(self):
+ (self.s.rank() / len(self.s))
+
+
+class stats_rolling_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.arr = np.random.randn(100000)
+
+ def time_stats_rolling_mean(self):
+ rolling_mean(self.arr, 100)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
new file mode 100644
index 0000000000000..5adfbf4c2557d
--- /dev/null
+++ b/asv_bench/benchmarks/strings.py
@@ -0,0 +1,393 @@
+from pandas_vb_common import *
+import string
+import itertools as IT
+import pandas.util.testing as testing
+
+
+class strings_cat(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_cat(self):
+ self.many.str.cat(sep=',')
+
+
+class strings_center(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_center(self):
+ self.many.str.center(100)
+
+
+class strings_contains_few(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_contains_few(self):
+ self.few.str.contains('matchthis')
+
+
+class strings_contains_few_noregex(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_contains_few_noregex(self):
+ self.few.str.contains('matchthis', regex=False)
+
+
+class strings_contains_many(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_contains_many(self):
+ self.many.str.contains('matchthis')
+
+
+class strings_contains_many_noregex(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_contains_many_noregex(self):
+ self.many.str.contains('matchthis', regex=False)
+
+
+class strings_count(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_count(self):
+ self.many.str.count('matchthis')
+
+
+class strings_encode_decode(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.ser = Series(testing.makeUnicodeIndex())
+
+ def time_strings_encode_decode(self):
+ self.ser.str.encode('utf-8').str.decode('utf-8')
+
+
+class strings_endswith(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_endswith(self):
+ self.many.str.endswith('matchthis')
+
+
+class strings_extract(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_extract(self):
+ self.many.str.extract('(\\w*)matchthis(\\w*)')
+
+
+class strings_findall(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_findall(self):
+ self.many.str.findall('[A-Z]+')
+
+
+class strings_get(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_get(self):
+ self.many.str.get(0)
+
+
+class strings_get_dummies(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+ self.s = make_series(string.uppercase, strlen=10, size=10000).str.join('|')
+
+ def time_strings_get_dummies(self):
+ self.s.str.get_dummies('|')
+
+
+class strings_join_split(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_join_split(self):
+ self.many.str.join('--').str.split('--')
+
+
+class strings_join_split_expand(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_join_split_expand(self):
+ self.many.str.join('--').str.split('--', expand=True)
+
+
+class strings_len(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_len(self):
+ self.many.str.len()
+
+
+class strings_lower(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_lower(self):
+ self.many.str.lower()
+
+
+class strings_lstrip(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_lstrip(self):
+ self.many.str.lstrip('matchthis')
+
+
+class strings_match(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_match(self):
+ self.many.str.match('mat..this')
+
+
+class strings_pad(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_pad(self):
+ self.many.str.pad(100, side='both')
+
+
+class strings_repeat(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_repeat(self):
+ self.many.str.repeat(list(IT.islice(IT.cycle(range(1, 4)), len(self.many))))
+
+
+class strings_replace(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_replace(self):
+ self.many.str.replace('(matchthis)', '\x01\x01')
+
+
+class strings_rstrip(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_rstrip(self):
+ self.many.str.rstrip('matchthis')
+
+
+class strings_slice(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_slice(self):
+ self.many.str.slice(5, 15, 2)
+
+
+class strings_startswith(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_startswith(self):
+ self.many.str.startswith('matchthis')
+
+
+class strings_strip(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_strip(self):
+ self.many.str.strip('matchthis')
+
+
+class strings_title(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_title(self):
+ self.many.str.title()
+
+
+class strings_upper(object):
+ goal_time = 0.2
+
+ def setup(self):
+
+ def make_series(letters, strlen, size):
+ return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen)))
+ self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000)
+ self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000)
+
+ def time_strings_upper(self):
+ self.many.str.upper()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py
new file mode 100644
index 0000000000000..36a0f98e3f5ef
--- /dev/null
+++ b/asv_bench/benchmarks/timedelta.py
@@ -0,0 +1,34 @@
+from pandas_vb_common import *
+from pandas import to_timedelta
+
+
+class timedelta_convert_int(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.arr = np.random.randint(0, 1000, size=10000)
+
+ def time_timedelta_convert_int(self):
+ to_timedelta(self.arr, unit='s')
+
+
+class timedelta_convert_string(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.arr = np.random.randint(0, 1000, size=10000)
+ self.arr = ['{0} days'.format(i) for i in self.arr]
+
+ def time_timedelta_convert_string(self):
+ to_timedelta(self.arr)
+
+
+class timedelta_convert_string_seconds(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.arr = np.random.randint(0, 60, size=10000)
+ self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr]
+
+ def time_timedelta_convert_string_seconds(self):
+ to_timedelta(self.arr)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
new file mode 100644
index 0000000000000..266c198de1455
--- /dev/null
+++ b/asv_bench/benchmarks/timeseries.py
@@ -0,0 +1,1046 @@
+from pandas.tseries.converter import DatetimeConverter
+import pandas as pd
+from datetime import timedelta
+import datetime as dt
+from pandas_vb_common import *
+from pandas.tseries.frequencies import infer_freq
+import pandas.tseries.holiday
+import numpy as np
+
+
+class dataframe_resample_max_numpy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='20130101', periods=100000, freq='50L')
+ self.df = DataFrame(np.random.randn(100000, 2), index=self.rng)
+
+ def time_dataframe_resample_max_numpy(self):
+ self.df.resample('1s', how=np.max)
+
+
+class dataframe_resample_max_string(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='20130101', periods=100000, freq='50L')
+ self.df = DataFrame(np.random.randn(100000, 2), index=self.rng)
+
+ def time_dataframe_resample_max_string(self):
+ self.df.resample('1s', how='max')
+
+
+class dataframe_resample_mean_numpy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='20130101', periods=100000, freq='50L')
+ self.df = DataFrame(np.random.randn(100000, 2), index=self.rng)
+
+ def time_dataframe_resample_mean_numpy(self):
+ self.df.resample('1s', how=np.mean)
+
+
+class dataframe_resample_mean_string(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='20130101', periods=100000, freq='50L')
+ self.df = DataFrame(np.random.randn(100000, 2), index=self.rng)
+
+ def time_dataframe_resample_mean_string(self):
+ self.df.resample('1s', how='mean')
+
+
+class dataframe_resample_min_numpy(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='20130101', periods=100000, freq='50L')
+ self.df = DataFrame(np.random.randn(100000, 2), index=self.rng)
+
+ def time_dataframe_resample_min_numpy(self):
+ self.df.resample('1s', how=np.min)
+
+
+class dataframe_resample_min_string(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='20130101', periods=100000, freq='50L')
+ self.df = DataFrame(np.random.randn(100000, 2), index=self.rng)
+
+ def time_dataframe_resample_min_string(self):
+ self.df.resample('1s', how='min')
+
+
+class datetimeindex_add_offset(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=10000, freq='T')
+
+ def time_datetimeindex_add_offset(self):
+ (self.rng + timedelta(minutes=2))
+
+
+class datetimeindex_converter(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+ def time_datetimeindex_converter(self):
+ DatetimeConverter.convert(self.rng, None, None)
+
+
+class datetimeindex_infer_dst(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S')
+ self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S')
+ self.index = self.index.append(self.dst_rng)
+ self.index = self.index.append(self.dst_rng)
+ self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S'))
+
+ def time_datetimeindex_infer_dst(self):
+ self.index.tz_localize('US/Eastern', infer_dst=True)
+
+
+class datetimeindex_normalize(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern')
+
+ def time_datetimeindex_normalize(self):
+ self.rng.normalize()
+
+
+class datetimeindex_unique(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=1000, freq='T')
+ self.index = self.rng.repeat(10)
+
+ def time_datetimeindex_unique(self):
+ self.index.unique()
+
+
+class dti_reset_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=1000, freq='H')
+ self.df = DataFrame(np.random.randn(len(self.rng), 2), self.rng)
+
+ def time_dti_reset_index(self):
+ self.df.reset_index()
+
+
+class dti_reset_index_tz(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern')
+ self.df = DataFrame(np.random.randn(len(self.rng), 2), index=self.rng)
+
+ def time_dti_reset_index_tz(self):
+ self.df.reset_index()
+
+
+class period_setitem(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = period_range(start='1/1/1990', freq='S', periods=20000)
+ self.df = DataFrame(index=range(len(self.rng)))
+
+ def time_period_setitem(self):
+ self.df['col'] = self.rng
+
+
+class timeseries_1min_5min_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+ def time_timeseries_1min_5min_mean(self):
+ self.ts[:10000].resample('5min', how='mean')
+
+
+class timeseries_1min_5min_ohlc(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+ def time_timeseries_1min_5min_ohlc(self):
+ self.ts[:10000].resample('5min', how='ohlc')
+
+
+class timeseries_add_irregular(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.lindex = np.random.permutation(self.N)[:(self.N // 2)]
+ self.rindex = np.random.permutation(self.N)[:(self.N // 2)]
+ self.left = Series(self.ts.values.take(self.lindex), index=self.ts.index.take(self.lindex))
+ self.right = Series(self.ts.values.take(self.rindex), index=self.ts.index.take(self.rindex))
+
+ def time_timeseries_add_irregular(self):
+ (self.left + self.right)
+
+
+class timeseries_asof(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 10000
+ self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
+
+ def time_timeseries_asof(self):
+ self.ts.asof(self.dates)
+
+
+class timeseries_asof_nan(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 10000
+ self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
+ self.ts[250:5000] = np.nan
+
+ def time_timeseries_asof_nan(self):
+ self.ts.asof(self.dates)
+
+
+class timeseries_asof_single(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 10000
+ self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
+
+ def time_timeseries_asof_single(self):
+ self.ts.asof(self.dates[0])
+
+
+class timeseries_custom_bday_apply(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_apply(self):
+ self.cday.apply(self.date)
+
+
+class timeseries_custom_bday_apply_dt64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_apply_dt64(self):
+ self.cday.apply(self.dt64)
+
+
+class timeseries_custom_bday_cal_decr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_cal_decr(self):
+ (self.date - (1 * self.cdayh))
+
+
+class timeseries_custom_bday_cal_incr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_cal_incr(self):
+ (self.date + (1 * self.cdayh))
+
+
+class timeseries_custom_bday_cal_incr_n(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_cal_incr_n(self):
+ (self.date + (10 * self.cdayh))
+
+
+class timeseries_custom_bday_cal_incr_neg_n(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_cal_incr_neg_n(self):
+ (self.date - (10 * self.cdayh))
+
+
+class timeseries_custom_bday_decr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_decr(self):
+ (self.date - self.cday)
+
+
+class timeseries_custom_bday_incr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bday_incr(self):
+ (self.date + self.cday)
+
+
+class timeseries_custom_bmonthbegin_decr_n(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bmonthbegin_decr_n(self):
+ (self.date - (10 * self.cmb))
+
+
+class timeseries_custom_bmonthbegin_incr_n(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bmonthbegin_incr_n(self):
+ (self.date + (10 * self.cmb))
+
+
+class timeseries_custom_bmonthend_decr_n(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bmonthend_decr_n(self):
+ (self.date - (10 * self.cme))
+
+
+class timeseries_custom_bmonthend_incr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bmonthend_incr(self):
+ (self.date + self.cme)
+
+
+class timeseries_custom_bmonthend_incr_n(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_custom_bmonthend_incr_n(self):
+ (self.date + (10 * self.cme))
+
+
+class timeseries_day_apply(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_day_apply(self):
+ self.day.apply(self.date)
+
+
+class timeseries_day_incr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_day_incr(self):
+ (self.date + self.day)
+
+
+class timeseries_infer_freq(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/1700', freq='D', periods=100000)
+ self.a = self.rng[:50000].append(self.rng[50002:])
+
+ def time_timeseries_infer_freq(self):
+ infer_freq(self.a)
+
+
+class timeseries_is_month_start(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 10000
+ self.rng = date_range(start='1/1/1', periods=self.N, freq='B')
+
+ def time_timeseries_is_month_start(self):
+ self.rng.is_month_start
+
+
+class timeseries_iter_datetimeindex(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 1000000
+ self.M = 10000
+ self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+ self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+ def iter_n(iterable, n=None):
+ self.i = 0
+ for _ in iterable:
+ self.i += 1
+ if ((n is not None) and (self.i > n)):
+ break
+
+ def time_timeseries_iter_datetimeindex(self):
+ iter_n(self.idx1)
+
+
+class timeseries_iter_datetimeindex_preexit(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 1000000
+ self.M = 10000
+ self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+ self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+ def iter_n(iterable, n=None):
+ self.i = 0
+ for _ in iterable:
+ self.i += 1
+ if ((n is not None) and (self.i > n)):
+ break
+
+ def time_timeseries_iter_datetimeindex_preexit(self):
+ iter_n(self.idx1, self.M)
+
+
+class timeseries_iter_periodindex(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 1000000
+ self.M = 10000
+ self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+ self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+ def iter_n(iterable, n=None):
+ self.i = 0
+ for _ in iterable:
+ self.i += 1
+ if ((n is not None) and (self.i > n)):
+ break
+
+ def time_timeseries_iter_periodindex(self):
+ iter_n(self.idx2)
+
+
+class timeseries_iter_periodindex_preexit(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 1000000
+ self.M = 10000
+ self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+ self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+ def iter_n(iterable, n=None):
+ self.i = 0
+ for _ in iterable:
+ self.i += 1
+ if ((n is not None) and (self.i > n)):
+ break
+
+ def time_timeseries_iter_periodindex_preexit(self):
+ iter_n(self.idx2, self.M)
+
+
+class timeseries_large_lookup_value(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=1500000, freq='S')
+ self.ts = Series(1, index=self.rng)
+
+ def time_timeseries_large_lookup_value(self):
+ self.ts[self.ts.index[(len(self.ts) // 2)]]
+ self.ts.index._cleanup()
+
+
+class timeseries_period_downsample_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = period_range(start='1/1/2000', end='1/1/2001', freq='T')
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+
+ def time_timeseries_period_downsample_mean(self):
+ self.ts.resample('D', how='mean')
+
+
+class timeseries_resample_datetime64(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U')
+ self.int_ts = Series(5, self.rng, dtype='int64')
+ self.ts = self.int_ts.astype('datetime64[ns]')
+
+ def time_timeseries_resample_datetime64(self):
+ self.ts.resample('1S', how='last')
+
+
+class timeseries_slice_minutely(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+ def time_timeseries_slice_minutely(self):
+ self.ts[:10000]
+
+
+class timeseries_sort_index(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='s')
+ self.rng = self.rng.take(np.random.permutation(self.N))
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+ def time_timeseries_sort_index(self):
+ self.ts.sort_index()
+
+
+class timeseries_timestamp_downsample_mean(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', end='1/1/2001', freq='T')
+ self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+
+ def time_timeseries_timestamp_downsample_mean(self):
+ self.ts.resample('D', how='mean')
+
+
+class timeseries_timestamp_tzinfo_cons(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern')
+
+ def time_timeseries_timestamp_tzinfo_cons(self):
+ self.rng[0]
+
+
+class timeseries_to_datetime_YYYYMMDD(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
+ self.strings = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str)
+
+ def time_timeseries_to_datetime_YYYYMMDD(self):
+ to_datetime(self.strings, format='%Y%m%d')
+
+
+class timeseries_to_datetime_iso8601(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
+ self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
+
+ def time_timeseries_to_datetime_iso8601(self):
+ to_datetime(self.strings)
+
+
+class timeseries_to_datetime_iso8601_format(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
+ self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
+
+ def time_timeseries_to_datetime_iso8601_format(self):
+ to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')
+
+
+class timeseries_with_format_no_exact(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
+
+ def time_timeseries_with_format_no_exact(self):
+ to_datetime(self.s, format='%d%b%y', exact=False)
+
+
+class timeseries_with_format_replace(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
+
+ def time_timeseries_with_format_replace(self):
+ to_datetime(self.s.str.replace(':\\S+$', ''), format='%d%b%y')
+
+
+class timeseries_year_apply(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_year_apply(self):
+ self.year.apply(self.date)
+
+
+class timeseries_year_incr(object):
+ goal_time = 0.2
+
+ def setup(self):
+ self.N = 100000
+ self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+ if hasattr(Series, 'convert'):
+ Series.resample = Series.convert
+ self.ts = Series(np.random.randn(self.N), index=self.rng)
+ self.date = dt.datetime(2011, 1, 1)
+ self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+ self.day = pd.offsets.Day()
+ self.year = pd.offsets.YearBegin()
+ self.cday = pd.offsets.CustomBusinessDay()
+ self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+ self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+ self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+ def time_timeseries_year_incr(self):
+ (self.date + self.year)
\ No newline at end of file
diff --git a/asv_bench/vbench_to_asv.py b/asv_bench/vbench_to_asv.py
new file mode 100644
index 0000000000000..b3980ffed1a57
--- /dev/null
+++ b/asv_bench/vbench_to_asv.py
@@ -0,0 +1,151 @@
+import ast
+import vbench
+import os
+import sys
+import astor
+import glob
+
+
+def vbench_to_asv_source(bench, kinds=None):
+ tab = ' ' * 4
+ if kinds is None:
+ kinds = ['time']
+
+ output = 'class {}(object):\n'.format(bench.name)
+ output += tab + 'goal_time = 0.2\n\n'
+
+ if bench.setup:
+ indented_setup = [tab * 2 + '{}\n'.format(x) for x in bench.setup.splitlines()]
+ output += tab + 'def setup(self):\n' + ''.join(indented_setup) + '\n'
+
+ for kind in kinds:
+ output += tab + 'def {}_{}(self):\n'.format(kind, bench.name)
+ for line in bench.code.splitlines():
+ output += tab * 2 + line + '\n'
+ output += '\n\n'
+
+ if bench.cleanup:
+ output += tab + 'def teardown(self):\n' + tab * 2 + bench.cleanup
+
+ output += '\n\n'
+ return output
+
+
+class AssignToSelf(ast.NodeTransformer):
+ def __init__(self):
+ super(AssignToSelf, self).__init__()
+ self.transforms = {}
+ self.imports = []
+
+ self.in_class_define = False
+ self.in_setup = False
+
+ def visit_ClassDef(self, node):
+ self.transforms = {}
+ self.in_class_define = True
+ self.generic_visit(node)
+ return node
+
+ def visit_TryExcept(self, node):
+ if any([isinstance(x, (ast.Import, ast.ImportFrom)) for x in node.body]):
+ self.imports.append(node)
+ else:
+ self.generic_visit(node)
+ return node
+
+ def visit_Assign(self, node):
+ for target in node.targets:
+ if isinstance(target, ast.Name) and not isinstance(target.ctx, ast.Param) and not self.in_class_define:
+ self.transforms[target.id] = 'self.' + target.id
+ self.generic_visit(node)
+
+ return node
+
+ def visit_Name(self, node):
+ new_node = node
+ if node.id in self.transforms:
+ if not isinstance(node.ctx, ast.Param):
+ new_node = ast.Attribute(value=ast.Name(id='self', ctx=node.ctx), attr=node.id, ctx=node.ctx)
+
+ self.generic_visit(node)
+
+ return ast.copy_location(new_node, node)
+
+ def visit_Import(self, node):
+ self.imports.append(node)
+
+ def visit_ImportFrom(self, node):
+ self.imports.append(node)
+
+ def visit_FunctionDef(self, node):
+ """Delete functions that are empty due to imports being moved"""
+ self.in_class_define = False
+
+ if self.in_setup:
+ node.col_offset -= 4
+ ast.increment_lineno(node, -1)
+
+ if node.name == 'setup':
+ self.in_setup = True
+
+ self.generic_visit(node)
+
+ if node.name == 'setup':
+ self.in_setup = False
+
+ if node.body:
+ return node
+
+
+def translate_module(target_module):
+ g_vars = {}
+ l_vars = {}
+ exec('import ' + target_module) in g_vars
+
+ print target_module
+ module = eval(target_module, g_vars)
+
+ benchmarks = []
+ for obj_str in dir(module):
+ obj = getattr(module, obj_str)
+ if isinstance(obj, vbench.benchmark.Benchmark):
+ benchmarks.append(obj)
+
+ if not benchmarks:
+ return
+
+ rewritten_output = ''
+ for bench in benchmarks:
+ rewritten_output += vbench_to_asv_source(bench)
+
+ with open('rewrite.py', 'w') as f:
+ f.write(rewritten_output)
+
+ ast_module = ast.parse(rewritten_output)
+
+ transformer = AssignToSelf()
+ transformed_module = transformer.visit(ast_module)
+
+ unique_imports = {astor.to_source(node): node for node in transformer.imports}
+
+ transformed_module.body = unique_imports.values() + transformed_module.body
+
+ transformed_source = astor.to_source(transformed_module)
+
+ with open('benchmarks/{}.py'.format(target_module), 'w') as f:
+ f.write(transformed_source)
+
+
+if __name__ == '__main__':
+ cwd = os.getcwd()
+ new_dir = os.path.join(os.path.dirname(__file__), '../vb_suite')
+ sys.path.insert(0, new_dir)
+
+ for module in glob.glob(os.path.join(new_dir, '*.py')):
+ mod = os.path.basename(module)
+ if mod in ['make.py', 'measure_memory_consumption.py', 'perf_HEAD.py', 'run_suite.py', 'test_perf.py', 'generate_rst_files.py', 'test.py', 'suite.py']:
+ continue
+ print
+ print mod
+
+ translate_module(mod.replace('.py', ''))
diff --git a/bench/bench_sparse.py b/bench/bench_sparse.py
index 7dc2db05cfe20..0aa705118d970 100644
--- a/bench/bench_sparse.py
+++ b/bench/bench_sparse.py
@@ -1,4 +1,3 @@
-import sys
import numpy as np
from pandas import *
@@ -30,7 +29,7 @@
s1_dense = s1.to_dense()
s2_dense = s2.to_dense()
-if 'linux' in sys.platform:
+if compat.is_platform_linux():
pth = '/home/wesm/code/pandas/example'
else:
pth = '/Users/wesm/code/pandas/example'
diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt
index 0d515f300f5a7..951c8798bef15 100644
--- a/ci/requirements-2.7.txt
+++ b/ci/requirements-2.7.txt
@@ -17,7 +17,7 @@ boto=2.36.0
bottleneck=0.8.0
psycopg2=2.5.2
patsy
-pymysql=0.6.1
+pymysql=0.6.3
html5lib=1.0b2
beautiful-soup=4.2.1
httplib2=0.8
diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt
deleted file mode 100644
index 8c2f675b65603..0000000000000
--- a/ci/requirements-3.2.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-python-dateutil==2.1
-pytz==2013b
-numpy==1.7.1
-cython==0.19.1
diff --git a/ci/requirements-3.4.txt b/ci/requirements-3.4.txt
index 24af93fb16194..fd0a5bc53dd7e 100644
--- a/ci/requirements-3.4.txt
+++ b/ci/requirements-3.4.txt
@@ -3,6 +3,7 @@ pytz
openpyxl
xlsxwriter
xlrd
+xlwt
html5lib
patsy
beautiful-soup
diff --git a/ci/requirements-3.4_SLOW.txt b/ci/requirements-3.4_SLOW.txt
index 6372d9b4f6068..ecc31dad78d07 100644
--- a/ci/requirements-3.4_SLOW.txt
+++ b/ci/requirements-3.4_SLOW.txt
@@ -3,6 +3,7 @@ pytz
openpyxl
xlsxwriter
xlrd
+xlwt
html5lib
patsy
beautiful-soup
diff --git a/ci/script.sh b/ci/script.sh
index d5082234024d5..1126e8249646c 100755
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -15,8 +15,8 @@ fi
if [ "$BUILD_TEST" ]; then
echo "We are not running nosetests as this is simply a build test."
else
- echo nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml
- nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml
+ echo nosetests --exe -A "$NOSE_ARGS" pandas --doctest-tests --with-xunit --xunit-file=/tmp/nosetests.xml
+ nosetests --exe -A "$NOSE_ARGS" pandas --doctest-tests --with-xunit --xunit-file=/tmp/nosetests.xml
fi
RET="$?"
diff --git a/conda.recipe/bld.bat b/conda.recipe/bld.bat
index cc977c65dcbe1..284926fae8c04 100644
--- a/conda.recipe/bld.bat
+++ b/conda.recipe/bld.bat
@@ -1,2 +1,2 @@
@echo off
-%PYTHON% setup.py install --quiet
+%PYTHON% setup.py install
diff --git a/conda.recipe/build.sh b/conda.recipe/build.sh
index bce23bf0c6549..f341bce6fcf96 100644
--- a/conda.recipe/build.sh
+++ b/conda.recipe/build.sh
@@ -1,2 +1,2 @@
-#!/bin/bash
-$PYTHON setup.py install --quiet
+#!/bin/sh
+$PYTHON setup.py install
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
index 6817fbc9b43e0..6f0fd4fda47a3 100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -1,6 +1,6 @@
package:
- name: pandas
- version: {{ environ.get('GIT_DESCRIBE_TAG', '') }}
+ name: pandas
+ version: {{ environ.get('GIT_DESCRIBE_TAG', '').replace('.dev', 'dev') }}
build:
number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }}
@@ -28,10 +28,9 @@ requirements:
test:
requires:
- nose
- - coverage
commands:
- - python -c "import pandas"
+ - nosetests --exe -A "not slow and not network and not disabled" pandas
about:
home: http://pandas.pydata.org
diff --git a/doc/source/api.rst b/doc/source/api.rst
index 76e03ce70342f..6b188deb9eb42 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -82,6 +82,15 @@ HDFStore: PyTables (HDF5)
HDFStore.get
HDFStore.select
+SAS
+~~~
+
+.. autosummary::
+ :toctree: generated/
+
+ read_sas
+ XportReader
+
SQL
~~~
@@ -509,6 +518,7 @@ These can be accessed like ``Series.dt.``.
Series.dt.tz_localize
Series.dt.tz_convert
Series.dt.normalize
+ Series.dt.strftime
**Timedelta Properties**
@@ -798,9 +808,7 @@ Binary operator functions
DataFrame.ne
DataFrame.eq
DataFrame.combine
- DataFrame.combineAdd
DataFrame.combine_first
- DataFrame.combineMult
Function application, GroupBy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -903,6 +911,8 @@ Reshaping, sorting, transposing
DataFrame.sort
DataFrame.sort_index
DataFrame.sortlevel
+ DataFrame.nlargest
+ DataFrame.nsmallest
DataFrame.swaplevel
DataFrame.stack
DataFrame.unstack
@@ -1443,6 +1453,7 @@ Conversion
DatetimeIndex.to_datetime
DatetimeIndex.to_period
+ DatetimeIndex.to_perioddelta
DatetimeIndex.to_pydatetime
DatetimeIndex.to_series
@@ -1558,7 +1569,6 @@ application to columns of a specific data type.
DataFrameGroupBy.hist
DataFrameGroupBy.idxmax
DataFrameGroupBy.idxmin
- DataFrameGroupBy.irow
DataFrameGroupBy.mad
DataFrameGroupBy.pct_change
DataFrameGroupBy.plot
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 349e7e25fdafb..71d16a40f0215 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -240,14 +240,14 @@ way to summarize a boolean result.
.. ipython:: python
- (df>0).all()
- (df>0).any()
+ (df > 0).all()
+ (df > 0).any()
You can reduce to a final boolean value.
.. ipython:: python
- (df>0).any().any()
+ (df > 0).any().any()
You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` property.
@@ -330,6 +330,48 @@ equality to be True:
df1.equals(df2)
df1.equals(df2.sort())
+Comparing array-like objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can conveniently do element-wise comparisons when comparing a pandas
+data structure with a scalar value:
+
+.. ipython:: python
+
+ pd.Series(['foo', 'bar', 'baz']) == 'foo'
+ pd.Index(['foo', 'bar', 'baz']) == 'foo'
+
+Pandas also handles element-wise comparisons between different array-like
+objects of the same length:
+
+.. ipython:: python
+
+ pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])
+ pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux'])
+
+Trying to compare ``Index`` or ``Series`` objects of different lengths will
+raise a ValueError:
+
+.. code-block:: python
+
+ In [55]: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar'])
+ ValueError: Series lengths must match to compare
+
+ In [56]: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo'])
+ ValueError: Series lengths must match to compare
+
+Note that this is different from the numpy behavior where a comparison can
+be broadcast:
+
+.. ipython:: python
+
+ np.array([1, 2, 3]) == np.array([2])
+
+or it can return False if broadcasting cannot be done:
+
+.. ipython:: python
+
+ np.array([1, 2, 3]) == np.array([1, 2])
Combining overlapping data sets
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1058,6 +1100,30 @@ Note that the same result could have been achieved using
increasing or descreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate`
will not make any checks on the order of the index.
+.. _basics.limits_on_reindex_fill:
+
+Limits on filling while reindexing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``limit`` and ``tolerance`` arguments provide additional control over
+filling while reindexing. Limit specifies the maximum count of consecutive
+matches:
+
+.. ipython:: python
+
+ ts2.reindex(ts.index, method='ffill', limit=1)
+
+In contrast, tolerance specifies the maximum distance between the index and
+indexer values:
+
+.. ipython:: python
+
+ ts2.reindex(ts.index, method='ffill', tolerance='1 day')
+
+Notice that when used on a ``DatetimeIndex``, ``TimedeltaIndex`` or
+``PeriodIndex``, ``tolerance`` will be coerced into a ``Timedelta`` if possible.
+This allows you to specify tolerance with appropriate strings.
+
.. _basics.drop:
Dropping labels from an axis
@@ -1109,24 +1175,81 @@ parameter that is by default ``False`` and copies the underlying data. Pass
The Panel class has a related :meth:`~Panel.rename_axis` class which can rename
any of its three axes.
+.. _basics.iteration:
+
Iteration
---------
-Because Series is array-like, basic iteration produces the values. Other data
-structures follow the dict-like convention of iterating over the "keys" of the
-objects. In short:
+The behavior of basic iteration over pandas objects depends on the type.
+When iterating over a Series, it is regarded as array-like, and basic iteration
+produces the values. Other data structures, like DataFrame and Panel,
+follow the dict-like convention of iterating over the "keys" of the
+objects.
- * **Series**: values
- * **DataFrame**: column labels
- * **Panel**: item labels
+In short, basic iteration (``for i in object``) produces:
-Thus, for example:
+* **Series**: values
+* **DataFrame**: column labels
+* **Panel**: item labels
+
+Thus, for example, iterating over a DataFrame gives you the column names:
.. ipython::
- In [0]: for col in df:
- ...: print(col)
- ...:
+ In [0]: df = pd.DataFrame({'col1' : np.random.randn(3), 'col2' : np.random.randn(3)},
+ ...: index=['a', 'b', 'c'])
+
+ In [0]: for col in df:
+ ...: print(col)
+ ...:
+
+Pandas objects also have the dict-like :meth:`~DataFrame.iteritems` method to
+iterate over the (key, value) pairs.
+
+To iterate over the rows of a DataFrame, you can use the following methods:
+
+* :meth:`~DataFrame.iterrows`: Iterate over the rows of a DataFrame as (index, Series) pairs.
+ This converts the rows to Series objects, which can change the dtypes and has some
+ performance implications.
+* :meth:`~DataFrame.itertuples`: Iterate over the rows of a DataFrame as tuples of the values.
+ This is a lot faster than :meth:`~DataFrame.iterrows`, and is in most cases preferable to
+ use to iterate over the values of a DataFrame.
+
+.. warning::
+
+ Iterating through pandas objects is generally **slow**. In many cases,
+ iterating manually over the rows is not needed and can be avoided with
+ one of the following approaches:
+
+ * Look for a *vectorized* solution: many operations can be performed using
+ built-in methods or numpy functions, (boolean) indexing, ...
+
+ * When you have a function that cannot work on the full DataFrame/Series
+ at once, it is better to use :meth:`~DataFrame.apply` instead of iterating
+ over the values. See the docs on :ref:`function application <basics.apply>`.
+
+ * If you need to do iterative manipulations on the values but performance is
+ important, consider writing the inner loop using e.g. cython or numba.
+ See the :ref:`enhancing performance <enhancingperf>` section for some
+ examples of this approach.
+
+.. warning::
+
+ You should **never modify** something you are iterating over.
+ This is not guaranteed to work in all cases. Depending on the
+ data types, the iterator returns a copy and not a view, and writing
+ to it will have no effect!
+
+ For example, in the following case setting the value has no effect:
+
+ .. ipython:: python
+
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
+
+ for index, row in df.iterrows():
+ row['a'] = 10
+
+ df
iteritems
~~~~~~~~~
@@ -1134,9 +1257,9 @@ iteritems
Consistent with the dict-like interface, :meth:`~DataFrame.iteritems` iterates
through key-value pairs:
- * **Series**: (index, scalar value) pairs
- * **DataFrame**: (column, Series) pairs
- * **Panel**: (item, DataFrame) pairs
+* **Series**: (index, scalar value) pairs
+* **DataFrame**: (column, Series) pairs
+* **Panel**: (item, DataFrame) pairs
For example:
@@ -1147,22 +1270,46 @@ For example:
...: print(frame)
...:
-
.. _basics.iterrows:
iterrows
~~~~~~~~
-New in v0.7 is the ability to iterate efficiently through rows of a
-DataFrame with :meth:`~DataFrame.iterrows`. It returns an iterator yielding each
+:meth:`~DataFrame.iterrows` allows you to iterate through the rows of a
+DataFrame as Series objects. It returns an iterator yielding each
index value along with a Series containing the data in each row:
.. ipython::
- In [0]: for row_index, row in df2.iterrows():
+ In [0]: for row_index, row in df.iterrows():
...: print('%s\n%s' % (row_index, row))
...:
+.. note::
+
+ Because :meth:`~DataFrame.iterrows` returns a Series for each row,
+ it does **not** preserve dtypes across the rows (dtypes are
+ preserved across columns for DataFrames). For example,
+
+ .. ipython:: python
+
+ df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
+ df_orig.dtypes
+ row = next(df_orig.iterrows())[1]
+ row
+
+ All values in ``row``, returned as a Series, are now upcasted
+ to floats, also the original integer value in column `int`:
+
+ .. ipython:: python
+
+ row['int'].dtype
+ df_orig['int'].dtype
+
+ To preserve dtypes while iterating over the rows, it is better
+ to use :meth:`~DataFrame.itertuples` which returns tuples of the values
+ and which is generally much faster than ``iterrows``.
+
For instance, a contrived way to transpose the DataFrame would be:
.. ipython:: python
@@ -1174,45 +1321,38 @@ For instance, a contrived way to transpose the DataFrame would be:
df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows()))
print(df2_t)
-.. note::
-
- ``iterrows`` does **not** preserve dtypes across the rows (dtypes are
- preserved across columns for DataFrames). For example,
-
- .. ipython:: python
-
- df_iter = pd.DataFrame([[1, 1.0]], columns=['x', 'y'])
- row = next(df_iter.iterrows())[1]
- print(row['x'].dtype)
- print(df_iter['x'].dtype)
-
itertuples
~~~~~~~~~~
-The :meth:`~DataFrame.itertuples` method will return an iterator yielding a tuple for each row in the
-DataFrame. The first element of the tuple will be the row's corresponding index
-value, while the remaining values are the row values proper.
+The :meth:`~DataFrame.itertuples` method will return an iterator
+yielding a tuple for each row in the DataFrame. The first element
+of the tuple will be the row's corresponding index value,
+while the remaining values are the row values.
For instance,
.. ipython:: python
- for r in df2.itertuples():
- print(r)
+ for row in df.itertuples():
+ print(row)
+
+This method does not convert the row to a Series object but just returns the
+values inside a tuple. Therefore, :meth:`~DataFrame.itertuples` preserves the
+data type of the values and is generally faster than :meth:`~DataFrame.iterrows`.
.. _basics.dt_accessors:
.dt accessor
-~~~~~~~~~~~~
+------------
``Series`` has an accessor to succinctly return datetime like properties for the
-*values* of the Series, if its a datetime/period like Series.
+*values* of the Series, if it is a datetime/period like Series.
This will return a Series, indexed like the existing Series.
.. ipython:: python
# datetime
- s = pd.Series(pd.date_range('20130101 09:10:12',periods=4))
+ s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))
s
s.dt.hour
s.dt.second
@@ -1238,12 +1378,29 @@ You can also chain these types of operations:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
+You can also format datetime values as strings with :meth:`Series.dt.strftime` which
+supports the same format as the standard :meth:`~datetime.datetime.strftime`.
+
+.. ipython:: python
+
+ # DatetimeIndex
+ s = pd.Series(pd.date_range('20130101', periods=4))
+ s
+ s.dt.strftime('%Y/%m/%d')
+
+.. ipython:: python
+
+ # PeriodIndex
+ s = pd.Series(pd.period_range('20130101', periods=4))
+ s
+ s.dt.strftime('%Y/%m/%d')
+
The ``.dt`` accessor works for period and timedelta dtypes.
.. ipython:: python
# period
- s = pd.Series(pd.period_range('20130101', periods=4,freq='D'))
+ s = pd.Series(pd.period_range('20130101', periods=4, freq='D'))
s
s.dt.year
s.dt.day
@@ -1251,7 +1408,7 @@ The ``.dt`` accessor works for period and timedelta dtypes.
.. ipython:: python
# timedelta
- s = pd.Series(pd.timedelta_range('1 day 00:00:05',periods=4,freq='s'))
+ s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s'))
s
s.dt.days
s.dt.seconds
@@ -1364,6 +1521,20 @@ faster than sorting the entire Series and calling ``head(n)`` on the result.
s.nsmallest(3)
s.nlargest(3)
+.. versionadded:: 0.17.0
+
+``DataFrame`` also has the ``nlargest`` and ``nsmallest`` methods.
+
+.. ipython:: python
+
+ df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
+ 'b': list('abdceff'),
+ 'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
+ df.nlargest(3, 'a')
+ df.nlargest(5, ['a', 'c'])
+ df.nsmallest(3, 'a')
+ df.nsmallest(5, ['a', 'c'])
+
.. _basics.multi-index_sorting:
@@ -1522,26 +1693,36 @@ then the more *general* one will be used as the result of the operation.
object conversion
~~~~~~~~~~~~~~~~~
-:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types.
-To force conversion of specific types that are *number like*, e.g. could be a string that represents a number,
-pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise
-they will be set to ``np.nan``.
+.. note::
+
+ The syntax of :meth:`~DataFrame.convert_objects` changed in 0.17.0. See
+ :ref:`API changes `
+ for more details.
+
+:meth:`~DataFrame.convert_objects` is a method that converts columns from
+the ``object`` dtype to datetimes, timedeltas or floats. For example, to
+attempt conversion of object data that are *number like*, e.g. could be a
+string that represents a number, pass ``numeric=True``. By default, this will
+attempt a soft conversion and so will only succeed if the entire column is
+convertible. To force the conversion, add the keyword argument ``coerce=True``.
+This will force strings and number-like objects to be numbers if
+possible, and other values will be set to ``np.nan``.
.. ipython:: python
df3['D'] = '1.'
df3['E'] = '1'
- df3.convert_objects(convert_numeric=True).dtypes
+ df3.convert_objects(numeric=True).dtypes
# same, but specific dtype conversion
df3['D'] = df3['D'].astype('float16')
df3['E'] = df3['E'].astype('int32')
df3.dtypes
-To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``.
+To force conversion to ``datetime64[ns]``, pass ``datetime=True`` and ``coerce=True``.
This will convert any datetime-like object to dates, forcing other values to ``NaT``.
This might be useful if you are reading in data which is mostly dates,
-but occasionally has non-dates intermixed and you want to represent as missing.
+but occasionally contains non-dates that you wish to represent as missing.
.. ipython:: python
@@ -1550,10 +1731,15 @@ but occasionally has non-dates intermixed and you want to represent as missing.
'foo', 1.0, 1, pd.Timestamp('20010104'),
'20010105'], dtype='O')
s
- s.convert_objects(convert_dates='coerce')
+ s.convert_objects(datetime=True, coerce=True)
-In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all
+Without passing ``coerce=True``, :meth:`~DataFrame.convert_objects` will attempt
+*soft* conversion of any *object* dtypes, meaning that if all
the objects in a Series are of the same type, the Series will have that dtype.
+Note that setting ``coerce=True`` does not *convert* arbitrary types to either
+``datetime64[ns]`` or ``timedelta64[ns]``. For example, a series containing string
+dates will not be converted to a series of datetimes. To convert between types,
+see :ref:`converting to timestamps `.
gotchas
~~~~~~~
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 08fc8483762ab..57c1667dca0c3 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -52,7 +52,7 @@
with open("index.rst") as f:
- lines = f.readlines()
+ index_rst_lines = f.readlines()
# only include the slow autosummary feature if we're building the API section
# of the docs
@@ -60,20 +60,21 @@
# JP: added from sphinxdocs
autosummary_generate = False
-if any([re.match("\s*api\s*",l) for l in lines]):
+if any([re.match("\s*api\s*",l) for l in index_rst_lines]):
autosummary_generate = True
-ds = []
+files_to_delete = []
for f in os.listdir(os.path.dirname(__file__)):
- if (not f.endswith(('.rst'))) or (f.startswith('.')) or os.path.basename(f) == 'index.rst':
+ if not f.endswith('.rst') or f.startswith('.') or os.path.basename(f) == 'index.rst':
continue
- _f = f.split('.rst')[0]
- if not any([re.match("\s*%s\s*$" % _f,l) for l in lines]):
- ds.append(f)
+ _file_basename = f.split('.rst')[0]
+ _regex_to_match = "\s*{}\s*$".format(_file_basename)
+ if not any([re.match(_regex_to_match, line) for line in index_rst_lines]):
+ files_to_delete.append(f)
-if ds:
- print("I'm about to DELETE the following:\n%s\n" % list(sorted(ds)))
+if files_to_delete:
+ print("I'm about to DELETE the following:\n%s\n" % list(sorted(files_to_delete)))
sys.stdout.write("WARNING: I'd like to delete those to speed up processing (yes/no)? ")
if PY3:
answer = input()
@@ -81,7 +82,7 @@
answer = raw_input()
if answer.lower().strip() in ('y','yes'):
- for f in ds:
+ for f in files_to_delete:
f = os.path.join(os.path.join(os.path.dirname(__file__),f))
f= os.path.abspath(f)
try:
diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
index 1f58992dba017..4ec2258df56f2 100644
--- a/doc/source/contributing.rst
+++ b/doc/source/contributing.rst
@@ -247,6 +247,8 @@ just checked out. There are two primary methods of doing this.
from your development directory. Thus, you can always be using the development
version on your system without being inside the clone directory.
+.. _contributing.documentation:
+
Contributing to the documentation
=================================
@@ -316,6 +318,13 @@ Some other important things to know about the docs:
output saved) during the doc build. This way, they will always be up to date,
but it makes the doc building a bit more complex.
+The utility script ``scripts/api_rst_coverage.py`` can be used to compare
+the list of methods documented in ``doc/source/api.rst`` (which is used to generate
+the `API Reference `_ page)
+and the actual public methods.
+It will identify methods documented in ``doc/source/api.rst`` that are not actually
+class methods, and existing methods that are not documented in ``doc/source/api.rst``.
+
How to build the pandas documentation
-------------------------------------
@@ -536,10 +545,23 @@ Documenting your code
Changes should be reflected in the release notes located in `doc/source/whatsnew/vx.y.z.txt`.
This file contains an ongoing change log for each release. Add an entry to this file to
document your fix, enhancement or (unavoidable) breaking change. Make sure to include the
-GitHub issue number when adding your entry.
+GitHub issue number when adding your entry (using `` :issue:`1234` `` where `1234` is the
+issue/pull request number).
+
+If your code is an enhancement, it is most likely necessary to add usage
+examples to the existing documentation. This can be done following the section
+regarding documentation :ref:`above <contributing.documentation>`.
+Further, to let users know when this feature was added, the ``versionadded``
+directive is used. The sphinx syntax for that is:
+
+.. code-block:: rst
+
+ .. versionadded:: 0.17.0
-If your code is an enhancement, it is most likely necessary to add usage examples to the
-existing documentation. This can be done following the section regarding documentation.
+This will put the text *New in version 0.17.0* wherever you put the sphinx
+directive. This should also be put in the docstring when adding a new function
+or method (`example `__)
+or a new keyword argument (`example `__).
Contributing your changes to *pandas*
=====================================
diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index f69f926296020..9e7b9ad0b7582 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -745,6 +745,9 @@ Timeseries
`Vectorized Lookup
`__
+`Aggregation and plotting time series
+`__
+
Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series.
`How to rearrange a python pandas DataFrame?
`__
@@ -831,6 +834,9 @@ ignore_index is needed in pandas < v0.13, and depending on df construction
`Join with a criteria based on the values
`__
+`Using searchsorted to merge based on values inside a range
+`__
+
.. _cookbook.plotting:
Plotting
@@ -985,8 +991,14 @@ The :ref:`Excel ` docs
`Reading from a filelike handle
`__
+`Modifying formatting in XlsxWriter output
+`__
+
.. _cookbook.html:
+HTML
+****
+
`Reading HTML tables from a server that cannot handle the default request
header `__
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
index 9221f2685d79b..5a62e7dccea34 100644
--- a/doc/source/dsintro.rst
+++ b/doc/source/dsintro.rst
@@ -1,18 +1,23 @@
.. currentmodule:: pandas
-.. _dsintro:
-
.. ipython:: python
:suppress:
import numpy as np
- from pandas import *
- randn = np.random.randn
np.set_printoptions(precision=4, suppress=True)
- set_option('display.precision', 4, 'display.max_columns', 8)
- options.display.max_rows=15
import pandas as pd
+ pd.set_option('display.precision', 4, 'display.max_columns', 8)
+ pd.options.display.max_rows = 15
+
+ import matplotlib
+ try:
+ matplotlib.style.use('ggplot')
+ except AttributeError:
+ pd.options.display.mpl_style = 'default'
+ import matplotlib.pyplot as plt
+ plt.close('all')
+.. _dsintro:
************************
Intro to Data Structures
@@ -26,9 +31,7 @@ objects. To get started, import numpy and load pandas into your namespace:
.. ipython:: python
import numpy as np
- # will use a lot in examples
- randn = np.random.randn
- from pandas import *
+ import pandas as pd
Here is a basic tenet to keep in mind: **data alignment is intrinsic**. The link
between labels and data will not be broken unless done so explicitly by you.
@@ -36,13 +39,6 @@ between labels and data will not be broken unless done so explicitly by you.
We'll give a brief intro to the data structures, then consider all of the broad
categories of functionality and methods in separate sections.
-When using pandas, we recommend the following import convention:
-
-.. code-block:: python
-
- import pandas as pd
-
-
.. _basics.series:
Series
@@ -60,7 +56,7 @@ labels are collectively referred to as the **index**. The basic method to create
::
- >>> s = Series(data, index=index)
+ >>> s = pd.Series(data, index=index)
Here, ``data`` can be many different things:
@@ -78,11 +74,11 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``.
.. ipython:: python
- s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e'])
+ s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s
s.index
- Series(randn(5))
+ pd.Series(np.random.randn(5))
.. note::
@@ -101,8 +97,8 @@ constructed from the sorted keys of the dict, if possible.
.. ipython:: python
d = {'a' : 0., 'b' : 1., 'c' : 2.}
- Series(d)
- Series(d, index=['b', 'c', 'd', 'a'])
+ pd.Series(d)
+ pd.Series(d, index=['b', 'c', 'd', 'a'])
.. note::
@@ -113,7 +109,7 @@ provided. The value will be repeated to match the length of **index**
.. ipython:: python
- Series(5., index=['a', 'b', 'c', 'd', 'e'])
+ pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])
Series is ndarray-like
~~~~~~~~~~~~~~~~~~~~~~
@@ -211,7 +207,7 @@ Series can also have a ``name`` attribute:
.. ipython:: python
- s = Series(np.random.randn(5), name='something')
+ s = pd.Series(np.random.randn(5), name='something')
s
s.name
@@ -254,13 +250,13 @@ keys.
.. ipython:: python
- d = {'one' : Series([1., 2., 3.], index=['a', 'b', 'c']),
- 'two' : Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
- df = DataFrame(d)
+ d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
+ 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
+ df = pd.DataFrame(d)
df
- DataFrame(d, index=['d', 'b', 'a'])
- DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
+ pd.DataFrame(d, index=['d', 'b', 'a'])
+ pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
The row and column labels can be accessed respectively by accessing the
**index** and **columns** attributes:
@@ -286,8 +282,8 @@ result will be ``range(n)``, where ``n`` is the array length.
d = {'one' : [1., 2., 3., 4.],
'two' : [4., 3., 2., 1.]}
- DataFrame(d)
- DataFrame(d, index=['a', 'b', 'c', 'd'])
+ pd.DataFrame(d)
+ pd.DataFrame(d, index=['a', 'b', 'c', 'd'])
From structured or record array
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -296,12 +292,12 @@ This case is handled identically to a dict of arrays.
.. ipython:: python
- data = np.zeros((2,),dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')])
- data[:] = [(1,2.,'Hello'),(2,3.,"World")]
+ data = np.zeros((2,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')])
+ data[:] = [(1,2.,'Hello'), (2,3.,"World")]
- DataFrame(data)
- DataFrame(data, index=['first', 'second'])
- DataFrame(data, columns=['C', 'A', 'B'])
+ pd.DataFrame(data)
+ pd.DataFrame(data, index=['first', 'second'])
+ pd.DataFrame(data, columns=['C', 'A', 'B'])
.. note::
@@ -316,9 +312,9 @@ From a list of dicts
.. ipython:: python
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
- DataFrame(data2)
- DataFrame(data2, index=['first', 'second'])
- DataFrame(data2, columns=['a', 'b'])
+ pd.DataFrame(data2)
+ pd.DataFrame(data2, index=['first', 'second'])
+ pd.DataFrame(data2, columns=['a', 'b'])
.. _basics.dataframe.from_dict_of_tuples:
@@ -329,11 +325,11 @@ You can automatically create a multi-indexed frame by passing a tuples dictionar
.. ipython:: python
- DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
- ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
- ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
- ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
- ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
+ pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
+ ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
+ ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
+ ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
+ ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
.. _basics.dataframe.from_series:
@@ -376,7 +372,7 @@ For example:
.. ipython:: python
data
- DataFrame.from_records(data, index='C')
+ pd.DataFrame.from_records(data, index='C')
.. _basics.dataframe.from_items:
@@ -391,15 +387,15 @@ of columns:
.. ipython:: python
- DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])])
+ pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])])
If you pass ``orient='index'``, the keys will be the row labels. But in this
case you must also pass the desired column names:
.. ipython:: python
- DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
- orient='index', columns=['one', 'two', 'three'])
+ pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
+ orient='index', columns=['one', 'two', 'three'])
Column selection, addition, deletion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -465,7 +461,7 @@ derived from existing columns.
.. ipython:: python
- iris = read_csv('data/iris.data')
+ iris = pd.read_csv('data/iris.data')
iris.head()
(iris.assign(sepal_ratio = iris['SepalWidth'] / iris['SepalLength'])
@@ -564,8 +560,8 @@ union of the column and row labels.
.. ipython:: python
- df = DataFrame(randn(10, 4), columns=['A', 'B', 'C', 'D'])
- df2 = DataFrame(randn(7, 3), columns=['A', 'B', 'C'])
+ df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
+ df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
df + df2
When doing an operation between DataFrame and Series, the default behavior is
@@ -583,8 +579,8 @@ also contains dates, the broadcasting will be column-wise:
.. ipython:: python
:okwarning:
- index = date_range('1/1/2000', periods=8)
- df = DataFrame(randn(8, 3), index=index, columns=list('ABC'))
+ index = pd.date_range('1/1/2000', periods=8)
+ df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
df
type(df['A'])
df - df['A']
@@ -619,8 +615,8 @@ Boolean operators work as well:
.. ipython:: python
- df1 = DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)
- df2 = DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool)
+ df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)
+ df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool)
df1 & df2
df1 | df2
df1 ^ df2
@@ -660,7 +656,7 @@ Similarly, the dot method on Series implements dot product:
.. ipython:: python
- s1 = Series(np.arange(5,10))
+ s1 = pd.Series(np.arange(5,10))
s1.dot(s1)
DataFrame is not intended to be a drop-in replacement for ndarray as its
@@ -682,7 +678,7 @@ R package):
.. ipython:: python
- baseball = read_csv('data/baseball.csv')
+ baseball = pd.read_csv('data/baseball.csv')
print(baseball)
baseball.info()
@@ -704,21 +700,21 @@ default:
.. ipython:: python
- DataFrame(randn(3, 12))
+ pd.DataFrame(np.random.randn(3, 12))
You can change how much to print on a single row by setting the ``display.width``
option:
.. ipython:: python
- set_option('display.width', 40) # default is 80
+ pd.set_option('display.width', 40) # default is 80
- DataFrame(randn(3, 12))
+ pd.DataFrame(np.random.randn(3, 12))
.. ipython:: python
:suppress:
- reset_option('display.width')
+ pd.reset_option('display.width')
You can also disable this feature via the ``expand_frame_repr`` option.
This will print the table in one block.
@@ -731,8 +727,8 @@ accessed like attributes:
.. ipython:: python
- df = DataFrame({'foo1' : np.random.randn(5),
- 'foo2' : np.random.randn(5)})
+ df = pd.DataFrame({'foo1' : np.random.randn(5),
+ 'foo2' : np.random.randn(5)})
df
df.foo1
@@ -770,9 +766,9 @@ From 3D ndarray with optional axis labels
.. ipython:: python
- wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'],
- major_axis=date_range('1/1/2000', periods=5),
- minor_axis=['A', 'B', 'C', 'D'])
+ wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
+ major_axis=pd.date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
wp
@@ -781,9 +777,9 @@ From dict of DataFrame objects
.. ipython:: python
- data = {'Item1' : DataFrame(randn(4, 3)),
- 'Item2' : DataFrame(randn(4, 2))}
- Panel(data)
+ data = {'Item1' : pd.DataFrame(np.random.randn(4, 3)),
+ 'Item2' : pd.DataFrame(np.random.randn(4, 2))}
+ pd.Panel(data)
Note that the values in the dict need only be **convertible to
DataFrame**. Thus, they can be any of the other valid inputs to DataFrame as
@@ -803,7 +799,7 @@ For example, compare to the construction above:
.. ipython:: python
- Panel.from_dict(data, orient='minor')
+ pd.Panel.from_dict(data, orient='minor')
Orient is especially useful for mixed-type DataFrames. If you pass a dict of
DataFrame objects with mixed-type columns, all of the data will get upcasted to
@@ -811,11 +807,11 @@ DataFrame objects with mixed-type columns, all of the data will get upcasted to
.. ipython:: python
- df = DataFrame({'a': ['foo', 'bar', 'baz'],
- 'b': np.random.randn(3)})
+ df = pd.DataFrame({'a': ['foo', 'bar', 'baz'],
+ 'b': np.random.randn(3)})
df
data = {'item1': df, 'item2': df}
- panel = Panel.from_dict(data, orient='minor')
+ panel = pd.Panel.from_dict(data, orient='minor')
panel['a']
panel['b']
panel['b'].dtypes
@@ -838,8 +834,8 @@ a DataFrame with a two-level index to a Panel.
.. ipython:: python
- midx = MultiIndex(levels=[['one', 'two'], ['x','y']], labels=[[1,1,0,0],[1,0,1,0]])
- df = DataFrame({'A' : [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=midx)
+ midx = pd.MultiIndex(levels=[['one', 'two'], ['x','y']], labels=[[1,1,0,0],[1,0,1,0]])
+ df = pd.DataFrame({'A' : [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=midx)
df.to_panel()
.. _dsintro.panel_item_selection:
@@ -897,7 +893,7 @@ Another way to change the dimensionality of an object is to ``squeeze`` a 1-len
.. ipython:: python
wp.reindex(items=['Item1']).squeeze()
- wp.reindex(items=['Item1'],minor=['B']).squeeze()
+ wp.reindex(items=['Item1'], minor=['B']).squeeze()
Conversion to DataFrame
@@ -910,9 +906,9 @@ method:
.. ipython:: python
- panel = Panel(np.random.randn(3, 5, 4), items=['one', 'two', 'three'],
- major_axis=date_range('1/1/2000', periods=5),
- minor_axis=['a', 'b', 'c', 'd'])
+ panel = pd.Panel(np.random.randn(3, 5, 4), items=['one', 'two', 'three'],
+ major_axis=pd.date_range('1/1/2000', periods=5),
+ minor_axis=['a', 'b', 'c', 'd'])
panel.to_frame()
@@ -931,7 +927,6 @@ containers.
DataFrames
- **minor_axis**: axis 3, it is the **columns** of each of the DataFrames
-
``Panel4D`` is a sub-class of ``Panel``, so most methods that work on Panels are
applicable to Panel4D. The following methods are disabled:
@@ -944,11 +939,11 @@ From 4D ndarray with optional axis labels
.. ipython:: python
- p4d = Panel4D(randn(2, 2, 5, 4),
- labels=['Label1','Label2'],
- items=['Item1', 'Item2'],
- major_axis=date_range('1/1/2000', periods=5),
- minor_axis=['A', 'B', 'C', 'D'])
+ p4d = pd.Panel4D(np.random.randn(2, 2, 5, 4),
+ labels=['Label1','Label2'],
+ items=['Item1', 'Item2'],
+ major_axis=pd.date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
p4d
@@ -957,9 +952,9 @@ From dict of Panel objects
.. ipython:: python
- data = { 'Label1' : Panel({ 'Item1' : DataFrame(randn(4, 3)) }),
- 'Label2' : Panel({ 'Item2' : DataFrame(randn(4, 2)) }) }
- Panel4D(data)
+ data = { 'Label1' : pd.Panel({ 'Item1' : pd.DataFrame(np.random.randn(4, 3)) }),
+ 'Label2' : pd.Panel({ 'Item2' : pd.DataFrame(np.random.randn(4, 2)) }) }
+ pd.Panel4D(data)
Note that the values in the dict need only be **convertible to Panels**.
Thus, they can be any of the other valid inputs to Panel as per above.
@@ -1022,7 +1017,7 @@ Here we slice to a Panel4D.
orders = [ 'cool', 'labels','items','major_axis','minor_axis'],
slices = { 'labels' : 'labels', 'items' : 'items',
'major_axis' : 'major_axis', 'minor_axis' : 'minor_axis' },
- slicer = Panel4D,
+ slicer = pd.Panel4D,
aliases = { 'major' : 'major_axis', 'minor' : 'minor_axis' },
stat_axis = 2)
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index c70b6deade36e..762656ba05bd6 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -80,6 +80,10 @@ The `Vincent `__ project leverages `Vega <
(that in turn, leverages `d3 `__) to create plots . It has great support
for pandas data objects.
+`Plotly `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use.
.. _ecosystem.ide:
@@ -132,19 +136,19 @@ Pandas DataFrames with timeseries indexes.
`pydatastream `_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PyDatastream is a Python interface to the
+PyDatastream is a Python interface to the
`Thomson Dataworks Enterprise (DWE/Datastream) `__
-SOAP API to return indexed Pandas DataFrames or Panels with financial data.
+SOAP API to return indexed Pandas DataFrames or Panels with financial data.
This package requires valid credentials for this API (non free).
`pandaSDMX `_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-pandaSDMX is an extensible library to retrieve and acquire statistical data
-and metadata disseminated in
-`SDMX `_ 2.1. This standard is currently supported by
+pandaSDMX is an extensible library to retrieve and acquire statistical data
+and metadata disseminated in
+`SDMX `_ 2.1. This standard is currently supported by
the European statistics office (Eurostat)
-and the European Central Bank (ECB). Datasets may be returned as pandas Series
-or multi-indexed DataFrames.
+and the European Central Bank (ECB). Datasets may be returned as pandas Series
+or multi-indexed DataFrames.
`fredapi `_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -183,8 +187,16 @@ Out-of-core
-------------
`Blaze `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Blaze provides a standard API for doing computations with various
in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, PyTables,
PySpark.
+
+`Odo `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Odo provides a uniform API for moving data between different formats. It uses
+pandas' own ``read_csv`` for CSV IO and leverages many existing packages such as
+PyTables, h5py, and pymongo to move data between non pandas formats. Its graph
+based approach is also extensible by end users for custom formats that may be
+too specific for the core of odo.
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
index 517c91c93d821..855a459f48cf4 100644
--- a/doc/source/enhancingperf.rst
+++ b/doc/source/enhancingperf.rst
@@ -5,17 +5,14 @@
.. ipython:: python
:suppress:
- import os
- import csv
- from pandas import DataFrame, Series
- import pandas as pd
- pd.options.display.max_rows=15
-
import numpy as np
np.random.seed(123456)
- randn = np.random.randn
- randint = np.random.randint
np.set_printoptions(precision=4, suppress=True)
+ import pandas as pd
+ pd.options.display.max_rows=15
+
+ import os
+ import csv
*********************
@@ -49,7 +46,10 @@ We have a DataFrame to which we want to apply a function row-wise.
.. ipython:: python
- df = DataFrame({'a': randn(1000), 'b': randn(1000),'N': randint(100, 1000, (1000)), 'x': 'x'})
+ df = pd.DataFrame({'a': np.random.randn(1000),
+ 'b': np.random.randn(1000),
+ 'N': np.random.randint(100, 1000, (1000)),
+ 'x': 'x'})
df
Here's the function in pure python:
@@ -94,7 +94,8 @@ hence we'll concentrate our efforts cythonizing these two functions.
Plain cython
~~~~~~~~~~~~
-First we're going to need to import the cython magic function to ipython:
+First we're going to need to import the cython magic function to ipython (for
+cython versions >=0.21 you can use ``%load_ext Cython``):
.. ipython:: python
@@ -306,7 +307,14 @@ Numba works by generating optimized machine code using the LLVM compiler infrast
You will need to install ``numba``. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`.
-We simply take the plain python code from above and annotate with the ``@jit`` decorator.
+.. note::
+
+ As of ``numba`` version 0.20, pandas objects cannot be passed directly to numba-compiled functions. Instead, one must pass the ``numpy`` array underlying the ``pandas`` object to the numba-compiled function as demonstrated below.
+
+Jit
+~~~
+
+Using ``numba`` to just-in-time compile your code. We simply take the plain python code from above and annotate with the ``@jit`` decorator.
.. code-block:: python
@@ -335,16 +343,57 @@ We simply take the plain python code from above and annotate with the ``@jit`` d
def compute_numba(df):
result = apply_integrate_f_numba(df['a'].values, df['b'].values, df['N'].values)
- return Series(result, index=df.index, name='result')
+ return pd.Series(result, index=df.index, name='result')
-Similar to above, we directly pass ``numpy`` arrays directly to the numba function. Further
-we are wrapping the results to provide a nice interface by passing/returning pandas objects.
+Note that we directly pass ``numpy`` arrays to the numba function. ``compute_numba`` is just a wrapper that provides a nicer interface by passing/returning pandas objects.
.. code-block:: python
In [4]: %timeit compute_numba(df)
1000 loops, best of 3: 798 us per loop
+Vectorize
+~~~~~~~~~
+
+``numba`` can also be used to write vectorized functions that do not require the user to explicitly
+loop over the observations of a vector; a vectorized function will be applied to each row automatically.
+Consider the following toy example of doubling each observation:
+
+.. code-block:: python
+
+ import numba
+
+ def double_every_value_nonumba(x):
+ return x*2
+
+ @numba.vectorize
+ def double_every_value_withnumba(x):
+ return x*2
+
+
+ # Custom function without numba
+ In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba)
+ 1000 loops, best of 3: 797 us per loop
+
+ # Standard implementation (faster than a custom function)
+ In [6]: %timeit df['col1_doubled'] = df.a*2
+ 1000 loops, best of 3: 233 us per loop
+
+ # Custom function with numba
+ In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df.a.values)
+ 1000 loops, best of 3: 145 us per loop
+
+Caveats
+~~~~~~~
+
+.. note::
+
+ ``numba`` will execute on any function, but can only accelerate certain classes of functions.
+
+``numba`` is best at accelerating functions that apply numerical functions to numpy arrays. When passed a function that only uses operations it knows how to accelerate, it will execute in ``nopython`` mode.
+
+If ``numba`` is passed a function that includes something it doesn't know how to work with -- a category that currently includes sets, lists, dictionaries, or string functions -- it will revert to ``object mode``. In ``object mode``, numba will execute but your code will not speed up significantly. If you would prefer that ``numba`` throw an error if it cannot compile a function in a way that speeds up your code, pass numba the argument ``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on troubleshooting ``numba`` modes, see the `numba troubleshooting page <http://numba.pydata.org/numba-doc/dev/user/troubleshoot.html>`__.
+
 Read more in the `numba docs <http://numba.pydata.org/>`__.
.. _enhancingperf.eval:
@@ -433,18 +482,13 @@ First let's create a few decent-sized arrays to play with:
.. ipython:: python
- import pandas as pd
- from pandas import DataFrame, Series
- from numpy.random import randn
- import numpy as np
nrows, ncols = 20000, 100
- df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols)) for _ in range(4)]
+ df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)]
Now let's compare adding them together using plain ol' Python versus
:func:`~pandas.eval`:
-
.. ipython:: python
%timeit df1 + df2 + df3 + df4
@@ -467,10 +511,9 @@ Now let's do the same thing but with comparisons:
:func:`~pandas.eval` also works with unaligned pandas objects:
-
.. ipython:: python
- s = Series(randn(50))
+ s = pd.Series(np.random.randn(50))
%timeit df1 + df2 + df3 + df4 + s
.. ipython:: python
@@ -515,7 +558,7 @@ evaluate an expression in the "context" of a :class:`~pandas.DataFrame`.
.. ipython:: python
- df = DataFrame(randn(5, 2), columns=['a', 'b'])
+ df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b'])
df.eval('a + b')
Any expression that is a valid :func:`pandas.eval` expression is also a valid
@@ -530,7 +573,7 @@ it must be a valid Python identifier.
.. ipython:: python
- df = DataFrame(dict(a=range(5), b=range(5, 10)))
+ df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
df.eval('c = a + b')
df.eval('d = a + b + c')
df.eval('a = 1')
@@ -540,7 +583,7 @@ The equivalent in standard Python would be
.. ipython:: python
- df = DataFrame(dict(a=range(5), b=range(5, 10)))
+ df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
df['c'] = df.a + df.b
df['d'] = df.a + df.b + df.c
df['a'] = 1
@@ -555,8 +598,8 @@ For example,
.. code-block:: python
- df = DataFrame(randn(5, 2), columns=['a', 'b'])
- newcol = randn(len(df))
+ df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b'])
+ newcol = np.random.randn(len(df))
df.eval('b + newcol')
UndefinedVariableError: name 'newcol' is not defined
@@ -567,8 +610,8 @@ expression by placing the ``@`` character in front of the name. For example,
.. ipython:: python
- df = DataFrame(randn(5, 2), columns=list('ab'))
- newcol = randn(len(df))
+ df = pd.DataFrame(np.random.randn(5, 2), columns=list('ab'))
+ newcol = np.random.randn(len(df))
df.eval('b + @newcol')
df.query('b < @newcol')
@@ -582,7 +625,7 @@ name in an expression.
.. ipython:: python
- a = randn()
+ a = np.random.randn()
df.query('@a < a')
df.loc[a < df.a] # same as the previous expression
@@ -710,8 +753,8 @@ you have an expression--for example
.. ipython:: python
- df = DataFrame({'strings': np.repeat(list('cba'), 3),
- 'nums': np.repeat(range(3), 3)})
+ df = pd.DataFrame({'strings': np.repeat(list('cba'), 3),
+ 'nums': np.repeat(range(3), 3)})
df
df.query('strings == "a" and nums == 1')
diff --git a/doc/source/faq.rst b/doc/source/faq.rst
index 32290839ad71d..7714d937e15d6 100644
--- a/doc/source/faq.rst
+++ b/doc/source/faq.rst
@@ -8,26 +8,18 @@ Frequently Asked Questions (FAQ)
.. ipython:: python
:suppress:
- from datetime import datetime
import numpy as np
np.random.seed(123456)
- from pandas import *
- options.display.max_rows=15
- randn = np.random.randn
- randint = np.random.randint
np.set_printoptions(precision=4, suppress=True)
- from dateutil.relativedelta import relativedelta
- from pandas.tseries.api import *
- from pandas.tseries.offsets import *
- import matplotlib.pyplot as plt
- plt.close('all')
+ import pandas as pd
+ pd.options.display.max_rows = 15
import matplotlib
try:
matplotlib.style.use('ggplot')
except AttributeError:
- options.display.mpl_style = 'default'
- from pandas.compat import lrange
-
+ pd.options.display.mpl_style = 'default'
+ import matplotlib.pyplot as plt
+ plt.close('all')
.. _df-memory-usage:
@@ -45,11 +37,11 @@ when calling ``df.info()``:
.. ipython:: python
dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
- 'complex128', 'object', 'bool']
+ 'complex128', 'object', 'bool']
n = 5000
data = dict([ (t, np.random.randint(100, size=n).astype(t))
for t in dtypes])
- df = DataFrame(data)
+ df = pd.DataFrame(data)
df['categorical'] = df['object'].astype('category')
df.info()
@@ -89,149 +81,6 @@ representation; i.e., 1KB = 1024 bytes).
See also :ref:`Categorical Memory Usage `.
-
-.. _ref-scikits-migration:
-
-Migrating from scikits.timeseries to pandas >= 0.8.0
-----------------------------------------------------
-
-Starting with pandas 0.8.0, users of scikits.timeseries should have all of the
-features that they need to migrate their code to use pandas. Portions of the
-scikits.timeseries codebase for implementing calendar logic and timespan
-frequency conversions (but **not** resampling, that has all been implemented
-from scratch from the ground up) have been ported to the pandas codebase.
-
-The scikits.timeseries notions of ``Date`` and ``DateArray`` are responsible
-for implementing calendar logic:
-
-::
-
- In [16]: dt = ts.Date('Q', '1984Q3')
-
- # sic
- In [17]: dt
- Out[17]:
-
- In [18]: dt.asfreq('D', 'start')
- Out[18]:
-
- In [19]: dt.asfreq('D', 'end')
- Out[19]:
-
- In [20]: dt + 3
- Out[20]:
-
-``Date`` and ``DateArray`` from scikits.timeseries have been reincarnated in
-pandas ``Period`` and ``PeriodIndex``:
-
-.. ipython:: python
-
- pnow('D') # scikits.timeseries.now()
- Period(year=2007, month=3, day=15, freq='D')
- p = Period('1984Q3')
- p
- p.asfreq('D', 'start')
- p.asfreq('D', 'end')
- (p + 3).asfreq('T') + 6 * 60 + 30
- rng = period_range('1990', '2010', freq='A')
- rng
- rng.asfreq('B', 'end') - 3
-
-.. csv-table::
- :header: "scikits.timeseries", "pandas", "Notes"
- :widths: 20, 20, 60
-
- Date, Period, "A span of time, from yearly through to secondly"
- DateArray, PeriodIndex, "An array of timespans"
- convert, resample, "Frequency conversion in scikits.timeseries"
- convert_to_annual, pivot_annual, "currently supports up to daily frequency, see :issue:`736`"
-
-
-PeriodIndex / DateArray properties and functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The scikits.timeseries ``DateArray`` had a number of information
-properties. Here are the pandas equivalents:
-
-.. csv-table::
- :header: "scikits.timeseries", "pandas", "Notes"
- :widths: 20, 60, 20
-
- get_steps, ``np.diff(idx.values)``,
- has_missing_dates, ``not idx.is_full``,
- is_full, ``idx.is_full``,
- is_valid, ``idx.is_monotonic and idx.is_unique``,
- is_chronological, ``is_monotonic``,
- ``arr.sort_chronologically()``, ``idx.order()``,
-
-Frequency conversion
-~~~~~~~~~~~~~~~~~~~~
-
-Frequency conversion is implemented using the ``resample`` method on Series
-and DataFrame objects with a DatetimeIndex or PeriodIndex. ``resample`` also
-works on panels (3D). Here is some code that resamples daily data to montly:
-
-.. ipython:: python
-
- rng = period_range('Jan-2000', periods=50, freq='M')
- data = Series(np.random.randn(50), index=rng)
- data
- data.resample('A', how=np.mean)
-
-Plotting
-~~~~~~~~
-
-Much of the plotting functionality of scikits.timeseries has been ported and
-adopted to pandas's data structures. For example:
-
-.. ipython:: python
-
- rng = period_range('1987Q2', periods=10, freq='Q-DEC')
- data = Series(np.random.randn(10), index=rng)
-
- @savefig skts_ts_plot.png
- plt.figure(); data.plot()
-
-Converting to and from period format
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Use the ``to_timestamp`` and ``to_period`` instance methods.
-
-Treatment of missing data
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Unlike scikits.timeseries, pandas data structures are not based on NumPy's
-``MaskedArray`` object. Missing data is represented as ``NaN`` in numerical
-arrays and either as ``None`` or ``NaN`` in non-numerical arrays. Implementing
-a version of pandas's data structures that use MaskedArray is possible but
-would require the involvement of a dedicated maintainer. Active pandas
-developers are not interested in this.
-
-Resampling with timestamps and periods
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-``resample`` has a ``kind`` argument which allows you to resample time series
-with a DatetimeIndex to PeriodIndex:
-
-.. ipython:: python
-
- rng = date_range('1/1/2000', periods=200, freq='D')
- data = Series(np.random.randn(200), index=rng)
- data[:10]
- data.index
- data.resample('M', kind='period')
-
-Similarly, resampling from periods to timestamps is possible with an optional
-interval (``'start'`` or ``'end'``) convention:
-
-.. ipython:: python
-
- rng = period_range('Jan-2000', periods=50, freq='M')
- data = Series(np.random.randn(50), index=rng)
- resampled = data.resample('A', kind='timestamp', convention='end')
- resampled.index
-
-
Byte-Ordering Issues
--------------------
Occasionally you may have to deal with data that were created on a machine with
@@ -244,7 +93,7 @@ using something similar to the following:
x = np.array(list(range(10)), '>i4') # big endian
newx = x.byteswap().newbyteorder() # force native byteorder
- s = Series(newx)
+ s = pd.Series(newx)
See `the NumPy documentation on byte order
`__ for more
diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
index addeddcb0bdde..cf4a86d530180 100644
--- a/doc/source/gotchas.rst
+++ b/doc/source/gotchas.rst
@@ -4,13 +4,11 @@
.. ipython:: python
:suppress:
- import os
import numpy as np
- from pandas import *
- options.display.max_rows=15
- randn = np.random.randn
np.set_printoptions(precision=4, suppress=True)
- from pandas.compat import lrange
+ import pandas as pd
+ pd.options.display.max_rows=15
+
*******************
Caveats and Gotchas
@@ -27,7 +25,7 @@ what the result of
.. code-block:: python
- >>> if Series([False, True, False]):
+ >>> if pd.Series([False, True, False]):
...
should be. Should it be ``True`` because it's not zero-length? ``False`` because there are ``False`` values?
@@ -64,10 +62,10 @@ To evaluate single-element pandas objects in a boolean context, use the method `
.. ipython:: python
- Series([True]).bool()
- Series([False]).bool()
- DataFrame([[True]]).bool()
- DataFrame([[False]]).bool()
+ pd.Series([True]).bool()
+ pd.Series([False]).bool()
+ pd.DataFrame([[True]]).bool()
+ pd.DataFrame([[False]]).bool()
Bitwise boolean
~~~~~~~~~~~~~~~
@@ -147,7 +145,7 @@ arrays. For example:
.. ipython:: python
- s = Series([1, 2, 3, 4, 5], index=list('abcde'))
+ s = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
s
s.dtype
@@ -228,9 +226,9 @@ following code will generate exceptions:
.. code-block:: python
- s = Series(range(5))
+ s = pd.Series(range(5))
s[-1]
- df = DataFrame(np.random.randn(5, 4))
+ df = pd.DataFrame(np.random.randn(5, 4))
df
df.ix[-2:]
@@ -255,7 +253,7 @@ consider the following Series:
.. ipython:: python
- s = Series(randn(6), index=list('abcdef'))
+ s = pd.Series(np.random.randn(6), index=list('abcdef'))
s
Suppose we wished to slice from ``c`` to ``e``, using integers this would be
@@ -294,8 +292,8 @@ concise means of selecting data from a pandas object:
.. ipython:: python
- df = DataFrame(randn(6, 4), columns=['one', 'two', 'three', 'four'],
- index=list('abcdef'))
+ df = pd.DataFrame(np.random.randn(6, 4), columns=['one', 'two', 'three', 'four'],
+ index=list('abcdef'))
df
df.ix[['b', 'c', 'e']]
@@ -326,7 +324,7 @@ cases where an index contains, say, both integers and strings:
.. ipython:: python
- s = Series([1, 2, 3], index=['a', 0, 1])
+ s = pd.Series([1, 2, 3], index=['a', 0, 1])
s
s.ix[[0, 1]]
s.reindex([0, 1])
@@ -345,10 +343,10 @@ The use of ``reindex_like`` can potentially change the dtype of a ``Series``.
.. ipython:: python
- series = Series([1, 2, 3])
- x = Series([True])
+ series = pd.Series([1, 2, 3])
+ x = pd.Series([True])
x.dtype
- x = Series([True]).reindex_like(series)
+ x = pd.Series([True]).reindex_like(series)
x.dtype
This is because ``reindex_like`` silently inserts ``NaNs`` and the ``dtype``
@@ -371,10 +369,10 @@ can be represented using a 64-bit integer is limited to approximately 584 years:
.. ipython:: python
- begin = Timestamp.min
+ begin = pd.Timestamp.min
begin
- end = Timestamp.max
+ end = pd.Timestamp.max
end
See :ref:`here ` for ways to represent data outside these bound.
@@ -404,10 +402,10 @@ of the new set of columns rather than the original ones:
print(open('tmp.csv').read())
date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
- df = read_csv('tmp.csv', header=None,
- parse_dates=date_spec,
- keep_date_col=True,
- index_col=0)
+ df = pd.read_csv('tmp.csv', header=None,
+ parse_dates=date_spec,
+ keep_date_col=True,
+ index_col=0)
# index_col=0 refers to the combined column "nominal" and not the original
# first column of 'KORD' strings
@@ -417,6 +415,7 @@ of the new set of columns rather than the original ones:
.. ipython:: python
:suppress:
+ import os
os.remove('tmp.csv')
@@ -569,7 +568,7 @@ using something similar to the following:
x = np.array(list(range(10)), '>i4') # big endian
newx = x.byteswap().newbyteorder() # force native byteorder
- s = Series(newx)
+ s = pd.Series(newx)
See `the NumPy documentation on byte order
`__ for more
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index c9e18b585c764..acddf1bb3fe30 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -6,18 +6,16 @@
import numpy as np
np.random.seed(123456)
- from pandas import *
- options.display.max_rows=15
- randn = np.random.randn
np.set_printoptions(precision=4, suppress=True)
- import matplotlib.pyplot as plt
- plt.close('all')
+ import pandas as pd
+ pd.options.display.max_rows = 15
import matplotlib
try:
matplotlib.style.use('ggplot')
except AttributeError:
- options.display.mpl_style = 'default'
- from pandas.compat import zip
+ pd.options.display.mpl_style = 'default'
+ import matplotlib.pyplot as plt
+ plt.close('all')
*****************************
Group By: split-apply-combine
@@ -105,11 +103,12 @@ consider the following DataFrame:
.. ipython:: python
- df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
- 'foo', 'bar', 'foo', 'foo'],
- 'B' : ['one', 'one', 'two', 'three',
- 'two', 'two', 'one', 'three'],
- 'C' : randn(8), 'D' : randn(8)})
+ df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B' : ['one', 'one', 'two', 'three',
+ 'two', 'two', 'one', 'three'],
+ 'C' : np.random.randn(8),
+ 'D' : np.random.randn(8)})
df
We could naturally group by either the ``A`` or ``B`` columns or both:
@@ -142,7 +141,7 @@ output of aggregation functions will only contain unique index values:
lst = [1, 2, 3, 1, 2, 3]
- s = Series([1, 2, 3, 10, 20, 30], lst)
+ s = pd.Series([1, 2, 3, 10, 20, 30], lst)
grouped = s.groupby(level=0)
@@ -189,7 +188,7 @@ however pass ``sort=False`` for potential speedups:
.. ipython:: python
- df2 = DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})
+ df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})
df2.groupby(['X'], sort=True).sum()
df2.groupby(['X'], sort=False).sum()
@@ -203,10 +202,10 @@ however pass ``sort=False`` for potential speedups:
n = 10
weight = np.random.normal(166, 20, size=n)
height = np.random.normal(60, 10, size=n)
- time = date_range('1/1/2000', periods=n)
+ time = pd.date_range('1/1/2000', periods=n)
gender = tm.choice(['male', 'female'], size=n)
- df = DataFrame({'height': height, 'weight': weight,
- 'gender': gender}, index=time)
+ df = pd.DataFrame({'height': height, 'weight': weight,
+ 'gender': gender}, index=time)
.. ipython:: python
@@ -226,11 +225,12 @@ however pass ``sort=False`` for potential speedups:
.. ipython:: python
:suppress:
- df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
- 'foo', 'bar', 'foo', 'foo'],
- 'B' : ['one', 'one', 'two', 'three',
- 'two', 'two', 'one', 'three'],
- 'C' : randn(8), 'D' : randn(8)})
+ df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B' : ['one', 'one', 'two', 'three',
+ 'two', 'two', 'one', 'three'],
+ 'C' : np.random.randn(8),
+ 'D' : np.random.randn(8)})
.. _groupby.multiindex:
@@ -248,8 +248,8 @@ natural to group by one of the levels of the hierarchy.
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
tuples = list(zip(*arrays))
tuples
- index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
- s = Series(randn(8), index=index)
+ index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
+ s = pd.Series(np.random.randn(8), index=index)
.. ipython:: python
@@ -281,13 +281,13 @@ Also as of v0.6, grouping with multiple levels is supported.
['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
tuples = list(zip(*arrays))
- index = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third'])
- s = Series(randn(8), index=index)
+ index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second', 'third'])
+ s = pd.Series(np.random.randn(8), index=index)
.. ipython:: python
s
- s.groupby(level=['first','second']).sum()
+ s.groupby(level=['first', 'second']).sum()
More on the ``sum`` function and aggregation later.
@@ -499,9 +499,9 @@ to standardize the data within each group:
.. ipython:: python
- index = date_range('10/1/1999', periods=1100)
- ts = Series(np.random.normal(0.5, 2, 1100), index)
- ts = rolling_mean(ts, 100, 100).dropna()
+ index = pd.date_range('10/1/1999', periods=1100)
+ ts = pd.Series(np.random.normal(0.5, 2, 1100), index)
+ ts = pd.rolling_mean(ts, 100, 100).dropna()
ts.head()
ts.tail()
@@ -528,7 +528,7 @@ We can also visually compare the original and transformed data sets.
.. ipython:: python
- compare = DataFrame({'Original': ts, 'Transformed': transformed})
+ compare = pd.DataFrame({'Original': ts, 'Transformed': transformed})
@savefig groupby_transform_plot.png
compare.plot()
@@ -539,11 +539,11 @@ Another common data transform is to replace missing data with the group mean.
:suppress:
cols = ['A', 'B', 'C']
- values = randn(1000, 3)
+ values = np.random.randn(1000, 3)
values[np.random.randint(0, 1000, 100), 0] = np.nan
values[np.random.randint(0, 1000, 50), 1] = np.nan
values[np.random.randint(0, 1000, 200), 2] = np.nan
- data_df = DataFrame(values, columns=cols)
+ data_df = pd.DataFrame(values, columns=cols)
.. ipython:: python
@@ -599,7 +599,7 @@ than 2.
.. ipython:: python
- sf = Series([1, 1, 2, 3, 3, 3])
+ sf = pd.Series([1, 1, 2, 3, 3, 3])
sf.groupby(sf).filter(lambda x: x.sum() > 2)
The argument of ``filter`` must be a function that, applied to the group as a
@@ -610,7 +610,7 @@ with only a couple members.
.. ipython:: python
- dff = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})
+ dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})
dff.groupby('B').filter(lambda x: len(x) > 2)
Alternatively, instead of dropping the offending groups, we can return a
@@ -672,9 +672,9 @@ next). This enables some operations to be carried out rather succinctly:
.. ipython:: python
- tsdf = DataFrame(randn(1000, 3),
- index=date_range('1/1/2000', periods=1000),
- columns=['A', 'B', 'C'])
+ tsdf = pd.DataFrame(np.random.randn(1000, 3),
+ index=pd.date_range('1/1/2000', periods=1000),
+ columns=['A', 'B', 'C'])
tsdf.ix[::2] = np.nan
grouped = tsdf.groupby(lambda x: x.year)
grouped.fillna(method='pad')
@@ -689,8 +689,8 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys:
.. ipython:: python
- s = Series([9, 8, 7, 5, 19, 1, 4.2, 3.3])
- g = Series(list('abababab'))
+ s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3])
+ g = pd.Series(list('abababab'))
gb = s.groupby(g)
gb.nlargest(3)
gb.nsmallest(3)
@@ -721,8 +721,8 @@ The dimension of the returned result can also change:
In [8]: grouped = df.groupby('A')['C']
In [10]: def f(group):
- ....: return DataFrame({'original' : group,
- ....: 'demeaned' : group - group.mean()})
+ ....: return pd.DataFrame({'original' : group,
+ ....: 'demeaned' : group - group.mean()})
....:
In [11]: grouped.apply(f)
@@ -732,8 +732,8 @@ The dimension of the returned result can also change:
.. ipython:: python
def f(x):
- return Series([ x, x**2 ], index = ['x', 'x^s'])
- s = Series(np.random.rand(5))
+ return pd.Series([ x, x**2 ], index = ['x', 'x^s'])
+ s = pd.Series(np.random.rand(5))
s
s.apply(f)
@@ -754,7 +754,7 @@ The dimension of the returned result can also change:
.. ipython:: python
- d = DataFrame({"a":["x", "y"], "b":[1,2]})
+ d = pd.DataFrame({"a":["x", "y"], "b":[1,2]})
def identity(df):
print df
return df
@@ -784,6 +784,8 @@ will be (silently) dropped. Thus, this does not pose any problems:
df.groupby('A').std()
+.. _groupby.missing:
+
NA and NaT group handling
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -800,9 +802,9 @@ can be used as group keys. If so, the order of the levels will be preserved:
.. ipython:: python
- data = Series(np.random.randn(100))
+ data = pd.Series(np.random.randn(100))
- factor = qcut(data, [0, .25, .5, .75, 1.])
+ factor = pd.qcut(data, [0, .25, .5, .75, 1.])
data.groupby(factor).mean()
@@ -811,27 +813,28 @@ can be used as group keys. If so, the order of the levels will be preserved:
Grouping with a Grouper specification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Your may need to specify a bit more data to properly group. You can
+You may need to specify a bit more data to properly group. You can
use the ``pd.Grouper`` to provide this local control.
.. ipython:: python
- import datetime as DT
-
- df = DataFrame({
- 'Branch' : 'A A A A A A A B'.split(),
- 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
- 'Quantity': [1,3,5,1,8,1,9,3],
- 'Date' : [
- DT.datetime(2013,1,1,13,0),
- DT.datetime(2013,1,1,13,5),
- DT.datetime(2013,10,1,20,0),
- DT.datetime(2013,10,2,10,0),
- DT.datetime(2013,10,1,20,0),
- DT.datetime(2013,10,2,10,0),
- DT.datetime(2013,12,2,12,0),
- DT.datetime(2013,12,2,14,0),
- ]})
+ import datetime
+
+ df = pd.DataFrame({
+ 'Branch' : 'A A A A A A A B'.split(),
+ 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+ 'Quantity': [1,3,5,1,8,1,9,3],
+ 'Date' : [
+ datetime.datetime(2013,1,1,13,0),
+ datetime.datetime(2013,1,1,13,5),
+ datetime.datetime(2013,10,1,20,0),
+ datetime.datetime(2013,10,2,10,0),
+ datetime.datetime(2013,10,1,20,0),
+ datetime.datetime(2013,10,2,10,0),
+ datetime.datetime(2013,12,2,12,0),
+ datetime.datetime(2013,12,2,14,0),
+ ]
+ })
df
@@ -860,7 +863,7 @@ Just like for a DataFrame or Series you can call head and tail on a groupby:
.. ipython:: python
- df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+ df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
df
g = df.groupby('A')
@@ -892,7 +895,7 @@ To select from a DataFrame or Series the nth item, use the nth method. This is a
.. ipython:: python
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
g.nth(0)
@@ -917,7 +920,7 @@ As with other methods, passing ``as_index=False``, will achieve a filtration, wh
.. ipython:: python
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A',as_index=False)
g.nth(0)
@@ -927,8 +930,8 @@ You can also select multiple rows from each group by specifying multiple nth val
.. ipython:: python
- business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B')
- df = DataFrame(1, index=business_dates, columns=['a', 'b'])
+ business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')
+ df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, 4th, and last date index for each month
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
@@ -959,7 +962,7 @@ the values in column 1 where the group is "B" are 3 higher on average.
.. ipython:: python
np.random.seed(1234)
- df = DataFrame(np.random.randn(50, 2))
+ df = pd.DataFrame(np.random.randn(50, 2))
df['g'] = np.random.choice(['A', 'B'], size=50)
df.loc[df['g'] == 'B', 1] += 3
@@ -1008,11 +1011,11 @@ column index name will be used as the name of the inserted column:
.. ipython:: python
df = pd.DataFrame({
- 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
- 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
- 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
- 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
- })
+ 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
+ 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
+ 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
+ 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
+ })
def compute_metrics(x):
result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index a1912032bc3bf..38629ee7baaea 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -6,15 +6,10 @@
:suppress:
import numpy as np
- import random
np.random.seed(123456)
- from pandas import *
- options.display.max_rows=15
- import pandas as pd
- randn = np.random.randn
- randint = np.random.randint
np.set_printoptions(precision=4, suppress=True)
- from pandas.compat import range, zip
+ import pandas as pd
+ pd.options.display.max_rows=15
***************************
Indexing and Selecting Data
@@ -126,18 +121,6 @@ the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to
DataFrame; ``df.loc[row_indexer,column_indexer]``
Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]``
-Deprecations
-------------
-
-Beginning with version 0.11.0, it's recommended that you transition away from
-the following methods as they *may* be deprecated in future versions.
-
- - ``irow``
- - ``icol``
- - ``iget_value``
-
-See the section :ref:`Selection by Position ` for substitutes.
-
.. _indexing.basics:
Basics
@@ -162,10 +145,10 @@ indexing functionality:
.. ipython:: python
- dates = date_range('1/1/2000', periods=8)
- df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
+ dates = pd.date_range('1/1/2000', periods=8)
+ df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
- panel = Panel({'one' : df, 'two' : df - df.mean()})
+ panel = pd.Panel({'one' : df, 'two' : df - df.mean()})
panel
.. note::
@@ -208,7 +191,7 @@ as an attribute:
.. ipython:: python
- sa = Series([1,2,3],index=list('abc'))
+ sa = pd.Series([1,2,3],index=list('abc'))
dfa = df.copy()
.. ipython:: python
@@ -307,7 +290,7 @@ Selection By Label
.. ipython:: python
- dfl = DataFrame(np.random.randn(5,4), columns=list('ABCD'), index=date_range('20130101',periods=5))
+ dfl = pd.DataFrame(np.random.randn(5,4), columns=list('ABCD'), index=pd.date_range('20130101',periods=5))
dfl
.. code-block:: python
@@ -333,7 +316,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp
.. ipython:: python
- s1 = Series(np.random.randn(6),index=list('abcdef'))
+ s1 = pd.Series(np.random.randn(6),index=list('abcdef'))
s1
s1.loc['c':]
s1.loc['b']
@@ -349,9 +332,9 @@ With a DataFrame
.. ipython:: python
- df1 = DataFrame(np.random.randn(6,4),
- index=list('abcdef'),
- columns=list('ABCD'))
+ df1 = pd.DataFrame(np.random.randn(6,4),
+ index=list('abcdef'),
+ columns=list('ABCD'))
df1
df1.loc[['a','b','d'],:]
@@ -403,7 +386,7 @@ The ``.iloc`` attribute is the primary access method. The following are valid in
.. ipython:: python
- s1 = Series(np.random.randn(5),index=list(range(0,10,2)))
+ s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2)))
s1
s1.iloc[:3]
s1.iloc[3]
@@ -419,9 +402,9 @@ With a DataFrame
.. ipython:: python
- df1 = DataFrame(np.random.randn(6,4),
- index=list(range(0,12,2)),
- columns=list(range(0,8,2)))
+ df1 = pd.DataFrame(np.random.randn(6,4),
+ index=list(range(0,12,2)),
+ columns=list(range(0,8,2)))
df1
Select via integer slicing
@@ -437,20 +420,14 @@ Select via integer list
df1.iloc[[1,3,5],[1,3]]
-For slicing rows explicitly (equiv to deprecated ``df.irow(slice(1,3))``).
-
.. ipython:: python
df1.iloc[1:3,:]
-For slicing columns explicitly (equiv to deprecated ``df.icol(slice(1,3))``).
-
.. ipython:: python
df1.iloc[:,1:3]
-For getting a scalar via integer position (equiv to deprecated ``df.get_value(1,1)``)
-
.. ipython:: python
# this is also equivalent to ``df1.iat[1,1]``
@@ -472,7 +449,7 @@ Out of range slice indexes are handled gracefully just as in Python/Numpy.
x
x[4:10]
x[8:10]
- s = Series(x)
+ s = pd.Series(x)
s
s.iloc[4:10]
s.iloc[8:10]
@@ -488,7 +465,7 @@ returned)
.. ipython:: python
- dfl = DataFrame(np.random.randn(5,2),columns=list('AB'))
+ dfl = pd.DataFrame(np.random.randn(5,2), columns=list('AB'))
dfl
dfl.iloc[:,2:3]
dfl.iloc[:,1:3]
@@ -516,7 +493,7 @@ A random selection of rows or columns from a Series, DataFrame, or Panel with th
.. ipython :: python
- s = Series([0,1,2,3,4,5])
+ s = pd.Series([0,1,2,3,4,5])
# When no arguments are passed, returns 1 row.
s.sample()
@@ -532,7 +509,7 @@ using the ``replace`` option:
.. ipython :: python
- s = Series([0,1,2,3,4,5])
+ s = pd.Series([0,1,2,3,4,5])
# Without replacement (default):
s.sample(n=6, replace=False)
@@ -547,7 +524,7 @@ to have different probabilities, you can pass the ``sample`` function sampling w
.. ipython :: python
- s = Series([0,1,2,3,4,5])
+ s = pd.Series([0,1,2,3,4,5])
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)
@@ -561,21 +538,21 @@ as a string.
.. ipython :: python
- df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+ df2 = pd.DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
df2.sample(n = 3, weights = 'weight_column')
``sample`` also allows users to sample columns instead of rows using the ``axis`` argument.
.. ipython :: python
- df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+ df3 = pd.DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
df3.sample(n=1, axis=1)
Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object.
.. ipython :: python
- df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+ df4 = pd.DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
# With a given seed, the sample will always draw the same rows.
df4.sample(n=2, random_state=2)
@@ -594,7 +571,7 @@ In the ``Series`` case this is effectively an appending operation
.. ipython:: python
- se = Series([1,2,3])
+ se = pd.Series([1,2,3])
se
se[5] = 5.
se
@@ -603,7 +580,7 @@ A ``DataFrame`` can be enlarged on either axis via ``.loc``
.. ipython:: python
- dfi = DataFrame(np.arange(6).reshape(3,2),
+ dfi = pd.DataFrame(np.arange(6).reshape(3,2),
columns=['A','B'])
dfi
dfi.loc[:,'C'] = dfi.loc[:,'A']
@@ -661,7 +638,7 @@ Using a boolean vector to index a Series works exactly as in a numpy ndarray:
.. ipython:: python
- s = Series(range(-3, 4))
+ s = pd.Series(range(-3, 4))
s
s[s > 0]
s[(s < -1) | (s > 0.5)]
@@ -680,9 +657,9 @@ more complex criteria:
.. ipython:: python
- df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
- 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
- 'c' : randn(7)})
+ df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
+ 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
+ 'c' : np.random.randn(7)})
# only want 'two' or 'three'
criterion = df2['a'].map(lambda x: x.startswith('t'))
@@ -713,7 +690,7 @@ select rows where one or more columns have values you want:
.. ipython:: python
- s = Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64')
+ s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s
s.isin([2, 4, 6])
s[s.isin([2, 4, 6])]
@@ -733,8 +710,8 @@ in the membership check:
.. ipython:: python
- s_mi = Series(np.arange(6),
- index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
+ s_mi = pd.Series(np.arange(6),
+ index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
@@ -746,8 +723,8 @@ wherever the element is in the sequence of values.
.. ipython:: python
- df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
- 'ids2': ['a', 'n', 'c', 'n']})
+ df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
+ 'ids2': ['a', 'n', 'c', 'n']})
values = ['a', 'b', 1, 3]
@@ -801,8 +778,8 @@ Equivalent is ``df.where(df < 0)``
.. ipython:: python
:suppress:
- dates = date_range('1/1/2000', periods=8)
- df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
+ dates = pd.date_range('1/1/2000', periods=8)
+ df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
.. ipython:: python
@@ -889,16 +866,10 @@ method that allows selection using an expression.
You can get the value of the frame where column ``b`` has values
between the values of columns ``a`` and ``c``. For example:
-.. ipython:: python
- :suppress:
-
- from numpy.random import randint, rand
- np.random.seed(1234)
-
.. ipython:: python
n = 10
- df = DataFrame(rand(n, 3), columns=list('abc'))
+ df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
# pure python
@@ -912,7 +883,7 @@ with the name ``a``.
.. ipython:: python
- df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc'))
+ df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
df.index.name = 'a'
df
df.query('a < b and b < c')
@@ -928,7 +899,7 @@ If instead you don't want to or cannot name your index, you can use the name
.. ipython:: python
- df = DataFrame(randint(n, size=(n, 2)), columns=list('bc'))
+ df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc'))
df
df.query('index < b < c')
@@ -946,7 +917,7 @@ If instead you don't want to or cannot name your index, you can use the name
.. ipython:: python
- df = DataFrame({'a': randint(5, size=5)})
+ df = pd.DataFrame({'a': np.random.randint(5, size=5)})
df.index.name = 'a'
df.query('a > 2') # uses the column 'a', not the index
@@ -970,23 +941,20 @@ You can also use the levels of a ``DataFrame`` with a
.. ipython:: python
- import pandas.util.testing as tm
-
n = 10
- colors = tm.choice(['red', 'green'], size=n)
- foods = tm.choice(['eggs', 'ham'], size=n)
+ colors = np.random.choice(['red', 'green'], size=n)
+ foods = np.random.choice(['eggs', 'ham'], size=n)
colors
foods
- index = MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
- df = DataFrame(randn(n, 2), index=index)
+ index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
+ df = pd.DataFrame(np.random.randn(n, 2), index=index)
df
df.query('color == "red"')
If the levels of the ``MultiIndex`` are unnamed, you can refer to them using
special names:
-
.. ipython:: python
df.index.names = [None, None]
@@ -1008,9 +976,9 @@ having to specify which frame you're interested in querying
.. ipython:: python
- df = DataFrame(rand(n, 3), columns=list('abc'))
+ df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
- df2 = DataFrame(rand(n + 2, 3), columns=df.columns)
+ df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
df2
expr = '0.0 <= a <= c <= 0.5'
map(lambda frame: frame.query(expr), [df, df2])
@@ -1022,7 +990,7 @@ Full numpy-like syntax
.. ipython:: python
- df = DataFrame(randint(n, size=(n, 3)), columns=list('abc'))
+ df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
df.query('(a < b) & (b < c)')
df[(df.a < df.b) & (df.b < df.c)]
@@ -1065,8 +1033,9 @@ The ``in`` and ``not in`` operators
.. ipython:: python
# get all rows where columns "a" and "b" have overlapping values
- df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
- 'c': randint(5, size=12), 'd': randint(9, size=12)})
+ df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
+ 'c': np.random.randint(5, size=12),
+ 'd': np.random.randint(9, size=12)})
df
df.query('a in b')
@@ -1139,8 +1108,8 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator.
.. ipython:: python
- df = DataFrame(rand(n, 3), columns=list('abc'))
- df['bools'] = rand(len(df)) > 0.5
+ df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
+ df['bools'] = np.random.rand(len(df)) > 0.5
df.query('~bools')
df.query('not bools')
df.query('not bools') == df[~df.bools]
@@ -1192,7 +1161,7 @@ floating point values generated using ``numpy.random.randn()``.
.. ipython:: python
:suppress:
- df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
+ df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df2 = df.copy()
@@ -1209,28 +1178,45 @@ takes as an argument the columns to use to identify duplicated rows.
- ``drop_duplicates`` removes duplicate rows.
By default, the first observed row of a duplicate set is considered unique, but
-each method has a ``take_last`` parameter that indicates the last observed row
-should be taken instead.
+each method has a ``keep`` parameter to specify targets to be kept.
+
+- ``keep='first'`` (default): mark / drop duplicates except for the first occurrence.
+- ``keep='last'``: mark / drop duplicates except for the last occurrence.
+- ``keep=False``: mark / drop all duplicates.
.. ipython:: python
- df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
- 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
- 'c' : np.random.randn(7)})
- df2.duplicated(['a','b'])
- df2.drop_duplicates(['a','b'])
- df2.drop_duplicates(['a','b'], take_last=True)
+ df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
+ 'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
+ 'c': np.random.randn(7)})
+ df2
+ df2.duplicated('a')
+ df2.duplicated('a', keep='last')
+ df2.duplicated('a', keep=False)
+ df2.drop_duplicates('a')
+ df2.drop_duplicates('a', keep='last')
+ df2.drop_duplicates('a', keep=False)
-An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.
+Also, you can pass a list of columns to identify duplications.
.. ipython:: python
- df3 = df2.set_index('b')
- df3
- df3.groupby(level=0).first()
+ df2.duplicated(['a', 'b'])
+ df2.drop_duplicates(['a', 'b'])
- # a bit more verbose
- df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
+To drop duplicates by index value, use ``Index.duplicated`` then perform slicing.
+Same options are available in ``keep`` parameter.
+
+.. ipython:: python
+
+ df3 = pd.DataFrame({'a': np.arange(6),
+ 'b': np.random.randn(6)},
+ index=['a', 'a', 'b', 'c', 'b', 'a'])
+ df3
+ df3.index.duplicated()
+ df3[~df3.index.duplicated()]
+ df3[~df3.index.duplicated(keep='last')]
+ df3[~df3.index.duplicated(keep=False)]
.. _indexing.dictionarylike:
@@ -1242,7 +1228,7 @@ default value.
.. ipython:: python
- s = Series([1,2,3], index=['a','b','c'])
+ s = pd.Series([1,2,3], index=['a','b','c'])
s.get('a') # equivalent to s['a']
s.get('x', default=-1)
@@ -1267,7 +1253,7 @@ numpy array. For instance,
.. ipython:: python
- dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D'])
+ dflookup = pd.DataFrame(np.random.rand(20,4), columns = ['A','B','C','D'])
dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D'])
.. _indexing.class:
@@ -1287,7 +1273,7 @@ lookups, data alignment, and reindexing. The easiest way to create an
.. ipython:: python
- index = Index(['e', 'd', 'a', 'b'])
+ index = pd.Index(['e', 'd', 'a', 'b'])
index
'd' in index
@@ -1296,26 +1282,26 @@ You can also pass a ``name`` to be stored in the index:
.. ipython:: python
- index = Index(['e', 'd', 'a', 'b'], name='something')
+ index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index.name
The name, if set, will be shown in the console display:
.. ipython:: python
- index = Index(list(range(5)), name='rows')
- columns = Index(['A', 'B', 'C'], name='cols')
- df = DataFrame(np.random.randn(5, 3), index=index, columns=columns)
+ index = pd.Index(list(range(5)), name='rows')
+ columns = pd.Index(['A', 'B', 'C'], name='cols')
+ df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)
df
df['A']
+.. _indexing.set_metadata:
+
Setting metadata
~~~~~~~~~~~~~~~~
.. versionadded:: 0.13.0
-.. _indexing.set_metadata:
-
Indexes are "mostly immutable", but it is possible to set and change their
metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and
``labels``).
@@ -1328,7 +1314,7 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes.
.. ipython:: python
- ind = Index([1, 2, 3])
+ ind = pd.Index([1, 2, 3])
ind.rename("apple")
ind
ind.set_names(["apple"], inplace=True)
@@ -1342,8 +1328,7 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes.
.. ipython:: python
-
- index = MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
+ index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index
index.levels[1]
index.set_levels(["a", "b"], level=1)
@@ -1364,8 +1349,8 @@ operators. Difference is provided via the ``.difference()`` method.
.. ipython:: python
- a = Index(['c', 'b', 'a'])
- b = Index(['c', 'e', 'd'])
+ a = pd.Index(['c', 'b', 'a'])
+ b = pd.Index(['c', 'e', 'd'])
a | b
a & b
a.difference(b)
@@ -1377,8 +1362,8 @@ with duplicates dropped.
.. ipython:: python
- idx1 = Index([1, 2, 3, 4])
- idx2 = Index([2, 3, 4, 5])
+ idx1 = pd.Index([1, 2, 3, 4])
+ idx2 = pd.Index([2, 3, 4, 5])
idx1.sym_diff(idx2)
idx1 ^ idx2
@@ -1401,10 +1386,10 @@ indexed DataFrame:
.. ipython:: python
:suppress:
- data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'],
- 'b' : ['one', 'two', 'one', 'two'],
- 'c' : ['z', 'y', 'x', 'w'],
- 'd' : [1., 2., 3, 4]})
+ data = pd.DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'],
+ 'b' : ['one', 'two', 'one', 'two'],
+ 'c' : ['z', 'y', 'x', 'w'],
+ 'd' : [1., 2., 3, 4]})
.. ipython:: python
@@ -1482,12 +1467,12 @@ When setting values in a pandas object, care must be taken to avoid what is call
.. ipython:: python
- dfmi = DataFrame([list('abcd'),
- list('efgh'),
- list('ijkl'),
- list('mnop')],
- columns=MultiIndex.from_product([['one','two'],
- ['first','second']]))
+ dfmi = pd.DataFrame([list('abcd'),
+ list('efgh'),
+ list('ijkl'),
+ list('mnop')],
+ columns=pd.MultiIndex.from_product([['one','two'],
+ ['first','second']]))
dfmi
Compare these two access methods:
@@ -1543,9 +1528,9 @@ which can take the values ``['raise','warn',None]``, where showing a warning is
.. ipython:: python
:okwarning:
- dfb = DataFrame({'a' : ['one', 'one', 'two',
- 'three', 'two', 'one', 'six'],
- 'c' : np.arange(7)})
+ dfb = pd.DataFrame({'a' : ['one', 'one', 'two',
+ 'three', 'two', 'one', 'six'],
+ 'c' : np.arange(7)})
# This will show the SettingWithCopyWarning
# but the frame values will be set
@@ -1573,7 +1558,7 @@ This is the correct access method
.. ipython:: python
- dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]})
+ dfc = pd.DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]})
dfc.loc[0,'A'] = 11
dfc
diff --git a/doc/source/install.rst b/doc/source/install.rst
index b3f86db5e3e59..42cfd95becabb 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -18,7 +18,7 @@ Instructions for installing from source,
Python version support
----------------------
-Officially Python 2.6, 2.7, 3.2, 3.3, and 3.4.
+Officially Python 2.6, 2.7, 3.3, and 3.4.
Installing pandas
-----------------
@@ -153,7 +153,8 @@ and can take a few minutes to complete.
Installing using your Linux distribution's package manager.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
+The commands in this table will install pandas for Python 2 from your distribution.
+To install pandas for Python 3 you may need to use the package ``python3-pandas``.
.. csv-table::
:header: "Distribution", "Status", "Download / Repository Link", "Install method"
@@ -212,6 +213,7 @@ installed), make sure you have `nose
Dependencies
------------
+* `setuptools `__
* `NumPy `__: 1.7.0 or higher
* `python-dateutil `__ 1.5 or higher
* `pytz `__
@@ -249,10 +251,9 @@ Optional Dependencies
* `statsmodels `__
* Needed for parts of :mod:`pandas.stats`
* `openpyxl `__, `xlrd/xlwt `__
- * openpyxl version 1.6.1 or higher, but lower than 2.0.0
* Needed for Excel I/O
* `XlsxWriter `__
- * Alternative Excel writer.
+ * Alternative Excel writer
* `boto `__: necessary for Amazon S3
access.
* `blosc `__: for msgpack compression using ``blosc``
@@ -267,11 +268,11 @@ Optional Dependencies
installation.
* Google's `python-gflags `__
and `google-api-python-client `__
- * Needed for :mod:`~pandas.io.gbq`
+ * Needed for :mod:`~pandas.io.gbq`
* `setuptools `__
- * Needed for :mod:`~pandas.io.gbq` (specifically, it utilizes `pkg_resources`)
+ * Needed for :mod:`~pandas.io.gbq` (specifically, it utilizes `pkg_resources`)
* `httplib2 `__
- * Needed for :mod:`~pandas.io.gbq`
+ * Needed for :mod:`~pandas.io.gbq`
* One of the following combinations of libraries is needed to use the
top-level :func:`~pandas.io.html.read_html` function:
diff --git a/doc/source/internals.rst b/doc/source/internals.rst
index 8b4f7360fc235..3d96b93de4cc9 100644
--- a/doc/source/internals.rst
+++ b/doc/source/internals.rst
@@ -6,15 +6,10 @@
:suppress:
import numpy as np
- import random
np.random.seed(123456)
- from pandas import *
- options.display.max_rows=15
- import pandas as pd
- randn = np.random.randn
- randint = np.random.randint
np.set_printoptions(precision=4, suppress=True)
- from pandas.compat import range, zip
+ import pandas as pd
+ pd.options.display.max_rows = 15
*********
Internals
@@ -40,7 +35,7 @@ containers for the axis labels:
- ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values)
- ``PeriodIndex``: An Index object with Period elements
-These are range generates to make the creation of a regular index easy:
+There are functions that make the creation of a regular index easy:
- ``date_range``: fixed frequency date range generated from a time rule or
DateOffset. An ndarray of Python datetime objects
@@ -81,7 +76,7 @@ integer **labels**, and the level **names**:
.. ipython:: python
- index = MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
+ index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index
index.levels
index.labels
@@ -198,7 +193,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
Define Original Properties
~~~~~~~~~~~~~~~~~~~~~~~~~~
-To let original data structures have additional properties, you should let ``pandas`` knows what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways:
+To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways:
1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
2. Define ``_metadata`` for normal properties which will be passed to manipulation results.
@@ -210,7 +205,7 @@ Below is an example to define 2 original properties, "internal_cache" as a tempo
class SubclassedDataFrame2(DataFrame):
# temporary properties
- _internal_names = DataFrame._internal_names + ['internal_cache']
+ _internal_names = pd.DataFrame._internal_names + ['internal_cache']
_internal_names_set = set(_internal_names)
# normal properties
@@ -244,5 +239,3 @@ Below is an example to define 2 original properties, "internal_cache" as a tempo
# properties defined in _metadata are retained
>>> df[['A', 'B']].added_property
property
-
-
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 73a2f2f1d3531..2f2c4c7566413 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -41,6 +41,7 @@ object.
* :ref:`read_html`
* :ref:`read_gbq` (experimental)
* :ref:`read_stata`
+ * :ref:`read_sas`
* :ref:`read_clipboard`
* :ref:`read_pickle`
@@ -115,7 +116,7 @@ They can take a number of arguments:
as the index.
- ``names``: List of column names to use as column names. To replace header
existing in file, explicitly pass ``header=0``.
- - ``na_values``: optional list of strings to recognize as NaN (missing
+ - ``na_values``: optional string or list of strings to recognize as NaN (missing
values), either in addition to or in lieu of the default set.
- ``true_values``: list of strings to recognize as ``True``
- ``false_values``: list of strings to recognize as ``False``
@@ -723,7 +724,8 @@ NA Values
~~~~~~~~~
To control which values are parsed as missing values (which are signified by ``NaN``), specifiy a
-list of strings in ``na_values``. If you specify a number (a ``float``, like ``5.0`` or an ``integer`` like ``5``),
+string in ``na_values``. If you specify a list of strings, then all values in
+it are considered to be missing values. If you specify a number (a ``float``, like ``5.0`` or an ``integer`` like ``5``),
the corresponding equivalent values will also imply a missing value (in this case effectively
``[5.0,5]`` are recognized as ``NaN``.
@@ -2130,7 +2132,9 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
df1.to_excel(writer, sheet_name='Sheet1')
df2.to_excel(writer, sheet_name='Sheet2')
-.. note:: Wringing a little more performance out of ``read_excel``
+.. note::
+
+ Wringing a little more performance out of ``read_excel``
Internally, Excel stores all numeric data as floats. Because this can
produce unexpected behavior when reading in data, pandas defaults to trying
to convert integers to floats if it doesn't lose information (``1.0 -->
@@ -2182,6 +2186,45 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:
df.to_excel('path_to_file.xlsx', sheet_name='Sheet1')
+.. _io.excel_writing_buffer:
+
+Writing Excel Files to Memory
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.17.0
+
+Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or
+``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`.
+
+.. code-block:: python
+
+ # Safe import for either Python 2.x or 3.x
+ try:
+ from io import BytesIO
+ except ImportError:
+ from cStringIO import StringIO as BytesIO
+
+ bio = BytesIO()
+
+ # By setting the 'engine' in the ExcelWriter constructor.
+ writer = ExcelWriter(bio, engine='xlsxwriter')
+ df.to_excel(writer, sheet_name='Sheet1')
+
+ # Save the workbook
+ writer.save()
+
+ # Seek to the beginning and read to copy the workbook to a variable in memory
+ bio.seek(0)
+ workbook = bio.read()
+
+.. note::
+
+ ``engine`` is optional but recommended. Setting the engine determines
+ the version of workbook produced. Setting ``engine='xlwt'`` will produce an
+ Excel 2003-format workbook (xls). Using either ``'openpyxl'`` or
+ ``'xlsxwriter'`` will produce an Excel 2007-format workbook (xlsx). If
+ omitted, an Excel 2007-formatted workbook is produced.
+
.. _io.clipboard:
Clipboard
@@ -2365,9 +2408,13 @@ for some advanced strategies
As of version 0.15.0, pandas requires ``PyTables`` >= 3.0.0. Stores written with prior versions of pandas / ``PyTables`` >= 2.3 are fully compatible (this was the previous minimum ``PyTables`` required version).
.. warning::
-
+
There is a ``PyTables`` indexing bug which may appear when querying stores using an index. If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. Stores created previously will need to be rewritten using the updated version.
+.. warning::
+
+ As of version 0.17.0, ``HDFStore`` will not drop rows that have all missing values by default. Previously, if all values (except the index) were missing, ``HDFStore`` would not write those rows to disk.
+
.. ipython:: python
:suppress:
:okexcept:
@@ -2444,6 +2491,8 @@ Closing a Store, Context Manager
import os
os.remove('store.h5')
+
+
Read/Write API
~~~~~~~~~~~~~~
@@ -2462,6 +2511,65 @@ similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0)
os.remove('store_tl.h5')
+
+As of version 0.17.0, HDFStore will no longer drop rows that are all missing by default. This behavior can be enabled by setting ``dropna=True``.
+
+.. ipython:: python
+ :suppress:
+
+ import os
+
+.. ipython:: python
+
+ df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2],
+ 'col2':[1, np.nan, np.nan]})
+ df_with_missing
+
+ df_with_missing.to_hdf('file.h5', 'df_with_missing',
+ format = 'table', mode='w')
+
+ pd.read_hdf('file.h5', 'df_with_missing')
+
+ df_with_missing.to_hdf('file.h5', 'df_with_missing',
+ format = 'table', mode='w', dropna=True)
+ pd.read_hdf('file.h5', 'df_with_missing')
+
+
+.. ipython:: python
+ :suppress:
+
+ os.remove('file.h5')
+
+This is also true for the major axis of a ``Panel``:
+
+.. ipython:: python
+
+ matrix = [[[np.nan, np.nan, np.nan],[1,np.nan,np.nan]],
+ [[np.nan, np.nan, np.nan], [np.nan,5,6]],
+ [[np.nan, np.nan, np.nan],[np.nan,3,np.nan]]]
+
+ panel_with_major_axis_all_missing = pd.Panel(matrix,
+ items=['Item1', 'Item2','Item3'],
+ major_axis=[1,2],
+ minor_axis=['A', 'B', 'C'])
+
+ panel_with_major_axis_all_missing
+
+ panel_with_major_axis_all_missing.to_hdf('file.h5', 'panel',
+ dropna = True,
+ format='table',
+ mode='w')
+ reloaded = pd.read_hdf('file.h5', 'panel')
+ reloaded
+
+
+.. ipython:: python
+ :suppress:
+
+ os.remove('file.h5')
+
+
+
.. _io.hdf5-fixed:
Fixed Format
@@ -3117,8 +3225,7 @@ Notes & Caveats
``PyTables`` only supports concurrent reads (via threading or
processes). If you need reading and writing *at the same time*, you
need to serialize these operations in a single thread in a single
- process. You will corrupt your data otherwise. See the issue
- (:`2397`) for more information.
+ process. You will corrupt your data otherwise. See (:issue:`2397`) for more information.
- If you use locks to manage write access between multiple processes, you
may want to use :py:func:`~os.fsync` before releasing write locks. For
convenience you can use ``store.flush(fsync=True)`` to do this for you.
@@ -3137,9 +3244,10 @@ Notes & Caveats
.. warning::
``PyTables`` will show a ``NaturalNameWarning`` if a column name
- cannot be used as an attribute selector. Generally identifiers that
- have spaces, start with numbers, or ``_``, or have ``-`` embedded are not considered
- *natural*. These types of identifiers cannot be used in a ``where`` clause
+ cannot be used as an attribute selector.
+ *Natural* identifiers contain only letters, numbers, and underscores,
+ and may not begin with a number.
+ Other identifiers cannot be used in a ``where`` clause
and are generally a bad idea.
DataTypes
@@ -3148,34 +3256,19 @@ DataTypes
``HDFStore`` will map an object dtype to the ``PyTables`` underlying
dtype. This means the following types are known to work:
- - floating : ``float64, float32, float16`` *(using* ``np.nan`` *to
- represent invalid values)*
- - integer : ``int64, int32, int8, uint64, uint32, uint8``
- - bool
- - datetime64[ns] *(using* ``NaT`` *to represent invalid values)*
- - object : ``strings`` *(using* ``np.nan`` *to represent invalid
- values)*
-
-Currently, ``unicode`` and ``datetime`` columns (represented with a
-dtype of ``object``), **WILL FAIL**. In addition, even though a column
-may look like a ``datetime64[ns]``, if it contains ``np.nan``, this
-**WILL FAIL**. You can try to convert datetimelike columns to proper
-``datetime64[ns]`` columns, that possibly contain ``NaT`` to represent
-invalid values. (Some of these issues have been addressed and these
-conversion may not be necessary in future versions of pandas)
-
- .. ipython:: python
-
- import datetime
- df = DataFrame(dict(datelike=Series([datetime.datetime(2001, 1, 1),
- datetime.datetime(2001, 1, 2), np.nan])))
- df
- df.dtypes
-
- # to convert
- df['datelike'] = Series(df['datelike'].values, dtype='M8[ns]')
- df
- df.dtypes
+====================================================== =========================
+Type Represents missing values
+====================================================== =========================
+floating : ``float64, float32, float16`` ``np.nan``
+integer : ``int64, int32, int8, uint64,uint32, uint8``
+boolean
+``datetime64[ns]`` ``NaT``
+``timedelta64[ns]`` ``NaT``
+categorical : see the section below
+object : ``strings`` ``np.nan``
+====================================================== =========================
+
+``unicode`` columns are not supported, and **WILL FAIL**.
.. _io.hdf5-categorical:
@@ -3513,9 +3606,16 @@ below and the SQLAlchemy `documentation /
- # where is relative:
- engine = create_engine('sqlite:///foo.db')
+ # sqlite:///
+ # where is relative:
+ engine = create_engine('sqlite:///foo.db')
- # or absolute, starting with a slash:
- engine = create_engine('sqlite:////absolute/path/to/foo.db')
+ # or absolute, starting with a slash:
+ engine = create_engine('sqlite:////absolute/path/to/foo.db')
For more information see the examples the SQLAlchemy `documentation `__
@@ -3824,8 +3924,8 @@ will produce the dictionary representation of the schema.
.. code-block:: python
- df = pandas.DataFrame({'A': [1.0]})
- gbq.generate_bq_schema(df, default_type='STRING')
+ df = pandas.DataFrame({'A': [1.0]})
+ gbq.generate_bq_schema(df, default_type='STRING')
.. warning::
@@ -3939,8 +4039,11 @@ missing values are represented as ``np.nan``. If ``True``, missing values are
represented using ``StataMissingValue`` objects, and columns containing missing
values will have ``object`` data type.
-:func:`~pandas.read_stata` and :class:`~pandas.io.stata.StataReader` supports .dta
-formats 104, 105, 108, 113-115 (Stata 10-12) and 117 (Stata 13+).
+.. note::
+
+ :func:`~pandas.read_stata` and
+ :class:`~pandas.io.stata.StataReader` support .dta formats 113-115
+ (Stata 10-12), 117 (Stata 13), and 118 (Stata 14).
.. note::
@@ -4018,6 +4121,46 @@ easy conversion to and from pandas.
.. _xray: http://xray.readthedocs.org/
+.. _io.sas:
+
+SAS Format
+----------
+
+.. versionadded:: 0.17.0
+
+The top-level function :func:`read_sas` currently can read (but
+not write) SAS xport (.XPT) format files. Pandas cannot currently
+handle SAS7BDAT files.
+
+XPORT files only contain two value types: ASCII text and double
+precision numeric values. There is no automatic type conversion to
+integers, dates, or categoricals. By default the whole file is read
+and returned as a ``DataFrame``.
+
+Specify a ``chunksize`` or use ``iterator=True`` to obtain an
+``XportReader`` object for incrementally reading the file. The
+``XportReader`` object also has attributes that contain additional
+information about the file and its variables.
+
+Read a SAS XPORT file:
+
+.. code-block:: python
+
+ df = pd.read_sas('sas_xport.xpt')
+
+Obtain an iterator and read an XPORT file 100,000 lines at a time:
+
+.. code-block:: python
+
+ rdr = pd.read_sas('sas_xport.xpt', chunksize=100000)
+ for chunk in rdr:
+ do_something(chunk)
+
+The specification_ for the xport file format is available from the SAS
+web site.
+
+.. _specification: https://support.sas.com/techsup/technote/ts140.pdf
+
.. _io.perf:
Performance Considerations
@@ -4027,14 +4170,16 @@ This is an informal comparison of various IO methods, using pandas 0.13.1.
.. code-block:: python
- In [3]: df = DataFrame(randn(1000000,2),columns=list('AB'))
+ In [1]: df = DataFrame(randn(1000000,2),columns=list('AB'))
+
+ In [2]: df.info()
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
- A 1000000 non-null values
- B 1000000 non-null values
+ A 1000000 non-null float64
+ B 1000000 non-null float64
dtypes: float64(2)
-
+ memory usage: 22.9 MB
Writing
diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
index 04a6302f958a2..51293ca4240c6 100644
--- a/doc/source/missing_data.rst
+++ b/doc/source/missing_data.rst
@@ -68,26 +68,41 @@ detect this value with data of different types: floating point, integer,
boolean, and general object. In many cases, however, the Python ``None`` will
arise and we wish to also consider that "missing" or "null".
-Prior to version v0.10.0 ``inf`` and ``-inf`` were also
-considered to be "null" in computations. This is no longer the case by
-default; use the ``mode.use_inf_as_null`` option to recover it.
+.. note::
+
+ Prior to version v0.10.0 ``inf`` and ``-inf`` were also
+ considered to be "null" in computations. This is no longer the case by
+ default; use the ``mode.use_inf_as_null`` option to recover it.
.. _missing.isnull:
To make detecting missing values easier (and across different array dtypes),
pandas provides the :func:`~pandas.core.common.isnull` and
:func:`~pandas.core.common.notnull` functions, which are also methods on
-``Series`` objects:
+``Series`` and ``DataFrame`` objects:
.. ipython:: python
df2['one']
- isnull(df2['one'])
+ pd.isnull(df2['one'])
df2['four'].notnull()
+ df2.isnull()
+
+.. warning::
+
+ One has to be mindful that in Python (and numpy), ``NaN`` values don't compare equal, but ``None`` values **do**.
+ Note that Pandas/numpy uses the fact that ``np.nan != np.nan``, and treats ``None`` like ``np.nan``.
+
+ .. ipython:: python
-**Summary:** ``NaN`` and ``None`` (in object arrays) are considered
-missing by the ``isnull`` and ``notnull`` functions. ``inf`` and
-``-inf`` are no longer considered missing by default.
+ None == None
+ np.nan == np.nan
+
+ So as compared to above, a scalar equality comparison versus a ``None/np.nan`` doesn't provide useful information.
+
+ .. ipython:: python
+
+ df2['one'] == np.nan
Datetimes
---------
@@ -99,7 +114,7 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``.
.. ipython:: python
df2 = df.copy()
- df2['timestamp'] = Timestamp('20120101')
+ df2['timestamp'] = pd.Timestamp('20120101')
df2
df2.ix[['a','c','h'],['one','timestamp']] = np.nan
df2
@@ -158,10 +173,10 @@ The descriptive statistics and computational methods discussed in the
` and :ref:`here `) are all written to
account for missing data. For example:
- * When summing data, NA (missing) values will be treated as zero
- * If the data are all NA, the result will be NA
- * Methods like **cumsum** and **cumprod** ignore NA values, but preserve them
- in the resulting arrays
+* When summing data, NA (missing) values will be treated as zero
+* If the data are all NA, the result will be NA
+* Methods like **cumsum** and **cumprod** ignore NA values, but preserve them
+ in the resulting arrays
.. ipython:: python
@@ -174,9 +189,14 @@ NA values in GroupBy
~~~~~~~~~~~~~~~~~~~~
NA groups in GroupBy are automatically excluded. This behavior is consistent
-with R, for example.
+with R, for example:
+
+.. ipython:: python
+ df
+ df.groupby('one').mean()
+See the groupby section :ref:`here ` for more information.
Cleaning / filling missing data
--------------------------------
@@ -255,7 +275,7 @@ use case of this is to fill a DataFrame with the mean of that column.
.. ipython:: python
- dff = pd.DataFrame(np.random.randn(10,3),columns=list('ABC'))
+ dff = pd.DataFrame(np.random.randn(10,3), columns=list('ABC'))
dff.iloc[3:5,0] = np.nan
dff.iloc[4:6,1] = np.nan
dff.iloc[5:8,2] = np.nan
@@ -271,7 +291,7 @@ a Series in this case.
.. ipython:: python
- dff.where(notnull(dff),dff.mean(),axis='columns')
+ dff.where(pd.notnull(dff), dff.mean(), axis='columns')
.. _missing_data.dropna:
@@ -316,7 +336,7 @@ performs linear interpolation at missing datapoints.
:suppress:
np.random.seed(123456)
- idx = date_range('1/1/2000', periods=100, freq='BM')
+ idx = pd.date_range('1/1/2000', periods=100, freq='BM')
ts = pd.Series(np.random.randn(100), index=idx)
ts[1:20] = np.nan
ts[60:80] = np.nan
@@ -363,7 +383,7 @@ You can also interpolate with a DataFrame:
.. ipython:: python
df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8],
- 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]})
+ 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]})
df
df.interpolate()
@@ -420,7 +440,7 @@ at the new values.
ser = pd.Series(np.sort(np.random.uniform(size=100)))
# interpolate at new_index
- new_index = ser.index | Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75])
+ new_index = ser.index | pd.Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75])
interp_s = ser.reindex(new_index).interpolate(method='pchip')
interp_s[49:51]
diff --git a/doc/source/options.rst b/doc/source/options.rst
index 9ede87422b21c..26871a11473de 100644
--- a/doc/source/options.rst
+++ b/doc/source/options.rst
@@ -154,7 +154,7 @@ lines are replaced by an ellipsis.
.. ipython:: python
- df=pd.DataFrame(np.random.randn(7,2))
+ df = pd.DataFrame(np.random.randn(7,2))
pd.set_option('max_rows', 7)
df
pd.set_option('max_rows', 5)
@@ -166,7 +166,7 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise.
.. ipython:: python
- df=pd.DataFrame(np.random.randn(5,10))
+ df = pd.DataFrame(np.random.randn(5,10))
pd.set_option('expand_frame_repr', True)
df
pd.set_option('expand_frame_repr', False)
@@ -178,7 +178,7 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise.
.. ipython:: python
- df=pd.DataFrame(np.random.randn(10,10))
+ df = pd.DataFrame(np.random.randn(10,10))
pd.set_option('max_rows', 5)
pd.set_option('large_repr', 'truncate')
df
@@ -192,8 +192,8 @@ of this length or longer will be truncated with an ellipsis.
.. ipython:: python
- df=pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'],
- ['horse', 'cow', 'banana', 'apple']]))
+ df = pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'],
+ ['horse', 'cow', 'banana', 'apple']]))
pd.set_option('max_colwidth',40)
df
pd.set_option('max_colwidth', 6)
@@ -205,7 +205,7 @@ will be given.
.. ipython:: python
- df=pd.DataFrame(np.random.randn(10,10))
+ df = pd.DataFrame(np.random.randn(10,10))
pd.set_option('max_info_columns', 11)
df.info()
pd.set_option('max_info_columns', 5)
@@ -219,7 +219,7 @@ can specify the option ``df.info(null_counts=True)`` to override on showing a pa
.. ipython:: python
- df=pd.DataFrame(np.random.choice([0,1,np.nan],size=(10,10)))
+ df = pd.DataFrame(np.random.choice([0,1,np.nan], size=(10,10)))
df
pd.set_option('max_info_rows', 11)
df.info()
@@ -227,12 +227,12 @@ can specify the option ``df.info(null_counts=True)`` to override on showing a pa
df.info()
pd.reset_option('max_info_rows')
-``display.precision`` sets the output display precision. This is only a
+``display.precision`` sets the output display precision in terms of decimal places. This is only a
suggestion.
.. ipython:: python
- df=pd.DataFrame(np.random.randn(5,5))
+ df = pd.DataFrame(np.random.randn(5,5))
pd.set_option('precision',7)
df
pd.set_option('precision',4)
@@ -244,7 +244,7 @@ precision at which the number is stored.
.. ipython:: python
- df=pd.DataFrame(np.random.randn(6,6))
+ df = pd.DataFrame(np.random.randn(6,6))
pd.set_option('chop_threshold', 0)
df
pd.set_option('chop_threshold', .5)
@@ -256,7 +256,8 @@ Options are 'right', and 'left'.
.. ipython:: python
- df=pd.DataFrame(np.array([np.random.randn(6), np.random.randint(1,9,6)*.1, np.zeros(6)]).T, columns=['A', 'B', 'C'], dtype='float')
+ df = pd.DataFrame(np.array([np.random.randn(6), np.random.randint(1,9,6)*.1, np.zeros(6)]).T,
+ columns=['A', 'B', 'C'], dtype='float')
pd.set_option('colheader_justify', 'right')
df
pd.set_option('colheader_justify', 'left')
@@ -367,9 +368,11 @@ display.notebook_repr_html True When True, IPython notebook will
pandas objects (if it is available).
display.pprint_nest_depth 3 Controls the number of nested levels
to process when pretty-printing
-display.precision 7 Floating point output precision
- (number of significant digits). This is
- only a suggestion
+display.precision 6 Floating point output precision in
+ terms of number of places after the
+ decimal, for regular formatting as well
+ as scientific notation. Similar to
+ numpy's ``precision`` print option
display.show_dimensions truncate Whether to print out dimensions
at the end of DataFrame repr.
If 'truncate' is specified, only
diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst
index da37c92c88ecf..74cdc5a526585 100644
--- a/doc/source/r_interface.rst
+++ b/doc/source/r_interface.rst
@@ -5,8 +5,8 @@
.. ipython:: python
:suppress:
- from pandas import *
- options.display.max_rows=15
+ import pandas as pd
+ pd.options.display.max_rows = 15
******************
@@ -136,10 +136,8 @@ DataFrames into the equivalent R object (that is, **data.frame**):
.. ipython:: python
- from pandas import DataFrame
-
- df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]},
- index=["one", "two", "three"])
+ df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]},
+ index=["one", "two", "three"])
r_dataframe = com.convert_to_r_dataframe(df)
print(type(r_dataframe))
diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst
index 8215414e425fe..e62f4f9387526 100644
--- a/doc/source/timedeltas.rst
+++ b/doc/source/timedeltas.rst
@@ -97,6 +97,8 @@ It will construct Series if the input is a Series, a scalar if the input is scal
to_timedelta(np.arange(5),unit='s')
to_timedelta(np.arange(5),unit='d')
+.. _timedeltas.operations:
+
Operations
----------
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index ce1035e91391a..6f30ff3f51ad5 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -71,6 +71,23 @@ Resample:
ts.resample('D', how='mean')
+.. _timeseries.overview:
+
+Overview
+--------
+
+The following table shows the type of time-related classes pandas can handle and
+how to create them.
+
+================= ============================== ==================================================
+Class Remarks How to create
+================= ============================== ==================================================
+``Timestamp`` Represents a single time stamp ``to_datetime``, ``Timestamp``
+``DatetimeIndex`` Index of ``Timestamps`` ``to_datetime``, ``date_range``, ``DatetimeIndex``
+``Period`` Represents a single time span ``Period``
+``PeriodIndex`` Index of ``Period`` ``period_range``, ``PeriodIndex``
+================= ============================== ==================================================
+
.. _timeseries.representation:
Time Stamps vs. Time Spans
@@ -78,30 +95,45 @@ Time Stamps vs. Time Spans
Time-stamped data is the most basic type of timeseries data that associates
values with points in time. For pandas objects it means using the points in
-time to create the index
+time.
.. ipython:: python
- dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)]
- ts = Series(np.random.randn(3), dates)
-
- type(ts.index)
-
- ts
+ Timestamp(datetime(2012, 5, 1))
+ Timestamp('2012-05-01')
However, in many cases it is more natural to associate things like change
-variables with a time span instead.
+variables with a time span instead. The span represented by ``Period`` can be
+specified explicitly, or inferred from datetime string format.
For example:
.. ipython:: python
- periods = PeriodIndex([Period('2012-01'), Period('2012-02'),
- Period('2012-03')])
+ Period('2011-01')
+
+ Period('2012-05', freq='D')
+
+``Timestamp`` and ``Period`` can be the index. Lists of ``Timestamp`` and
+``Period`` are automatically coerced to ``DatetimeIndex`` and ``PeriodIndex``
+respectively.
+
+.. ipython:: python
+
+ dates = [Timestamp('2012-05-01'), Timestamp('2012-05-02'), Timestamp('2012-05-03')]
+ ts = Series(np.random.randn(3), dates)
+
+ type(ts.index)
+ ts.index
+
+ ts
+
+ periods = [Period('2012-01'), Period('2012-02'), Period('2012-03')]
ts = Series(np.random.randn(3), periods)
type(ts.index)
+ ts.index
ts
@@ -150,24 +182,39 @@ you can pass the ``dayfirst`` flag:
considerably and on versions later then 0.13.0 explicitly specifying
a format string of '%Y%m%d' takes a faster path still.
+If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``.
+Also, ``Timestamp`` can accept string input.
+Note that ``Timestamp`` doesn't accept string parsing options like ``dayfirst``
+or ``format``; use ``to_datetime`` if these are required.
-Invalid Data
-~~~~~~~~~~~~
+.. ipython:: python
-Pass ``coerce=True`` to convert invalid data to ``NaT`` (not a time):
+ to_datetime('2010/11/12')
-.. ipython:: python
+ Timestamp('2010/11/12')
- to_datetime(['2009-07-31', 'asd'])
- to_datetime(['2009-07-31', 'asd'], coerce=True)
+Invalid Data
+~~~~~~~~~~~~
+.. note::
+
+ In version 0.17.0, the default for ``to_datetime`` is now ``errors='raise'``, rather than ``errors='ignore'``. This means
+ that invalid parsing will raise rather than return the original input as in previous versions.
-Take care, ``to_datetime`` may not act as you expect on mixed data:
+Pass ``errors='coerce'`` to convert invalid data to ``NaT`` (not a time):
.. ipython:: python
+ :okexcept:
- to_datetime([1, '1'])
+ # this is the default, raise when unparseable
+ to_datetime(['2009/07/31', 'asd'], errors='raise')
+
+ # return the original input when unparseable
+ to_datetime(['2009/07/31', 'asd'], errors='ignore')
+
+ # return NaT for input when unparseable
+ to_datetime(['2009/07/31', 'asd'], errors='coerce')
Epoch Timestamps
~~~~~~~~~~~~~~~~
@@ -592,6 +639,46 @@ Another example is parameterizing ``YearEnd`` with the specific ending month:
d + YearEnd()
d + YearEnd(month=6)
+
+.. _timeseries.offsetseries:
+
+Using offsets with ``Series`` / ``DatetimeIndex``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Offsets can be used with either a ``Series`` or ``DatetimeIndex`` to
+apply the offset to each element.
+
+.. ipython:: python
+
+ rng = date_range('2012-01-01', '2012-01-03')
+ s = Series(rng)
+ rng
+ rng + DateOffset(months=2)
+ s + DateOffset(months=2)
+ s - DateOffset(months=2)
+
+If the offset class maps directly to a ``Timedelta`` (``Day``, ``Hour``,
+``Minute``, ``Second``, ``Micro``, ``Milli``, ``Nano``) it can be
+used exactly like a ``Timedelta`` - see the
+:ref:`Timedelta section` for more examples.
+
+.. ipython:: python
+
+ s - Day(2)
+ td = s - Series(date_range('2011-12-29', '2011-12-31'))
+ td
+ td + Minute(15)
+
+Note that some offsets (such as ``BQuarterEnd``) do not have a
+vectorized implementation. They can still be used but may
+calculate significantly slower and will raise a ``PerformanceWarning``
+
+.. ipython:: python
+ :okwarning:
+
+ rng + BQuarterEnd()
+
+
.. _timeseries.alias:
Custom Business Days (Experimental)
diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
index 51912b5d6b106..4378d182b3128 100644
--- a/doc/source/visualization.rst
+++ b/doc/source/visualization.rst
@@ -1649,6 +1649,7 @@ values, the resulting grid has two columns and two rows. A histogram is
displayed for each cell of the grid.
.. ipython:: python
+ :okwarning:
plt.figure()
@@ -1680,6 +1681,7 @@ Example below is the same as previous except the plot is set to kernel density
estimation. A ``seaborn`` example is included beneath.
.. ipython:: python
+ :okwarning:
plt.figure()
@@ -1706,6 +1708,7 @@ The plot below shows that it is possible to have two or more plots for the same
data displayed on the same Trellis grid cell.
.. ipython:: python
+ :okwarning:
plt.figure()
@@ -1745,6 +1748,7 @@ Below is a similar plot but with 2D kernel density estimation plot superimposed,
followed by a ``seaborn`` equivalent:
.. ipython:: python
+ :okwarning:
plt.figure()
@@ -1774,6 +1778,7 @@ only uses 'sex' attribute. If the second grouping attribute is not specified,
the plots will be arranged in a column.
.. ipython:: python
+ :okwarning:
plt.figure()
@@ -1792,6 +1797,7 @@ the plots will be arranged in a column.
If the first grouping attribute is not specified the plots will be arranged in a row.
.. ipython:: python
+ :okwarning:
plt.figure()
@@ -1816,6 +1822,7 @@ scale objects to specify these mappings. The list of scale classes is
given below with initialization arguments for quick reference.
.. ipython:: python
+ :okwarning:
plt.figure()
diff --git a/doc/source/whatsnew/v0.11.0.txt b/doc/source/whatsnew/v0.11.0.txt
index befdf848ad23b..50b74fc5af090 100644
--- a/doc/source/whatsnew/v0.11.0.txt
+++ b/doc/source/whatsnew/v0.11.0.txt
@@ -103,6 +103,7 @@ Conversion
Mixed Conversion
.. ipython:: python
+ :okwarning:
df3['D'] = '1.'
df3['E'] = '1'
@@ -116,6 +117,7 @@ Mixed Conversion
Forcing Date coercion (and setting ``NaT`` when not datelike)
.. ipython:: python
+ :okwarning:
from datetime import datetime
s = Series([datetime(2001,1,1,0,0), 'foo', 1.0, 1,
@@ -328,4 +330,3 @@ Enhancements
See the :ref:`full release notes
` or issue tracker
on GitHub for a complete list.
-
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 164ab73def894..039772f68ee85 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -1,14 +1,26 @@
.. _whatsnew_0170:
-v0.17.0 (July 31, 2015)
------------------------
+v0.17.0 (???)
+-------------
This is a major release from 0.16.2 and includes a small number of API changes, several new features,
enhancements, and performance improvements along with a large number of bug fixes. We recommend that all
users upgrade to this version.
+.. warning::
+
+ pandas >= 0.17.0 will no longer support compatibility with Python version 3.2 (:issue:`9118`)
+
Highlights include:
+- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here `
+- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
+ previously this would return the original input, see :ref:`here `
+- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
+ if they are all ``NaN``, see :ref:`here `
+- Support for ``Series.dt.strftime`` to generate formatted strings for datetime-likes, see :ref:`here `
+- Development installed versions of pandas will now have ``PEP440`` compliant version strings (:issue:`9518`)
+- Support for reading SAS xport files, see :ref:`here `
Check the :ref:`API Changes ` and :ref:`deprecations ` before updating.
@@ -21,40 +33,668 @@ Check the :ref:`API Changes ` and :ref:`deprecations `_
+
+.. _whatsnew_0170.enhancements.sas_xport:
+
+Support for SAS XPORT files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`~pandas.io.read_sas` provides support for reading *SAS XPORT* format files. (:issue:`4052`).
+
+.. code-block:: python
+
+ df = pd.read_sas('sas_xport.xpt')
+
+It is also possible to obtain an iterator and read an XPORT file
+incrementally.
+
+.. code-block:: python
+
+ for df in pd.read_sas('sas_xport.xpt', chunksize=10000):
+ do_something(df)
+
+See the :ref:`docs ` for more details.
.. _whatsnew_0170.enhancements.other:
Other enhancements
^^^^^^^^^^^^^^^^^^
+- `read_sql` and `to_sql` can accept database URI as con parameter (:issue:`10214`)
+
+- Enable `read_hdf` to be used without specifying a key when the HDF file contains a single dataset (:issue:`10443`)
+
+- Added functionality to use the ``base`` argument when resampling a ``TimeDeltaIndex`` (:issue:`10530`)
+
+- ``DatetimeIndex`` can be instantiated using strings containing ``NaT`` (:issue:`7599`)
+- The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has been made consistent. (:issue:`7599`)
+
+ Prior to v0.17.0, ``Timestamp`` and ``to_datetime`` may parse year-only datetime-string incorrectly using today's date, otherwise ``DatetimeIndex``
+ uses the beginning of the year. ``Timestamp`` and ``to_datetime`` may raise ``ValueError`` in some types of datetime-string which ``DatetimeIndex``
+ can parse, such as a quarterly string.
+
+ Previous Behavior
+
+ .. code-block:: python
+
+ In [1]: Timestamp('2012Q2')
+ Traceback
+ ...
+ ValueError: Unable to parse 2012Q2
+
+ # Results in today's date.
+ In [2]: Timestamp('2014')
+ Out [2]: 2014-08-12 00:00:00
+
+ v0.17.0 can parse them as below. It works on ``DatetimeIndex`` also.
+
+ New Behaviour
+
+ .. ipython:: python
+
+ Timestamp('2012Q2')
+ Timestamp('2014')
+ DatetimeIndex(['2012Q2', '2014'])
+
+ .. note:: If you want to perform calculations based on today's date, use ``Timestamp.now()`` and ``pandas.tseries.offsets``.
+
+ .. ipython:: python
+
+ import pandas.tseries.offsets as offsets
+ Timestamp.now()
+ Timestamp.now() + offsets.DateOffset(years=1)
+
+- ``to_datetime`` can now accept ``yearfirst`` keyword (:issue:`7599`)
+
+- ``pandas.tseries.offsets`` larger than the ``Day`` offset can now be used with ``Series`` for addition/subtraction (:issue:`10699`). See the :ref:`Documentation ` for more details.
+
+- ``.as_blocks`` will now take a ``copy`` optional argument to return a copy of the data, default is to copy (no change in behavior from prior versions), (:issue:`9607`)
+
+- ``regex`` argument to ``DataFrame.filter`` now handles numeric column names instead of raising ``ValueError`` (:issue:`10384`).
+- ``pd.read_stata`` will now read Stata 118 type files. (:issue:`9882`)
+
+- ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).
+
+- ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`).
+
+- ``read_sql_table`` will now allow reading from views (:issue:`10750`).
+
+- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations ` (:issue:`6511`, :issue:`8505`)
+
+- ``msgpack`` submodule has been updated to 0.4.6 with backward compatibility (:issue:`10581`)
+
+.. ipython :: python
+
+ s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
+ s.drop_duplicates()
+ s.drop_duplicates(keep='last')
+ s.drop_duplicates(keep=False)
+
+
+- Reindex now has a ``tolerance`` argument that allows for finer control of :ref:`basics.limits_on_reindex_fill`:
+
+ .. ipython:: python
+
+ df = pd.DataFrame({'x': range(5), 't': pd.date_range('2000-01-01', periods=5)})
+ df.reindex([0.1, 1.9, 3.5], method='nearest', tolerance=0.2)
+
+ When used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will be coerced into a ``Timedelta`` if possible. This allows you to specify tolerance with a string:
+
+ .. ipython:: python
+
+ df = df.set_index('t')
+ df.reindex(pd.to_datetime(['1999-12-31']), method='nearest', tolerance='1 day')
+
+ ``tolerance`` is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods.
+
.. _whatsnew_0170.api:
+.. _whatsnew_0170.api_breaking:
+
Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. _whatsnew_0170.api_breaking:
+.. _whatsnew_0170.api_breaking.to_datetime:
+
+Changes to to_datetime and to_timedelta
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The default for ``pd.to_datetime`` error handling has changed to ``errors='raise'``. In prior versions it was ``errors='ignore'``.
+Furthermore, the ``coerce`` argument has been deprecated in favor of ``errors='coerce'``. This means that invalid parsing will raise rather than return the original
+input as in previous versions. (:issue:`10636`)
+
+Previous Behavior:
+
+.. code-block:: python
+
+ In [2]: pd.to_datetime(['2009-07-31', 'asd'])
+ Out[2]: array(['2009-07-31', 'asd'], dtype=object)
+
+New Behavior:
+
+.. code-block:: python
+
+ In [3]: pd.to_datetime(['2009-07-31', 'asd'])
+ ValueError: Unknown string format
+
+Of course you can coerce this as well.
+
+.. ipython:: python
+
+ to_datetime(['2009-07-31', 'asd'], errors='coerce')
+
+To keep the previous behaviour, you can use ``errors='ignore'``:
+
+.. ipython:: python
+
+ to_datetime(['2009-07-31', 'asd'], errors='ignore')
+
+Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
+has been deprecated in favor of ``errors='coerce'``.
+
+.. _whatsnew_0170.api_breaking.convert_objects:
+
+Changes to convert_objects
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``DataFrame.convert_objects`` keyword arguments have been shortened. (:issue:`10265`)
+
+ ===================== =============
+ Old New
+ ===================== =============
+ ``convert_dates`` ``datetime``
+ ``convert_numeric`` ``numeric``
+ ``convert_timedelta`` ``timedelta``
+ ===================== =============
+
+Coercing types with ``DataFrame.convert_objects`` is now implemented using the
+keyword argument ``coerce=True``. Previously types were coerced by setting a
+keyword argument to ``'coerce'`` instead of ``True``, as in ``convert_dates='coerce'``.
+
+.. ipython:: python
+
+ df = pd.DataFrame({'i': ['1','2'],
+ 'f': ['apple', '4.2'],
+ 's': ['apple','banana']})
+ df
+
+The old usage of ``DataFrame.convert_objects`` used `'coerce'` along with the
+type.
+
+.. code-block:: python
+
+ In [2]: df.convert_objects(convert_numeric='coerce')
+
+Now the ``coerce`` keyword must be explicitly used.
+
+.. ipython:: python
+
+ df.convert_objects(numeric=True, coerce=True)
+
+In earlier versions of pandas, ``DataFrame.convert_objects`` would not coerce
+numeric types when there were no values convertible to a numeric type. This returns
+the original DataFrame with no conversion. This change alters
+this behavior so that it converts all non-number-like strings to ``NaN``.
+
+.. code-block:: python
+
+ In [1]: df = pd.DataFrame({'s': ['a','b']})
+ In [2]: df.convert_objects(convert_numeric='coerce')
+ Out[2]:
+ s
+ 0 a
+ 1 b
+
+.. ipython:: python
+
+ pd.DataFrame({'s': ['a','b']})
+ df.convert_objects(numeric=True, coerce=True)
+
+In earlier versions of pandas, the default behavior was to try and convert
+datetimes and timestamps. The new default is for ``DataFrame.convert_objects``
+to do nothing, and so it is necessary to pass at least one conversion target
+in the method call.
+
+Changes to Index Comparisons
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Operator equal on Index should behave similarly to Series (:issue:`9947`, :issue:`10637`)
+
+Starting in v0.17.0, comparing ``Index`` objects of different lengths will raise
+a ``ValueError``. This is to be consistent with the behavior of ``Series``.
+
+Previous behavior:
+
+.. code-block:: python
+
+ In [2]: pd.Index([1, 2, 3]) == pd.Index([1, 4, 5])
+ Out[2]: array([ True, False, False], dtype=bool)
+
+ In [3]: pd.Index([1, 2, 3]) == pd.Index([2])
+ Out[3]: array([False, True, False], dtype=bool)
+
+ In [4]: pd.Index([1, 2, 3]) == pd.Index([1, 2])
+ Out[4]: False
+
+ In [5]: pd.Series([1, 2, 3]) == pd.Series([1, 4, 5])
+ Out[5]:
+ 0 True
+ 1 False
+ 2 False
+ dtype: bool
+
+ In [6]: pd.Series([1, 2, 3]) == pd.Series([2])
+ ValueError: Series lengths must match to compare
+
+ In [7]: pd.Series([1, 2, 3]) == pd.Series([1, 2])
+ ValueError: Series lengths must match to compare
+
+New behavior:
+
+.. code-block:: python
+
+ In [8]: pd.Index([1, 2, 3]) == pd.Index([1, 4, 5])
+ Out[8]: array([ True, False, False], dtype=bool)
+
+ In [9]: pd.Index([1, 2, 3]) == pd.Index([2])
+ ValueError: Lengths must match to compare
+
+ In [10]: pd.Index([1, 2, 3]) == pd.Index([1, 2])
+ ValueError: Lengths must match to compare
+
+ In [11]: pd.Series([1, 2, 3]) == pd.Series([1, 4, 5])
+ Out[11]:
+ 0 True
+ 1 False
+ 2 False
+ dtype: bool
+
+ In [12]: pd.Series([1, 2, 3]) == pd.Series([2])
+ ValueError: Series lengths must match to compare
+
+ In [13]: pd.Series([1, 2, 3]) == pd.Series([1, 2])
+ ValueError: Series lengths must match to compare
+
+Note that this is different from the ``numpy`` behavior where a comparison can
+be broadcast:
+
+.. ipython:: python
+
+ np.array([1, 2, 3]) == np.array([1])
+
+or it can return False if broadcasting can not be done:
+
+.. ipython:: python
+
+ np.array([1, 2, 3]) == np.array([1, 2])
+
+Changes to Boolean Comparisons vs. None
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to comparing with ``np.nan``, rather than raise ``TypeError``. xref (:issue:`1079`).
+
+.. ipython:: python
+
+ s = Series(range(3))
+ s.iloc[1] = None
+ s
+
+Previous behavior:
+
+.. code-block:: python
+
+ In [5]: s==None
+ TypeError: Could not compare type with Series
+
+New behavior:
+
+.. ipython:: python
+
+ s==None
+
+Usually you simply want to know which values are null.
+
+.. ipython:: python
+
+ s.isnull()
+
+.. warning::
+
+ You generally will want to use ``isnull/notnull`` for these types of comparisons, as ``isnull/notnull`` tells you which elements are null. One has to be
+ mindful that ``nan's`` don't compare equal, but ``None's`` do. Note that Pandas/numpy uses the fact that ``np.nan != np.nan``, and treats ``None`` like ``np.nan``.
+
+ .. ipython:: python
+
+ None == None
+ np.nan == np.nan
+
+.. _whatsnew_0170.api_breaking.hdf_dropna:
+
+HDFStore dropna behavior
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The default behavior for HDFStore write functions with ``format='table'`` is now to keep rows that are all missing. Previously, the behavior was to drop rows that were all missing save the index. The previous behavior can be replicated using the ``dropna=True`` option. (:issue:`9382`)
+
+Previously:
+
+.. ipython:: python
+
+ df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2],
+ 'col2':[1, np.nan, np.nan]})
+
+ df_with_missing
+
+
+.. code-block:: python
+
+ In [28]:
+ df_with_missing.to_hdf('file.h5', 'df_with_missing', format='table', mode='w')
+
+ pd.read_hdf('file.h5', 'df_with_missing')
+
+ Out [28]:
+ col1 col2
+ 0 0 1
+ 2 2 NaN
+
+
+New behavior:
+
+.. ipython:: python
+ :suppress:
+
+ import os
+
+.. ipython:: python
+
+ df_with_missing.to_hdf('file.h5', 'df_with_missing', format = 'table', mode='w')
+
+ pd.read_hdf('file.h5', 'df_with_missing')
+
+.. ipython:: python
+ :suppress:
+
+ os.remove('file.h5')
+
+See :ref:`documentation ` for more details.
+
+Changes to ``display.precision`` option
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``display.precision`` option has been clarified to refer to decimal places (:issue:`10451`).
+
+Earlier versions of pandas would format floating point numbers to have one less decimal place than the value in
+``display.precision``.
+
+.. code-block:: python
+
+ In [1]: pd.set_option('display.precision', 2)
+
+ In [2]: pd.DataFrame({'x': [123.456789]})
+ Out[2]:
+ x
+ 0 123.5
+
+If interpreting precision as "significant figures" this did work for scientific notation but that same interpretation
+did not work for values with standard formatting. It was also out of step with how numpy handles formatting.
+
+Going forward the value of ``display.precision`` will directly control the number of places after the decimal, for
+regular formatting as well as scientific notation, similar to how numpy's ``precision`` print option works.
+
+.. ipython:: python
+
+ pd.set_option('display.precision', 2)
+ pd.DataFrame({'x': [123.456789]})
+
+To preserve output behavior with prior versions the default value of ``display.precision`` has been reduced to ``6``
+from ``7``.
+
+.. ipython:: python
+ :suppress:
+
+ pd.set_option('display.precision', 6)
+
.. _whatsnew_0170.api_breaking.other:
Other API Changes
^^^^^^^^^^^^^^^^^
+- Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
+- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a Series with a ``CategoricalIndex`` (:issue:`10704`)
+- Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
+- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
+- Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
+- Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
+- ``Categorical.unique`` now returns a new ``Categorical`` whose ``categories`` and ``codes`` are unique, rather than returning ``np.array`` (:issue:`10508`)
+
+ - unordered category: values and categories are sorted by appearance order.
+ - ordered category: values are sorted by appearance order, categories keeps existing order.
+
+ .. ipython :: python
+
+ cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=True)
+ cat
+ cat.unique()
+
+ cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'])
+ cat
+ cat.unique()
+
+- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
+- ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)
+
+ =============================== ===============================================================
+ Behavior Methods
+ =============================== ===============================================================
+ ``return np.nan`` ``weekday``, ``isoweekday``
+ ``return NaT`` ``date``, ``now``, ``replace``, ``to_datetime``, ``today``
+ ``return np.datetime64('NaT')`` ``to_datetime64`` (unchanged)
+ ``raise ValueError`` All other public methods (names not beginning with underscores)
+ =============================== ===============================================================
+
+- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
+
.. _whatsnew_0170.deprecations:
Deprecations
^^^^^^^^^^^^
+.. note:: These indexing functions have been deprecated in the documentation since 0.11.0.
+
+- For ``Series`` the following indexing functions are deprecated (:issue:`10177`).
+
+ ===================== =================================
+ Deprecated Function Replacement
+ ===================== =================================
+ ``.irow(i)`` ``.iloc[i]`` or ``.iat[i]``
+ ``.iget(i)`` ``.iloc[i]``
+ ``.iget_value(i)`` ``.iloc[i]`` or ``.iat[i]``
+ ===================== =================================
+
+- For ``DataFrame`` the following indexing functions are deprecated (:issue:`10177`).
+
+ ===================== =================================
+ Deprecated Function Replacement
+ ===================== =================================
+ ``.irow(i)`` ``.iloc[i]``
+ ``.iget_value(i, j)`` ``.iloc[i, j]`` or ``.iat[i, j]``
+ ``.icol(j)`` ``.iloc[:, j]``
+ ===================== =================================
+
+- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
+- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
+- ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They
+ can easily be replaced by using the ``add`` and ``mul`` methods:
+ ``DataFrame.add(other, fill_value=0)`` and ``DataFrame.mul(other, fill_value=1.)``
+ (:issue:`10735`).
+
.. _whatsnew_0170.prior_deprecations:
Removal of prior version deprecations/changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- Remove use of some deprecated numpy comparison operations, mainly in tests. (:issue:`10569`)
+
+
.. _whatsnew_0170.performance:
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~
+- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
+
+- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
+- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
+- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
+- 8x improvement in ``iloc`` using list-like input (:issue:`10791`)
+- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
+- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
+- Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`)
+- 2x improvement of ``Series.value_counts`` for float dtype (:issue:`10821`)
.. _whatsnew_0170.bug_fixes:
Bug Fixes
~~~~~~~~~
+- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
+- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
+- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
+- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
+- Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`)
+- Bug in ``DataFrame.plot`` raises ``ValueError`` when color name is specified by multiple characters (:issue:`10387`)
+- Bug in ``Index`` construction with a mixed list of tuples (:issue:`10697`)
+- Bug in ``DataFrame.reset_index`` when index contains ``NaT``. (:issue:`10388`)
+- Bug in ``ExcelReader`` when worksheet is empty (:issue:`6403`)
+
+
+- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`)
+
+- Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`)
+- Bug in ``offsets.generate_range`` where ``start`` and ``end`` have finer precision than ``offset`` (:issue:`9907`)
+- Bug in ``pd.rolling_*`` where ``Series.name`` would be lost in the output (:issue:`10565`)
+- Bug in ``stack`` when index or columns are not unique. (:issue:`10417`)
+- Bug in setting a Panel when an axis has a multi-index (:issue:`10360`)
+- Bug in ``USFederalHolidayCalendar`` where ``USMemorialDay`` and ``USMartinLutherKingJr`` were incorrect (:issue:`10278` and :issue:`9760`)
+- Bug in ``.sample()`` where returned object, if set, gives unnecessary ``SettingWithCopyWarning`` (:issue:`10738`)
+- Bug in ``.sample()`` where weights passed as Series were not aligned along axis before being treated positionally, potentially causing problems if weight indices were not aligned with sampled object. (:issue:`10738`)
+
+
+
+- Bug in ``DataFrame.interpolate`` with ``axis=1`` and ``inplace=True`` (:issue:`10395`)
+- Bug in ``io.sql.get_schema`` when specifying multiple columns as primary
+ key (:issue:`10385`).
+
+- Bug in ``groupby(sort=False)`` with datetime-like ``Categorical`` raises ``ValueError`` (:issue:`10505`)
+
+- Bug in ``test_categorical`` on big-endian builds (:issue:`10425`)
+- Bug in ``Series.shift`` and ``DataFrame.shift`` not supporting categorical data (:issue:`9416`)
+- Bug in ``Series.map`` using categorical ``Series`` raises ``AttributeError`` (:issue:`10324`)
+- Bug in ``MultiIndex.get_level_values`` including ``Categorical`` raises ``AttributeError`` (:issue:`10460`)
+- Bug in ``pd.get_dummies`` with ``sparse=True`` not returning ``SparseDataFrame`` (:issue:`10531`)
+- Bug in ``Index`` subtypes (such as ``PeriodIndex``) not returning their own type for ``.drop`` and ``.insert`` methods (:issue:`10620`)
+- Bug in ``algos.outer_join_indexer`` when ``right`` array is empty (:issue:`10618`)
+
+- Bug in ``filter`` (regression from 0.16.0) and ``transform`` when grouping on multiple keys, one of which is datetime-like (:issue:`10114`)
+
+
+
+
+
+- Bug that caused segfault when resampling an empty Series (:issue:`10228`)
+- Bug in ``DatetimeIndex`` and ``PeriodIndex.value_counts`` resets name from its result, but retains in result's ``Index``. (:issue:`10150`)
+- Bug in ``pd.eval`` using ``numexpr`` engine coerces 1 element numpy array to scalar (:issue:`10546`)
+- Bug in ``pd.concat`` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
+- Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`, :issue:`10630`)
+- Bug in ``pd.read_csv`` with kwargs ``index_col=False``, ``index_col=['a', 'b']`` or ``dtype``
+ (:issue:`10413`, :issue:`10467`, :issue:`10577`)
+- Bug in ``Series.from_csv`` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
+- Bug in ``groupby.var`` which caused variance to be inaccurate for small float values (:issue:`10448`)
+- Bug in ``Series.plot(kind='hist')`` Y Label not informative (:issue:`10485`)
+- Bug in ``read_csv`` when using a converter which generates a ``uint8`` type (:issue:`9266`)
+
+- Bug causes memory leak in time-series line and area plot (:issue:`9003`)
+
+
+- Bug in line and kde plot cannot accept multiple colors when ``subplots=True`` (:issue:`9894`)
+- Bug in ``DataFrame.plot`` raises ``ValueError`` when color name is specified by multiple characters (:issue:`10387`)
+
+- Bug in left and right ``align`` of ``Series`` with ``MultiIndex`` may be inverted (:issue:`10665`)
+- Bug in left and right ``join`` with ``MultiIndex`` may be inverted (:issue:`10741`)
+
+- Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`)
+- Bug in ``Categorical`` may not be represented properly when category contains ``tz`` or ``Period`` (:issue:`10713`)
+- Bug in ``Categorical.__iter__`` may not return correct ``datetime`` and ``Period`` (:issue:`10713`)
+
+- Bug in ``read_csv`` with ``engine='c'``: EOF preceded by a comment, blank line, etc. was not handled correctly (:issue:`10728`, :issue:`10548`)
+
+- Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because the website url changed (:issue:`10591`).
+- Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`)
+- Bug in ``io.common.get_filepath_or_buffer`` which caused reading of valid S3 files to fail if the bucket also contained keys for which the user does not have read permission (:issue:`10604`)
+- Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`)
+- Bug in ``Index.take`` may add unnecessary ``freq`` attribute (:issue:`10791`)
+- Bug in ``merge`` with empty ``DataFrame`` may raise ``IndexError`` (:issue:`10824`)
+
+
+- Bug in ``read_csv`` when using the ``nrows`` or ``chunksize`` parameters if file contains only a header line (:issue:`9535`)
+
+- Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`)
+- Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead of the original dtype (:issue:`9431`)
+- Bug in ``DatetimeIndex.take`` and ``TimedeltaIndex.take`` may not raise ``IndexError`` against invalid index (:issue:`10295`)
+- Bug in ``Series([np.nan]).astype('M8[ms]')``, which now returns ``Series([pd.NaT])`` (:issue:`10747`)
+- Bug in ``PeriodIndex.order`` reset freq (:issue:`10295`)
+- Bug in ``iloc`` allowing memory outside bounds of a Series to be accessed with negative integers (:issue:`10779`)
+- Bug in ``read_msgpack`` where encoding is not respected (:issue:`10580`)
+- Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)
diff --git a/ez_setup.py b/ez_setup.py
deleted file mode 100644
index 6f63b856f06c9..0000000000000
--- a/ez_setup.py
+++ /dev/null
@@ -1,264 +0,0 @@
-#!python
-"""Bootstrap setuptools installation
-
-If you want to use setuptools in your package's setup.py, just include this
-file in the same directory with it, and add this to the top of your setup.py::
-
- from ez_setup import use_setuptools
- use_setuptools()
-
-If you want to require a specific version of setuptools, set a download
-mirror, or use an alternate download directory, you can do so by supplying
-the appropriate options to ``use_setuptools()``.
-
-This file can also be run as a script to install or upgrade setuptools.
-"""
-from __future__ import print_function
-import sys
-DEFAULT_VERSION = "0.6c11"
-DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[
- :3]
-
-md5_data = {
- 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
- 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
- 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
- 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
- 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
- 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
- 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
- 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
- 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
- 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
- 'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090',
- 'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4',
- 'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7',
- 'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5',
- 'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de',
- 'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b',
- 'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2',
- 'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086',
- 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
- 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
- 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
- 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
- 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
- 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
- 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
- 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
- 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
- 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
- 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
- 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
- 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
- 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
- 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
- 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
- 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
- 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
- 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
- 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
- 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
- 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
- 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
- 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
-}
-
-import sys
-import os
-try:
- from hashlib import md5
-except ImportError:
- from md5 import md5
-
-
-def _validate_md5(egg_name, data):
- if egg_name in md5_data:
- digest = md5(data).hexdigest()
- if digest != md5_data[egg_name]:
- print((
- "md5 validation of %s failed! (Possible download problem?)"
- % egg_name
- ), file=sys.stderr)
- sys.exit(2)
- return data
-
-
-def use_setuptools(
- version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
- download_delay=15
-):
- """Automatically find/download setuptools and make it available on sys.path
-
- `version` should be a valid setuptools version number that is available
- as an egg for download under the `download_base` URL (which should end with
- a '/'). `to_dir` is the directory where setuptools will be downloaded, if
- it is not already available. If `download_delay` is specified, it should
- be the number of seconds that will be paused before initiating a download,
- should one be required. If an older version of setuptools is installed,
- this routine will print a message to ``sys.stderr`` and raise SystemExit in
- an attempt to abort the calling script.
- """
- was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules
-
- def do_download():
- egg = download_setuptools(
- version, download_base, to_dir, download_delay)
- sys.path.insert(0, egg)
- import setuptools
- setuptools.bootstrap_install_from = egg
- try:
- import pkg_resources
- except ImportError:
- return do_download()
- try:
- pkg_resources.require("setuptools>=" + version)
- return
- except pkg_resources.VersionConflict as e:
- if was_imported:
- print((
- "The required version of setuptools (>=%s) is not available, and\n"
- "can't be installed while this script is running. Please install\n"
- " a more recent version first, using 'easy_install -U setuptools'."
- "\n\n(Currently using %r)"
- ) % (version, e.args[0]), file=sys.stderr)
- sys.exit(2)
- else:
- del pkg_resources, sys.modules['pkg_resources'] # reload ok
- return do_download()
- except pkg_resources.DistributionNotFound:
- return do_download()
-
-
-def download_setuptools(
- version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
- delay=15
-):
- """Download setuptools from a specified location and return its filename
-
- `version` should be a valid setuptools version number that is available
- as an egg for download under the `download_base` URL (which should end
- with a '/'). `to_dir` is the directory where the egg will be downloaded.
- `delay` is the number of seconds to pause before an actual download attempt.
- """
- import urllib2
- import shutil
- egg_name = "setuptools-%s-py%s.egg" % (version, sys.version[:3])
- url = download_base + egg_name
- saveto = os.path.join(to_dir, egg_name)
- src = dst = None
- if not os.path.exists(saveto): # Avoid repeated downloads
- try:
- from distutils import log
- if delay:
- log.warn("""
----------------------------------------------------------------------------
-This script requires setuptools version %s to run (even to display
-help). I will attempt to download it for you (from
-%s), but
-you may need to enable firewall access for this script first.
-I will start the download in %d seconds.
-
-(Note: if this machine does not have network access, please obtain the file
-
- %s
-
-and place it in this directory before rerunning this script.)
----------------------------------------------------------------------------""",
- version, download_base, delay, url
- )
- from time import sleep
- sleep(delay)
- log.warn("Downloading %s", url)
- src = urllib2.urlopen(url)
- # Read/write all in one block, so we don't create a corrupt file
- # if the download is interrupted.
- data = _validate_md5(egg_name, src.read())
- dst = open(saveto, "wb")
- dst.write(data)
- finally:
- if src:
- src.close()
- if dst:
- dst.close()
- return os.path.realpath(saveto)
-
-
-def main(argv, version=DEFAULT_VERSION):
- """Install or upgrade setuptools and EasyInstall"""
- try:
- import setuptools
- except ImportError:
- egg = None
- try:
- egg = download_setuptools(version, delay=0)
- sys.path.insert(0, egg)
- from setuptools.command.easy_install import main
- return main(list(argv) + [egg]) # we're done here
- finally:
- if egg and os.path.exists(egg):
- os.unlink(egg)
- else:
- if setuptools.__version__ == '0.0.1':
- print((
- "You have an obsolete version of setuptools installed. Please\n"
- "remove it from your system entirely before rerunning this script."
- ), file=sys.stderr)
- sys.exit(2)
-
- req = "setuptools>=" + version
- import pkg_resources
- try:
- pkg_resources.require(req)
- except pkg_resources.VersionConflict:
- try:
- from setuptools.command.easy_install import main
- except ImportError:
- from easy_install import main
- main(list(argv) + [download_setuptools(delay=0)])
- sys.exit(0) # try to force an exit
- else:
- if argv:
- from setuptools.command.easy_install import main
- main(argv)
- else:
- print("Setuptools version", version, "or greater has been installed.")
- print('(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)')
-
-
-def update_md5(filenames):
- """Update our built-in md5 registry"""
-
- import re
-
- for name in filenames:
- base = os.path.basename(name)
- f = open(name, 'rb')
- md5_data[base] = md5(f.read()).hexdigest()
- f.close()
-
- data = sorted([" %r: %r,\n" % it for it in md5_data.items()])
- repl = "".join(data)
-
- import inspect
- srcfile = inspect.getsourcefile(sys.modules[__name__])
- f = open(srcfile, 'rb')
- src = f.read()
- f.close()
-
- match = re.search("\nmd5_data = {\n([^}]+)}", src)
- if not match:
- print("Internal error!", file=sys.stderr)
- sys.exit(2)
-
- src = src[:match.start(1)] + repl + src[match.end(1):]
- f = open(srcfile, 'w')
- f.write(src)
- f.close()
-
-
-if __name__ == '__main__':
- if len(sys.argv) > 2 and sys.argv[1] == '--md5update':
- update_md5(sys.argv[2:])
- else:
- main(sys.argv[1:])
diff --git a/fake_pyrex/Pyrex/Distutils/__init__.py b/fake_pyrex/Pyrex/Distutils/__init__.py
deleted file mode 100644
index 51c8e16b8e546..0000000000000
--- a/fake_pyrex/Pyrex/Distutils/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# work around broken setuptools monkey patching
diff --git a/fake_pyrex/Pyrex/Distutils/build_ext.py b/fake_pyrex/Pyrex/Distutils/build_ext.py
deleted file mode 100644
index 4f846f6282cbb..0000000000000
--- a/fake_pyrex/Pyrex/Distutils/build_ext.py
+++ /dev/null
@@ -1 +0,0 @@
-build_ext = "yes, it's there!"
diff --git a/fake_pyrex/Pyrex/__init__.py b/fake_pyrex/Pyrex/__init__.py
deleted file mode 100644
index 51c8e16b8e546..0000000000000
--- a/fake_pyrex/Pyrex/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# work around broken setuptools monkey patching
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 0e7bc628fdb6a..dbc697410da80 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -29,7 +29,6 @@
_np_version_under1p9 = LooseVersion(_np_version) < '1.9'
-from pandas.version import version as __version__
from pandas.info import __doc__
@@ -57,3 +56,8 @@
from pandas.util.print_versions import show_versions
import pandas.util.testing
+# use the closest tagged version if possible
+from ._version import get_versions
+v = get_versions()
+__version__ = v.get('closest-tag',v['version'])
+del get_versions, v
diff --git a/pandas/_version.py b/pandas/_version.py
new file mode 100644
index 0000000000000..61e9f3ff187ea
--- /dev/null
+++ b/pandas/_version.py
@@ -0,0 +1,460 @@
+
+# This file helps to compute a version number in source trees obtained from
+# git-archive tarball (such as those provided by githubs download-from-tag
+# feature). Distribution tarballs (built by setup.py sdist) and build
+# directories (produced by setup.py build) will contain a much shorter file
+# that just contains the computed version number.
+
+# This file is released into the public domain. Generated by
+# versioneer-0.15 (https://github.com/warner/python-versioneer)
+
+import errno
+import os
+import re
+import subprocess
+import sys
+
+
+def get_keywords():
+ # these strings will be replaced by git during git-archive.
+ # setup.py/versioneer.py will grep for the variable names, so they must
+ # each be defined on a line of their own. _version.py will just call
+ # get_keywords().
+ git_refnames = "$Format:%d$"
+ git_full = "$Format:%H$"
+ keywords = {"refnames": git_refnames, "full": git_full}
+ return keywords
+
+
+class VersioneerConfig:
+ pass
+
+
+def get_config():
+ # these strings are filled in when 'setup.py versioneer' creates
+ # _version.py
+ cfg = VersioneerConfig()
+ cfg.VCS = "git"
+ cfg.style = "pep440"
+ cfg.tag_prefix = "v"
+ cfg.parentdir_prefix = "pandas-"
+ cfg.versionfile_source = "pandas/_version.py"
+ cfg.verbose = False
+ return cfg
+
+
+class NotThisMethod(Exception):
+ pass
+
+
+LONG_VERSION_PY = {}
+HANDLERS = {}
+
+
+def register_vcs_handler(vcs, method): # decorator
+ def decorate(f):
+ if vcs not in HANDLERS:
+ HANDLERS[vcs] = {}
+ HANDLERS[vcs][method] = f
+ return f
+ return decorate
+
+
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False):
+ assert isinstance(commands, list)
+ p = None
+ for c in commands:
+ try:
+ dispcmd = str([c] + args)
+ # remember shell=False, so use git.cmd on windows, not just git
+ p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE,
+ stderr=(subprocess.PIPE if hide_stderr
+ else None))
+ break
+ except EnvironmentError:
+ e = sys.exc_info()[1]
+ if e.errno == errno.ENOENT:
+ continue
+ if verbose:
+ print("unable to run %s" % dispcmd)
+ print(e)
+ return None
+ else:
+ if verbose:
+ print("unable to find command, tried %s" % (commands,))
+ return None
+ stdout = p.communicate()[0].strip()
+ if sys.version_info[0] >= 3:
+ stdout = stdout.decode()
+ if p.returncode != 0:
+ if verbose:
+ print("unable to run %s (error)" % dispcmd)
+ return None
+ return stdout
+
+
+def versions_from_parentdir(parentdir_prefix, root, verbose):
+ # Source tarballs conventionally unpack into a directory that includes
+ # both the project name and a version string.
+ dirname = os.path.basename(root)
+ if not dirname.startswith(parentdir_prefix):
+ if verbose:
+ print("guessing rootdir is '%s', but '%s' doesn't start with "
+ "prefix '%s'" % (root, dirname, parentdir_prefix))
+ raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
+ return {"version": dirname[len(parentdir_prefix):],
+ "full-revisionid": None,
+ "dirty": False, "error": None}
+
+
+@register_vcs_handler("git", "get_keywords")
+def git_get_keywords(versionfile_abs):
+ # the code embedded in _version.py can just fetch the value of these
+ # keywords. When used from setup.py, we don't want to import _version.py,
+ # so we do it with a regexp instead. This function is not used from
+ # _version.py.
+ keywords = {}
+ try:
+ f = open(versionfile_abs, "r")
+ for line in f.readlines():
+ if line.strip().startswith("git_refnames ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["refnames"] = mo.group(1)
+ if line.strip().startswith("git_full ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["full"] = mo.group(1)
+ f.close()
+ except EnvironmentError:
+ pass
+ return keywords
+
+
+@register_vcs_handler("git", "keywords")
+def git_versions_from_keywords(keywords, tag_prefix, verbose):
+ if not keywords:
+ raise NotThisMethod("no keywords at all, weird")
+ refnames = keywords["refnames"].strip()
+ if refnames.startswith("$Format"):
+ if verbose:
+ print("keywords are unexpanded, not using")
+ raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
+ refs = set([r.strip() for r in refnames.strip("()").split(",")])
+ # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
+ # just "foo-1.0". If we see a "tag: " prefix, prefer those.
+ TAG = "tag: "
+ tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+ if not tags:
+ # Either we're using git < 1.8.3, or there really are no tags. We use
+ # a heuristic: assume all version tags have a digit. The old git %d
+ # expansion behaves like git log --decorate=short and strips out the
+ # refs/heads/ and refs/tags/ prefixes that would let us distinguish
+ # between branches and tags. By ignoring refnames without digits, we
+ # filter out many common branch names like "release" and
+ # "stabilization", as well as "HEAD" and "master".
+ tags = set([r for r in refs if re.search(r'\d', r)])
+ if verbose:
+ print("discarding '%s', no digits" % ",".join(refs-tags))
+ if verbose:
+ print("likely tags: %s" % ",".join(sorted(tags)))
+ for ref in sorted(tags):
+ # sorting will prefer e.g. "2.0" over "2.0rc1"
+ if ref.startswith(tag_prefix):
+ r = ref[len(tag_prefix):]
+ if verbose:
+ print("picking %s" % r)
+ return {"version": r,
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False, "error": None
+ }
+ # no suitable tags, so version is "0+unknown", but full hex is still there
+ if verbose:
+ print("no suitable tags, using unknown + full revision id")
+ return {"version": "0+unknown",
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False, "error": "no suitable tags"}
+
+
+@register_vcs_handler("git", "pieces_from_vcs")
+def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
+ # this runs 'git' from the root of the source tree. This only gets called
+ # if the git-archive 'subst' keywords were *not* expanded, and
+ # _version.py hasn't already been rewritten with a short version string,
+ # meaning we're inside a checked out source tree.
+
+ if not os.path.exists(os.path.join(root, ".git")):
+ if verbose:
+ print("no .git in %s" % root)
+ raise NotThisMethod("no .git directory")
+
+ GITS = ["git"]
+ if sys.platform == "win32":
+ GITS = ["git.cmd", "git.exe"]
+ # if there is a tag, this yields TAG-NUM-gHEX[-dirty]
+ # if there are no tags, this yields HEX[-dirty] (no NUM)
+ describe_out = run_command(GITS, ["describe", "--tags", "--dirty",
+ "--always", "--long"],
+ cwd=root)
+ # --long was added in git-1.5.5
+ if describe_out is None:
+ raise NotThisMethod("'git describe' failed")
+ describe_out = describe_out.strip()
+ full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
+ if full_out is None:
+ raise NotThisMethod("'git rev-parse' failed")
+ full_out = full_out.strip()
+
+ pieces = {}
+ pieces["long"] = full_out
+ pieces["short"] = full_out[:7] # maybe improved later
+ pieces["error"] = None
+
+ # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
+ # TAG might have hyphens.
+ git_describe = describe_out
+
+ # look for -dirty suffix
+ dirty = git_describe.endswith("-dirty")
+ pieces["dirty"] = dirty
+ if dirty:
+ git_describe = git_describe[:git_describe.rindex("-dirty")]
+
+ # now we have TAG-NUM-gHEX or HEX
+
+ if "-" in git_describe:
+ # TAG-NUM-gHEX
+ mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+ if not mo:
+ # unparseable. Maybe git-describe is misbehaving?
+ pieces["error"] = ("unable to parse git-describe output: '%s'"
+ % describe_out)
+ return pieces
+
+ # tag
+ full_tag = mo.group(1)
+ if not full_tag.startswith(tag_prefix):
+ if verbose:
+ fmt = "tag '%s' doesn't start with prefix '%s'"
+ print(fmt % (full_tag, tag_prefix))
+ pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
+ % (full_tag, tag_prefix))
+ return pieces
+ pieces["closest-tag"] = full_tag[len(tag_prefix):]
+
+ # distance: number of commits since tag
+ pieces["distance"] = int(mo.group(2))
+
+ # commit: short hex revision ID
+ pieces["short"] = mo.group(3)
+
+ else:
+ # HEX: no tags
+ pieces["closest-tag"] = None
+ count_out = run_command(GITS, ["rev-list", "HEAD", "--count"],
+ cwd=root)
+ pieces["distance"] = int(count_out) # total number of commits
+
+ return pieces
+
+
+def plus_or_dot(pieces):
+ if "+" in pieces.get("closest-tag", ""):
+ return "."
+ return "+"
+
+
+def render_pep440(pieces):
+ # now build up version string, with post-release "local version
+ # identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
+ # get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
+
+ # exceptions:
+ # 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
+
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += plus_or_dot(pieces)
+ rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ else:
+ # exception #1
+ rendered = "0+untagged.%d.g%s" % (pieces["distance"],
+ pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ return rendered
+
+
+def render_pep440_pre(pieces):
+ # TAG[.post.devDISTANCE] . No -dirty
+
+ # exceptions:
+ # 1: no tags. 0.post.devDISTANCE
+
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"]:
+ rendered += ".post.dev%d" % pieces["distance"]
+ else:
+ # exception #1
+ rendered = "0.post.dev%d" % pieces["distance"]
+ return rendered
+
+
+def render_pep440_post(pieces):
+ # TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that
+ # .dev0 sorts backwards (a dirty tree will appear "older" than the
+ # corresponding clean one), but you shouldn't be releasing software with
+ # -dirty anyways.
+
+ # exceptions:
+ # 1: no tags. 0.postDISTANCE[.dev0]
+
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ rendered += plus_or_dot(pieces)
+ rendered += "g%s" % pieces["short"]
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ rendered += "+g%s" % pieces["short"]
+ return rendered
+
+
+def render_pep440_old(pieces):
+ # TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty.
+
+ # exceptions:
+ # 1: no tags. 0.postDISTANCE[.dev0]
+
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ return rendered
+
+
+def render_git_describe(pieces):
+ # TAG[-DISTANCE-gHEX][-dirty], like 'git describe --tags --dirty
+ # --always'
+
+ # exceptions:
+ # 1: no tags. HEX[-dirty] (note: no 'g' prefix)
+
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"]:
+ rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+ else:
+ # exception #1
+ rendered = pieces["short"]
+ if pieces["dirty"]:
+ rendered += "-dirty"
+ return rendered
+
+
+def render_git_describe_long(pieces):
+ # TAG-DISTANCE-gHEX[-dirty], like 'git describe --tags --dirty
+ # --always -long'. The distance/hash is unconditional.
+
+ # exceptions:
+ # 1: no tags. HEX[-dirty] (note: no 'g' prefix)
+
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+ else:
+ # exception #1
+ rendered = pieces["short"]
+ if pieces["dirty"]:
+ rendered += "-dirty"
+ return rendered
+
+
+def render(pieces, style):
+ if pieces["error"]:
+ return {"version": "unknown",
+ "full-revisionid": pieces.get("long"),
+ "dirty": None,
+ "error": pieces["error"]}
+
+ if not style or style == "default":
+ style = "pep440" # the default
+
+ if style == "pep440":
+ rendered = render_pep440(pieces)
+ elif style == "pep440-pre":
+ rendered = render_pep440_pre(pieces)
+ elif style == "pep440-post":
+ rendered = render_pep440_post(pieces)
+ elif style == "pep440-old":
+ rendered = render_pep440_old(pieces)
+ elif style == "git-describe":
+ rendered = render_git_describe(pieces)
+ elif style == "git-describe-long":
+ rendered = render_git_describe_long(pieces)
+ else:
+ raise ValueError("unknown style '%s'" % style)
+
+ return {"version": rendered, "full-revisionid": pieces["long"],
+ "dirty": pieces["dirty"], "error": None}
+
+
+def get_versions():
+ # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
+ # __file__, we can work backwards from there to the root. Some
+ # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
+ # case we can only use expanded keywords.
+
+ cfg = get_config()
+ verbose = cfg.verbose
+
+ try:
+ return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
+ verbose)
+ except NotThisMethod:
+ pass
+
+ try:
+ root = os.path.realpath(__file__)
+ # versionfile_source is the relative path from the top of the source
+ # tree (where the .git directory might live) to this file. Invert
+ # this to find the root from __file__.
+ for i in cfg.versionfile_source.split('/'):
+ root = os.path.dirname(root)
+ except NameError:
+ return {"version": "0+unknown", "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to find root of source tree"}
+
+ try:
+ pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
+ return render(pieces, cfg.style)
+ except NotThisMethod:
+ pass
+
+ try:
+ if cfg.parentdir_prefix:
+ return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+ except NotThisMethod:
+ pass
+
+ return {"version": "0+unknown", "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to compute version"}
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index 5f68c1ee26e87..9b6bdf57d4509 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -2157,6 +2157,8 @@ def group_nth_bin_object(ndarray[object, ndim=2] out,
nobs = np.zeros((